// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
#include "blk.h"

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

static int blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

	/* avoid the need for an I/O completion work item */
	if (iocb->ki_flags & IOCB_DSYNC)
		op |= REQ_FUA;
	return op;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	blk_wake_io_task(waiter);
}

static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
		struct iov_iter *iter, unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
	loff_t pos = iocb->ki_pos;
	bool should_dirty = false;
	struct bio bio;
	ssize_t ret;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	if (nr_pages <= DIO_INLINE_BIO_VECS)
		vecs = inline_vecs;
	else {
		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				     GFP_KERNEL);
		if (!vecs)
			return -ENOMEM;
	}

	bio_init(&bio, vecs, nr_pages);
	bio_set_dev(&bio, bdev);
	bio.bi_iter.bi_sector = pos >> 9;
	bio.bi_write_hint = iocb->ki_hint;
	bio.bi_private = current;
	bio.bi_end_io = blkdev_bio_end_io_simple;
	bio.bi_ioprio = iocb->ki_ioprio;

	ret = bio_iov_iter_get_pages(&bio, iter);
	if (unlikely(ret))
		goto out;
	ret = bio.bi_iter.bi_size;

	if (iov_iter_rw(iter) == READ) {
		bio.bi_opf = REQ_OP_READ;
		if (iter_is_iovec(iter))
			should_dirty = true;
	} else {
		bio.bi_opf = dio_bio_write_op(iocb);
		task_io_account_write(ret);
	}
	if (iocb->ki_flags & IOCB_NOWAIT)
		bio.bi_opf |= REQ_NOWAIT;
	if (iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(&bio, iocb);

	submit_bio(&bio);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio.bi_private))
			break;
		if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	bio_release_pages(&bio, should_dirty);
	if (unlikely(bio.bi_status))
		ret = blk_status_to_errno(bio.bi_status);

out:
	if (vecs != inline_vecs)
		kfree(vecs);

	bio_uninit(&bio);

	return ret;
}

struct blkdev_dio {
	union {
		struct kiocb		*iocb;
		struct task_struct	*waiter;
	};
	size_t			size;
	atomic_t		ref;
	bool			multi_bio : 1;
	bool			should_dirty : 1;
	bool			is_sync : 1;
	struct bio		bio;
};

static struct bio_set blkdev_dio_pool;
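/*
 * Illustrative userspace sketch (an assumption for context, not part of
 * this file): the direct I/O paths above and below are reached by opening
 * a block device with O_DIRECT. Both the file offset and the user buffer
 * must be aligned to the device's logical block size, or submission fails
 * with -EINVAL:
 *
 *	int fd = open("/dev/sdX", O_RDONLY | O_DIRECT);	// hypothetical device
 *	void *buf;
 *
 *	if (posix_memalign(&buf, 4096, 4096))		// logical-block aligned
 *		return -1;
 *	if (pread(fd, buf, 4096, 0) < 0)		// ends up in blkdev_direct_IO()
 *		perror("pread");
 */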
static void blkdev_bio_end_io(struct bio *bio)
{
	struct blkdev_dio *dio = bio->bi_private;
	bool should_dirty = dio->should_dirty;

	if (bio->bi_status && !dio->bio.bi_status)
		dio->bio.bi_status = bio->bi_status;

	if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
		if (!dio->is_sync) {
			struct kiocb *iocb = dio->iocb;
			ssize_t ret;

			WRITE_ONCE(iocb->private, NULL);

			if (likely(!dio->bio.bi_status)) {
				ret = dio->size;
				iocb->ki_pos += ret;
			} else {
				ret = blk_status_to_errno(dio->bio.bi_status);
			}

			dio->iocb->ki_complete(iocb, ret, 0);
			if (dio->multi_bio)
				bio_put(&dio->bio);
		} else {
			struct task_struct *waiter = dio->waiter;

			WRITE_ONCE(dio->waiter, NULL);
			blk_wake_io_task(waiter);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);
	struct block_device *bdev = I_BDEV(inode);
	struct blk_plug plug;
	struct blkdev_dio *dio;
	struct bio *bio;
	bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
	loff_t pos = iocb->ki_pos;
	int ret = 0;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);

	dio = container_of(bio, struct blkdev_dio, bio);
	dio->is_sync = is_sync = is_sync_kiocb(iocb);
	if (dio->is_sync) {
		dio->waiter = current;
		bio_get(bio);
	} else {
		dio->iocb = iocb;
	}

	dio->size = 0;
	dio->multi_bio = false;
	dio->should_dirty = is_read && iter_is_iovec(iter);

	/*
	 * Don't plug for HIPRI/polled IO, as those should go straight
	 * to issue
	 */
	if (!(iocb->ki_flags & IOCB_HIPRI))
		blk_start_plug(&plug);

	for (;;) {
		bio_set_dev(bio, bdev);
		bio->bi_iter.bi_sector = pos >> 9;
		bio->bi_write_hint = iocb->ki_hint;
		bio->bi_private = dio;
		bio->bi_end_io = blkdev_bio_end_io;
		bio->bi_ioprio = iocb->ki_ioprio;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (unlikely(ret)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
			break;
		}

		if (is_read) {
			bio->bi_opf = REQ_OP_READ;
			if (dio->should_dirty)
				bio_set_pages_dirty(bio);
		} else {
			bio->bi_opf = dio_bio_write_op(iocb);
			task_io_account_write(bio->bi_iter.bi_size);
		}
		if (iocb->ki_flags & IOCB_NOWAIT)
			bio->bi_opf |= REQ_NOWAIT;

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
		if (!nr_pages) {
			if (do_poll)
				bio_set_polled(bio, iocb);
			submit_bio(bio);
			if (do_poll)
				WRITE_ONCE(iocb->private, bio);
			break;
		}
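		/*
		 * Descriptive note (added commentary): more pages remain, so
		 * this request spans multiple bios.  dio->ref starts at 2
		 * below: one reference for the bio about to be submitted and
		 * one for the follow-up bio allocated at the bottom of the
		 * loop; each later bio adds another via atomic_inc().
		 * Polling only makes sense for a single bio, so do_poll is
		 * cleared for the multi-bio case.
		 */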
		if (!dio->multi_bio) {
			/*
			 * AIO needs an extra reference to ensure the dio
			 * structure which is embedded into the first bio
			 * stays around.
			 */
			if (!is_sync)
				bio_get(bio);
			dio->multi_bio = true;
			atomic_set(&dio->ref, 2);
			do_poll = false;
		} else {
			atomic_inc(&dio->ref);
		}

		submit_bio(bio);
		bio = bio_alloc(GFP_KERNEL, nr_pages);
	}

	if (!(iocb->ki_flags & IOCB_HIPRI))
		blk_finish_plug(&plug);

	if (!is_sync)
		return -EIOCBQUEUED;

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(dio->waiter))
			break;

		if (!do_poll || !bio_poll(bio, 0))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	if (!ret)
		ret = blk_status_to_errno(dio->bio.bi_status);
	if (likely(!ret))
		ret = dio->size;

	bio_put(&dio->bio);
	return ret;
}

static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	unsigned int nr_pages;

	if (!iov_iter_count(iter))
		return 0;

	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static void blkdev_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags, struct page **pagep,
		void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied, struct page *page,
		void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	put_page(page);

	return ret;
}

static int blkdev_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	return generic_writepages(mapping, wbc);
}

const struct address_space_operations def_blk_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
	.readpage	= blkdev_readpage,
	.readahead	= blkdev_readahead,
	.writepage	= blkdev_writepage,
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
	.writepages	= blkdev_writepages,
	.direct_IO	= blkdev_direct_IO,
	.migratepage	= buffer_migrate_page_norefs,
	.is_dirty_writeback = buffer_check_dirty_writeback,
};

/*
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t retval;

	inode_lock(bd_inode);
	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
	inode_unlock(bd_inode);
	return retval;
}
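/*
 * Illustrative userspace sketch (an assumption for context, not part of
 * this file): fsync() or fdatasync() on a block device file descriptor
 * lands in blkdev_fsync() below, which writes back and waits on the dirty
 * page-cache range and then issues a device cache flush:
 *
 *	int fd = open("/dev/sdX", O_WRONLY);	// hypothetical device
 *
 *	write(fd, buf, len);
 *	fsync(fd);	// writeback page cache, then flush the device cache
 */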
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
		int datasync)
{
	struct inode *bd_inode = bdev_file_inode(filp);
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = file_write_and_wait_range(filp, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}

static int blkdev_open(struct inode *inode, struct file *filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	filp->f_mapping = bdev->bd_inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	return 0;
}

static int blkdev_close(struct inode *inode, struct file *filp)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));

	blkdev_put(bdev, filp->f_mode);
	return 0;
}

/*
 * Write data to the block device. Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	struct blk_plug plug;
	size_t shorted = 0;
	ssize_t ret;

	if (bdev_read_only(I_BDEV(bd_inode)))
		return -EPERM;

	if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if (iocb->ki_pos >= size)
		return -ENOSPC;

	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
		return -EOPNOTSUPP;

	size -= iocb->ki_pos;
	if (iov_iter_count(from) > size) {
		shorted = iov_iter_count(from) - size;
		iov_iter_truncate(from, size);
	}

	blk_start_plug(&plug);
	ret = __generic_file_write_iter(iocb, from);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
	blk_finish_plug(&plug);
	return ret;
}

static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	loff_t pos = iocb->ki_pos;
	size_t shorted = 0;
	ssize_t ret;

	if (pos >= size)
		return 0;

	size -= pos;
	if (iov_iter_count(to) > size) {
		shorted = iov_iter_count(to) - size;
		iov_iter_truncate(to, size);
	}

	ret = generic_file_read_iter(iocb, to);
	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
	return ret;
}

#define BLKDEV_FALLOC_FL_SUPPORTED					\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
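/*
 * Illustrative userspace sketch (an assumption for context, not part of
 * this file): fallocate() on a block device maps the supported mode
 * combinations below onto zeroout and discard; offset and length must be
 * aligned to the logical block size. For example, punching a hole
 * (zeroing with no fallback to writes) over the first MiB of a
 * hypothetical device:
 *
 *	int fd = open("/dev/sdX", O_RDWR);
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  0, 1024 * 1024);
 */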
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
			     loff_t len)
{
	struct inode *inode = bdev_file_inode(file);
	struct block_device *bdev = I_BDEV(inode);
	loff_t end = start + len - 1;
	loff_t isize;
	int error;

	/* Fail if we don't recognize the flags. */
	if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/* Don't go off the end of the device. */
	isize = i_size_read(bdev->bd_inode);
	if (start >= isize)
		return -EINVAL;
	if (end >= isize) {
		if (mode & FALLOC_FL_KEEP_SIZE) {
			len = isize - start;
			end = start + len - 1;
		} else
			return -EINVAL;
	}

	/*
	 * Don't allow IO that isn't aligned to logical block size.
	 */
	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	filemap_invalidate_lock(inode->i_mapping);

	/* Invalidate the page cache, including dirty pages. */
	error = truncate_bdev_range(bdev, file->f_mode, start, end);
	if (error)
		goto fail;

	switch (mode) {
	case FALLOC_FL_ZERO_RANGE:
	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, 0);
		break;
	default:
		error = -EOPNOTSUPP;
	}

fail:
	filemap_invalidate_unlock(inode->i_mapping);
	return error;
}

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= blkdev_llseek,
	.read_iter	= blkdev_read_iter,
	.write_iter	= blkdev_write_iter,
	.iopoll		= iocb_bio_iopoll,
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,
	.unlocked_ioctl	= blkdev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= blkdev_fallocate,
};

static __init int blkdev_init(void)
{
	return bioset_init(&blkdev_dio_pool, 4,
			   offsetof(struct blkdev_dio, bio),
			   BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);
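/*
 * Descriptive note on blkdev_dio_pool (a sketch mirroring the code above,
 * not an addition to it): bioset_init() is passed the offset of the bio
 * embedded in struct blkdev_dio as its front pad, so every bio allocated
 * from this pool carries a blkdev_dio in front of it, and
 * __blkdev_direct_IO() recovers the containing structure:
 *
 *	bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
 *	dio = container_of(bio, struct blkdev_dio, bio);
 */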