// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
#include "blk.h"

static struct inode *bdev_file_inode(struct file *file)
{
        return file->f_mapping->host;
}

static int blkdev_get_block(struct inode *inode, sector_t iblock,
                struct buffer_head *bh, int create)
{
        bh->b_bdev = I_BDEV(inode);
        bh->b_blocknr = iblock;
        set_buffer_mapped(bh);
        return 0;
}

static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
        unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

        /* avoid the need for an I/O completion work item */
        if (iocb->ki_flags & IOCB_DSYNC)
                op |= REQ_FUA;
        return op;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
        struct task_struct *waiter = bio->bi_private;

        WRITE_ONCE(bio->bi_private, NULL);
        blk_wake_io_task(waiter);
}

static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, unsigned int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;
        blk_qc_t qc;

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        if (nr_pages <= DIO_INLINE_BIO_VECS)
                vecs = inline_vecs;
        else {
                vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
                                     GFP_KERNEL);
                if (!vecs)
                        return -ENOMEM;
        }

        bio_init(&bio, vecs, nr_pages);
        bio_set_dev(&bio, bdev);
        bio.bi_iter.bi_sector = pos >> 9;
        bio.bi_write_hint = iocb->ki_hint;
        bio.bi_private = current;
        bio.bi_end_io = blkdev_bio_end_io_simple;
        bio.bi_ioprio = iocb->ki_ioprio;

        ret = bio_iov_iter_get_pages(&bio, iter);
        if (unlikely(ret))
                goto out;
        ret = bio.bi_iter.bi_size;

        if (iov_iter_rw(iter) == READ) {
                bio.bi_opf = REQ_OP_READ;
                if (iter_is_iovec(iter))
                        should_dirty = true;
        } else {
                bio.bi_opf = dio_bio_write_op(iocb);
                task_io_account_write(ret);
        }
        if (iocb->ki_flags & IOCB_NOWAIT)
                bio.bi_opf |= REQ_NOWAIT;
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);

        qc = submit_bio(&bio);
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio.bi_private))
                        break;
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        bio_release_pages(&bio, should_dirty);
        if (unlikely(bio.bi_status))
                ret = blk_status_to_errno(bio.bi_status);

out:
        if (vecs != inline_vecs)
                kfree(vecs);

        bio_uninit(&bio);

        return ret;
}

struct blkdev_dio {
        union {
                struct kiocb            *iocb;
                struct task_struct      *waiter;
        };
        size_t                  size;
        atomic_t                ref;
        bool                    multi_bio : 1;
        bool                    should_dirty : 1;
        bool                    is_sync : 1;
        struct bio              bio;
};

static struct bio_set blkdev_dio_pool;
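
/*
 * ->iopoll handler for polled (IOCB_HIPRI) direct I/O: poll the device
 * queue for completion of the request identified by the cookie that
 * submission stored in kiocb->ki_cookie.
 */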
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
        struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}

static void blkdev_bio_end_io(struct bio *bio)
{
        struct blkdev_dio *dio = bio->bi_private;
        bool should_dirty = dio->should_dirty;

        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;

        if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
                if (!dio->is_sync) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;

                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
                        } else {
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }

                        dio->iocb->ki_complete(iocb, ret, 0);
                        if (dio->multi_bio)
                                bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;

                        WRITE_ONCE(dio->waiter, NULL);
                        blk_wake_io_task(waiter);
                }
        }

        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}
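
/*
 * Direct I/O path for requests that may need more than one bio.  The
 * blkdev_dio completion state is embedded in the first bio (allocated from
 * blkdev_dio_pool); additional bios share it via dio->ref.  Synchronous
 * kiocbs sleep below until blkdev_bio_end_io wakes them, while async kiocbs
 * return -EIOCBQUEUED and are completed from the end_io handler.
 */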
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                unsigned int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
        bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        loff_t pos = iocb->ki_pos;
        blk_qc_t qc = BLK_QC_T_NONE;
        int ret = 0;

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);

        dio = container_of(bio, struct blkdev_dio, bio);
        dio->is_sync = is_sync = is_sync_kiocb(iocb);
        if (dio->is_sync) {
                dio->waiter = current;
                bio_get(bio);
        } else {
                dio->iocb = iocb;
        }

        dio->size = 0;
        dio->multi_bio = false;
        dio->should_dirty = is_read && iter_is_iovec(iter);

        /*
         * Don't plug for HIPRI/polled IO, as those should go straight
         * to issue
         */
        if (!is_poll)
                blk_start_plug(&plug);

        for (;;) {
                bio_set_dev(bio, bdev);
                bio->bi_iter.bi_sector = pos >> 9;
                bio->bi_write_hint = iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;

                ret = bio_iov_iter_get_pages(bio, iter);
                if (unlikely(ret)) {
                        bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        break;
                }

                if (is_read) {
                        bio->bi_opf = REQ_OP_READ;
                        if (dio->should_dirty)
                                bio_set_pages_dirty(bio);
                } else {
                        bio->bi_opf = dio_bio_write_op(iocb);
                        task_io_account_write(bio->bi_iter.bi_size);
                }
                if (iocb->ki_flags & IOCB_NOWAIT)
                        bio->bi_opf |= REQ_NOWAIT;

                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;

                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
                if (!nr_pages) {
                        bool polled = false;

                        if (iocb->ki_flags & IOCB_HIPRI) {
                                bio_set_polled(bio, iocb);
                                polled = true;
                        }

                        qc = submit_bio(bio);

                        if (polled)
                                WRITE_ONCE(iocb->ki_cookie, qc);
                        break;
                }

                if (!dio->multi_bio) {
                        /*
                         * AIO needs an extra reference to ensure the dio
                         * structure which is embedded into the first bio
                         * stays around.
                         */
                        if (!is_sync)
                                bio_get(bio);
                        dio->multi_bio = true;
                        atomic_set(&dio->ref, 2);
                } else {
                        atomic_inc(&dio->ref);
                }

                submit_bio(bio);
                bio = bio_alloc(GFP_KERNEL, nr_pages);
        }

        if (!is_poll)
                blk_finish_plug(&plug);

        if (!is_sync)
                return -EIOCBQUEUED;

        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;

                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        if (!ret)
                ret = blk_status_to_errno(dio->bio.bi_status);
        if (likely(!ret))
                ret = dio->size;

        bio_put(&dio->bio);
        return ret;
}

static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        unsigned int nr_pages;

        if (!iov_iter_count(iter))
                return 0;

        nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
        if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
                return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

        return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, blkdev_get_block);
}

static void blkdev_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags, struct page **pagep,
                void **fsdata)
{
        return block_write_begin(mapping, pos, len, flags, pagep,
                                 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned copied, struct page *page,
                void *fsdata)
{
        int ret;
        ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        unlock_page(page);
        put_page(page);

        return ret;
}

static int blkdev_writepages(struct address_space *mapping,
                             struct writeback_control *wbc)
{
        return generic_writepages(mapping, wbc);
}

const struct address_space_operations def_blk_aops = {
        .set_page_dirty = __set_page_dirty_buffers,
        .readpage = blkdev_readpage,
        .readahead = blkdev_readahead,
        .writepage = blkdev_writepage,
        .write_begin = blkdev_write_begin,
        .write_end = blkdev_write_end,
        .writepages = blkdev_writepages,
        .direct_IO = blkdev_direct_IO,
        .migratepage = buffer_migrate_page_norefs,
        .is_dirty_writeback = buffer_check_dirty_writeback,
};

/*
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t retval;

        inode_lock(bd_inode);
        retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
        inode_unlock(bd_inode);
        return retval;
}
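
/*
 * Sync a block device file: write back and wait on the dirty page cache
 * range, then flush the device's volatile write cache.  -EOPNOTSUPP from
 * blkdev_issue_flush (the device does not support cache flushes) is not
 * treated as an error.
 */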
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
{
        struct inode *bd_inode = bdev_file_inode(filp);
        struct block_device *bdev = I_BDEV(bd_inode);
        int error;

        error = file_write_and_wait_range(filp, start, end);
        if (error)
                return error;

        /*
         * There is no need to serialise calls to blkdev_issue_flush with
         * i_mutex and doing so causes performance issues with concurrent
         * O_SYNC writers to a block device.
         */
        error = blkdev_issue_flush(bdev);
        if (error == -EOPNOTSUPP)
                error = 0;

        return error;
}

static int blkdev_open(struct inode *inode, struct file *filp)
{
        struct block_device *bdev;

        /*
         * Preserve backwards compatibility and allow large file access
         * even if userspace doesn't ask for it explicitly. Some mkfs
         * binaries need it. We might want to drop this workaround
         * during an unstable branch.
         */
        filp->f_flags |= O_LARGEFILE;
        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

        if (filp->f_flags & O_NDELAY)
                filp->f_mode |= FMODE_NDELAY;
        if (filp->f_flags & O_EXCL)
                filp->f_mode |= FMODE_EXCL;
        if ((filp->f_flags & O_ACCMODE) == 3)
                filp->f_mode |= FMODE_WRITE_IOCTL;

        bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        filp->f_mapping = bdev->bd_inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
        return 0;
}

static int blkdev_close(struct inode *inode, struct file *filp)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(filp));

        blkdev_put(bdev, filp->f_mode);
        return 0;
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        fmode_t mode = file->f_mode;

        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
         * to update it before every ioctl.
         */
        if (file->f_flags & O_NDELAY)
                mode |= FMODE_NDELAY;
        else
                mode &= ~FMODE_NDELAY;

        return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
        size_t shorted = 0;
        ssize_t ret;

        if (bdev_read_only(I_BDEV(bd_inode)))
                return -EPERM;

        if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
                return -ETXTBSY;

        if (!iov_iter_count(from))
                return 0;

        if (iocb->ki_pos >= size)
                return -ENOSPC;

        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;

        size -= iocb->ki_pos;
        if (iov_iter_count(from) > size) {
                shorted = iov_iter_count(from) - size;
                iov_iter_truncate(from, size);
        }

        blk_start_plug(&plug);
        ret = __generic_file_write_iter(iocb, from);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        iov_iter_reexpand(from, iov_iter_count(from) + shorted);
        blk_finish_plug(&plug);
        return ret;
}
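
/*
 * Clamp reads to the current device size so generic_file_read_iter() never
 * goes past the end of the device; the iterator is re-expanded afterwards
 * so the caller still sees its original count.
 */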
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        loff_t pos = iocb->ki_pos;
        size_t shorted = 0;
        ssize_t ret;

        if (pos >= size)
                return 0;

        size -= pos;
        if (iov_iter_count(to) > size) {
                shorted = iov_iter_count(to) - size;
                iov_iter_truncate(to, size);
        }

        ret = generic_file_read_iter(iocb, to);
        iov_iter_reexpand(to, iov_iter_count(to) + shorted);
        return ret;
}

#define BLKDEV_FALLOC_FL_SUPPORTED                                      \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
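
/*
 * fallocate() on a block device: zero-range maps to a zeroout that must not
 * unmap the blocks (BLKDEV_ZERO_NOUNMAP), punch-hole to a zeroout that fails
 * rather than falling back to writing zeroes (BLKDEV_ZERO_NOFALLBACK), and
 * punch-hole with NO_HIDE_STALE to a discard.  The affected page cache range
 * is invalidated first under the mapping's invalidate_lock.
 */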
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                             loff_t len)
{
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        loff_t end = start + len - 1;
        loff_t isize;
        int error;

        /* Fail if we don't recognize the flags. */
        if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        /* Don't go off the end of the device. */
        isize = i_size_read(bdev->bd_inode);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
                if (mode & FALLOC_FL_KEEP_SIZE) {
                        len = isize - start;
                        end = start + len - 1;
                } else
                        return -EINVAL;
        }

        /*
         * Don't allow IO that isn't aligned to logical block size.
         */
        if ((start | len) & (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        filemap_invalidate_lock(inode->i_mapping);

        /* Invalidate the page cache, including dirty pages. */
        error = truncate_bdev_range(bdev, file->f_mode, start, end);
        if (error)
                goto fail;

        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
                error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, 0);
                break;
        default:
                error = -EOPNOTSUPP;
        }

 fail:
        filemap_invalidate_unlock(inode->i_mapping);
        return error;
}

const struct file_operations def_blk_fops = {
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = blkdev_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
        .iopoll         = blkdev_iopoll,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
#endif
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = blkdev_fallocate,
};

static __init int blkdev_init(void)
{
        return bioset_init(&blkdev_dio_pool, 4,
                           offsetof(struct blkdev_dio, bio),
                           BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);