// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/suspend.h>
#include "blk.h"

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

static int blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

	/* avoid the need for an I/O completion work item */
	if (iocb->ki_flags & IOCB_DSYNC)
		op |= REQ_FUA;
	return op;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	blk_wake_io_task(waiter);
}

static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
		struct iov_iter *iter, unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
	loff_t pos = iocb->ki_pos;
	bool should_dirty = false;
	struct bio bio;
	ssize_t ret;
	blk_qc_t qc;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	if (nr_pages <= DIO_INLINE_BIO_VECS)
		vecs = inline_vecs;
	else {
		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				     GFP_KERNEL);
		if (!vecs)
			return -ENOMEM;
	}

	bio_init(&bio, vecs, nr_pages);
	bio_set_dev(&bio, bdev);
	bio.bi_iter.bi_sector = pos >> 9;
	bio.bi_write_hint = iocb->ki_hint;
	bio.bi_private = current;
	bio.bi_end_io = blkdev_bio_end_io_simple;
	bio.bi_ioprio = iocb->ki_ioprio;

	ret = bio_iov_iter_get_pages(&bio, iter);
	if (unlikely(ret))
		goto out;
	ret = bio.bi_iter.bi_size;

	if (iov_iter_rw(iter) == READ) {
		bio.bi_opf = REQ_OP_READ;
		if (iter_is_iovec(iter))
			should_dirty = true;
	} else {
		bio.bi_opf = dio_bio_write_op(iocb);
		task_io_account_write(ret);
	}
	if (iocb->ki_flags & IOCB_NOWAIT)
		bio.bi_opf |= REQ_NOWAIT;
	if (iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(&bio, iocb);

	qc = submit_bio(&bio);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio.bi_private))
			break;
		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(bdev), qc, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	bio_release_pages(&bio, should_dirty);
	if (unlikely(bio.bi_status))
		ret = blk_status_to_errno(bio.bi_status);

out:
	if (vecs != inline_vecs)
		kfree(vecs);

	bio_uninit(&bio);

	return ret;
}

struct blkdev_dio {
	union {
		struct kiocb		*iocb;
		struct task_struct	*waiter;
	};
	size_t			size;
	atomic_t		ref;
	bool			multi_bio : 1;
	bool			should_dirty : 1;
	bool			is_sync : 1;
	struct bio		bio;
};

static struct bio_set blkdev_dio_pool;

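/*
 * ->iopoll file operation: poll the queue for completion of a polled
 * (IOCB_HIPRI) direct I/O.  The cookie passed to blk_poll() is the one
 * stashed in iocb->ki_cookie when the final bio was submitted.
 */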
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}

static void blkdev_bio_end_io(struct bio *bio)
{
	struct blkdev_dio *dio = bio->bi_private;
	bool should_dirty = dio->should_dirty;

	if (bio->bi_status && !dio->bio.bi_status)
		dio->bio.bi_status = bio->bi_status;

	if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
		if (!dio->is_sync) {
			struct kiocb *iocb = dio->iocb;
			ssize_t ret;

			if (likely(!dio->bio.bi_status)) {
				ret = dio->size;
				iocb->ki_pos += ret;
			} else {
				ret = blk_status_to_errno(dio->bio.bi_status);
			}

			dio->iocb->ki_complete(iocb, ret, 0);
			if (dio->multi_bio)
				bio_put(&dio->bio);
		} else {
			struct task_struct *waiter = dio->waiter;

			WRITE_ONCE(dio->waiter, NULL);
			blk_wake_io_task(waiter);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

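/*
 * Multi-bio direct I/O path.  The completion state lives in a struct
 * blkdev_dio embedded in the first bio (allocated from blkdev_dio_pool);
 * once the request spans more than one bio, each in-flight bio holds a
 * reference on it.  Synchronous callers wait inline below, asynchronous
 * callers are completed from blkdev_bio_end_io() via ->ki_complete().
 */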
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);
	struct block_device *bdev = I_BDEV(inode);
	struct blk_plug plug;
	struct blkdev_dio *dio;
	struct bio *bio;
	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
	loff_t pos = iocb->ki_pos;
	blk_qc_t qc = BLK_QC_T_NONE;
	int ret = 0;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);

	dio = container_of(bio, struct blkdev_dio, bio);
	dio->is_sync = is_sync = is_sync_kiocb(iocb);
	if (dio->is_sync) {
		dio->waiter = current;
		bio_get(bio);
	} else {
		dio->iocb = iocb;
	}

	dio->size = 0;
	dio->multi_bio = false;
	dio->should_dirty = is_read && iter_is_iovec(iter);

	/*
	 * Don't plug for HIPRI/polled IO, as those should go straight
	 * to issue
	 */
	if (!is_poll)
		blk_start_plug(&plug);

	for (;;) {
		bio_set_dev(bio, bdev);
		bio->bi_iter.bi_sector = pos >> 9;
		bio->bi_write_hint = iocb->ki_hint;
		bio->bi_private = dio;
		bio->bi_end_io = blkdev_bio_end_io;
		bio->bi_ioprio = iocb->ki_ioprio;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (unlikely(ret)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
			break;
		}

		if (is_read) {
			bio->bi_opf = REQ_OP_READ;
			if (dio->should_dirty)
				bio_set_pages_dirty(bio);
		} else {
			bio->bi_opf = dio_bio_write_op(iocb);
			task_io_account_write(bio->bi_iter.bi_size);
		}
		if (iocb->ki_flags & IOCB_NOWAIT)
			bio->bi_opf |= REQ_NOWAIT;

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
		if (!nr_pages) {
			bool polled = false;

			if (iocb->ki_flags & IOCB_HIPRI) {
				bio_set_polled(bio, iocb);
				polled = true;
			}

			qc = submit_bio(bio);

			if (polled)
				WRITE_ONCE(iocb->ki_cookie, qc);
			break;
		}

		if (!dio->multi_bio) {
			/*
			 * AIO needs an extra reference to ensure the dio
			 * structure which is embedded into the first bio
			 * stays around.
			 */
			if (!is_sync)
				bio_get(bio);
			dio->multi_bio = true;
			atomic_set(&dio->ref, 2);
		} else {
			atomic_inc(&dio->ref);
		}

		submit_bio(bio);
		bio = bio_alloc(GFP_KERNEL, nr_pages);
	}

	if (!is_poll)
		blk_finish_plug(&plug);

	if (!is_sync)
		return -EIOCBQUEUED;

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(dio->waiter))
			break;

		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(bdev), qc, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	if (!ret)
		ret = blk_status_to_errno(dio->bio.bi_status);
	if (likely(!ret))
		ret = dio->size;

	bio_put(&dio->bio);
	return ret;
}

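/*
 * ->direct_IO entry point: synchronous requests that fit in a single bio
 * take the lightweight on-stack path, everything else goes through the
 * full blkdev_dio machinery.
 */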
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	unsigned int nr_pages;

	if (!iov_iter_count(iter))
		return 0;

	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static void blkdev_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags, struct page **pagep,
		void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied, struct page *page,
		void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	put_page(page);

	return ret;
}

static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	return generic_writepages(mapping, wbc);
}

const struct address_space_operations def_blk_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
	.readpage	= blkdev_readpage,
	.readahead	= blkdev_readahead,
	.writepage	= blkdev_writepage,
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
	.writepages	= blkdev_writepages,
	.direct_IO	= blkdev_direct_IO,
	.migratepage	= buffer_migrate_page_norefs,
	.is_dirty_writeback = buffer_check_dirty_writeback,
};

/*
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t retval;

	inode_lock(bd_inode);
	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
	inode_unlock(bd_inode);
	return retval;
}

static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
		int datasync)
{
	struct inode *bd_inode = bdev_file_inode(filp);
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = file_write_and_wait_range(filp, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}

static int blkdev_open(struct inode *inode, struct file *filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binaries need it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	filp->f_mapping = bdev->bd_inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	return 0;
}

static int blkdev_close(struct inode *inode, struct file *filp)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));

	blkdev_put(bdev, filp->f_mode);
	return 0;
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	fmode_t mode = file->f_mode;

	/*
	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	 * to update it before every ioctl.
	 */
	if (file->f_flags & O_NDELAY)
		mode |= FMODE_NDELAY;
	else
		mode &= ~FMODE_NDELAY;

	return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * Write data to the block device. Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	struct blk_plug plug;
	size_t shorted = 0;
	ssize_t ret;

	if (bdev_read_only(I_BDEV(bd_inode)))
		return -EPERM;

	if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if (iocb->ki_pos >= size)
		return -ENOSPC;

	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
		return -EOPNOTSUPP;

	size -= iocb->ki_pos;
	if (iov_iter_count(from) > size) {
		shorted = iov_iter_count(from) - size;
		iov_iter_truncate(from, size);
	}

	blk_start_plug(&plug);
	ret = __generic_file_write_iter(iocb, from);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
	blk_finish_plug(&plug);
	return ret;
}

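/*
 * Reads past the end of the device return 0; reads that straddle the end
 * are shortened to the device size by temporarily truncating the iterator,
 * which is re-expanded once generic_file_read_iter() returns.
 */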
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	loff_t pos = iocb->ki_pos;
	size_t shorted = 0;
	ssize_t ret;

	if (pos >= size)
		return 0;

	size -= pos;
	if (iov_iter_count(to) > size) {
		shorted = iov_iter_count(to) - size;
		iov_iter_truncate(to, size);
	}

	ret = generic_file_read_iter(iocb, to);
	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
	return ret;
}

#define	BLKDEV_FALLOC_FL_SUPPORTED					\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

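/*
 * fallocate() on a block device zeroes or discards a range of sectors.
 * The affected range of the page cache is invalidated both before and
 * after the operation so that stale pages are not left behind.
 */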
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
			     loff_t len)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	loff_t end = start + len - 1;
	loff_t isize;
	int error;

	/* Fail if we don't recognize the flags. */
	if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/* Don't go off the end of the device. */
	isize = i_size_read(bdev->bd_inode);
	if (start >= isize)
		return -EINVAL;
	if (end >= isize) {
		if (mode & FALLOC_FL_KEEP_SIZE) {
			len = isize - start;
			end = start + len - 1;
		} else
			return -EINVAL;
	}

	/*
	 * Don't allow IO that isn't aligned to logical block size.
	 */
	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	/* Invalidate the page cache, including dirty pages. */
	error = truncate_bdev_range(bdev, file->f_mode, start, end);
	if (error)
		return error;

	switch (mode) {
	case FALLOC_FL_ZERO_RANGE:
	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, 0);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (error)
		return error;

	/*
	 * Invalidate the page cache again; if someone wandered in and dirtied
	 * a page, we just discard it - userspace has no way of knowing whether
	 * the write happened before or after the discard completed...
	 */
	return truncate_bdev_range(bdev, file->f_mode, start, end);
}

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= blkdev_llseek,
	.read_iter	= blkdev_read_iter,
	.write_iter	= blkdev_write_iter,
	.iopoll		= blkdev_iopoll,
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= blkdev_fallocate,
};

static __init int blkdev_init(void)
{
	return bioset_init(&blkdev_dio_pool, 4,
			   offsetof(struct blkdev_dio, bio),
			   BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);