1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2022 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/iomap.h> 10 #include <linux/init.h> 11 #include <linux/slab.h> 12 #include <linux/blkdev.h> 13 #include <linux/statfs.h> 14 #include <linux/writeback.h> 15 #include <linux/quotaops.h> 16 #include <linux/seq_file.h> 17 #include <linux/parser.h> 18 #include <linux/uio.h> 19 #include <linux/mman.h> 20 #include <linux/sched/mm.h> 21 #include <linux/task_io_accounting_ops.h> 22 23 #include "zonefs.h" 24 25 #include "trace.h" 26 27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, 28 loff_t length, unsigned int flags, 29 struct iomap *iomap, struct iomap *srcmap) 30 { 31 struct zonefs_inode_info *zi = ZONEFS_I(inode); 32 struct super_block *sb = inode->i_sb; 33 loff_t isize; 34 35 /* 36 * All blocks are always mapped below EOF. If reading past EOF, 37 * act as if there is a hole up to the file maximum size. 38 */ 39 mutex_lock(&zi->i_truncate_mutex); 40 iomap->bdev = inode->i_sb->s_bdev; 41 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 42 isize = i_size_read(inode); 43 if (iomap->offset >= isize) { 44 iomap->type = IOMAP_HOLE; 45 iomap->addr = IOMAP_NULL_ADDR; 46 iomap->length = length; 47 } else { 48 iomap->type = IOMAP_MAPPED; 49 iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; 50 iomap->length = isize - iomap->offset; 51 } 52 mutex_unlock(&zi->i_truncate_mutex); 53 54 trace_zonefs_iomap_begin(inode, iomap); 55 56 return 0; 57 } 58 59 static const struct iomap_ops zonefs_read_iomap_ops = { 60 .iomap_begin = zonefs_read_iomap_begin, 61 }; 62 63 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, 64 loff_t length, unsigned int flags, 65 struct iomap *iomap, struct iomap *srcmap) 66 { 67 struct zonefs_inode_info *zi = ZONEFS_I(inode); 68 struct super_block *sb = inode->i_sb; 69 loff_t isize; 70 71 /* All write I/Os should always be within the file maximum size */ 72 if (WARN_ON_ONCE(offset + length > zi->i_max_size)) 73 return -EIO; 74 75 /* 76 * Sequential zones can only accept direct writes. This is already 77 * checked when writes are issued, so warn if we see a page writeback 78 * operation. 79 */ 80 if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) 81 return -EIO; 82 83 /* 84 * For conventional zones, all blocks are always mapped. For sequential 85 * zones, all blocks after always mapped below the inode size (zone 86 * write pointer) and unwriten beyond. 87 */ 88 mutex_lock(&zi->i_truncate_mutex); 89 iomap->bdev = inode->i_sb->s_bdev; 90 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 91 iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; 92 isize = i_size_read(inode); 93 if (iomap->offset >= isize) { 94 iomap->type = IOMAP_UNWRITTEN; 95 iomap->length = zi->i_max_size - iomap->offset; 96 } else { 97 iomap->type = IOMAP_MAPPED; 98 iomap->length = isize - iomap->offset; 99 } 100 mutex_unlock(&zi->i_truncate_mutex); 101 102 trace_zonefs_iomap_begin(inode, iomap); 103 104 return 0; 105 } 106 107 static const struct iomap_ops zonefs_write_iomap_ops = { 108 .iomap_begin = zonefs_write_iomap_begin, 109 }; 110 111 static int zonefs_read_folio(struct file *unused, struct folio *folio) 112 { 113 return iomap_read_folio(folio, &zonefs_read_iomap_ops); 114 } 115 116 static void zonefs_readahead(struct readahead_control *rac) 117 { 118 iomap_readahead(rac, &zonefs_read_iomap_ops); 119 } 120 121 /* 122 * Map blocks for page writeback. This is used only on conventional zone files, 123 * which implies that the page range can only be within the fixed inode size. 124 */ 125 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, 126 struct inode *inode, loff_t offset) 127 { 128 struct zonefs_inode_info *zi = ZONEFS_I(inode); 129 130 if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) 131 return -EIO; 132 if (WARN_ON_ONCE(offset >= i_size_read(inode))) 133 return -EIO; 134 135 /* If the mapping is already OK, nothing needs to be done */ 136 if (offset >= wpc->iomap.offset && 137 offset < wpc->iomap.offset + wpc->iomap.length) 138 return 0; 139 140 return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, 141 IOMAP_WRITE, &wpc->iomap, NULL); 142 } 143 144 static const struct iomap_writeback_ops zonefs_writeback_ops = { 145 .map_blocks = zonefs_write_map_blocks, 146 }; 147 148 static int zonefs_writepages(struct address_space *mapping, 149 struct writeback_control *wbc) 150 { 151 struct iomap_writepage_ctx wpc = { }; 152 153 return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); 154 } 155 156 static int zonefs_swap_activate(struct swap_info_struct *sis, 157 struct file *swap_file, sector_t *span) 158 { 159 struct inode *inode = file_inode(swap_file); 160 161 if (zonefs_inode_is_seq(inode)) { 162 zonefs_err(inode->i_sb, 163 "swap file: not a conventional zone file\n"); 164 return -EINVAL; 165 } 166 167 return iomap_swapfile_activate(sis, swap_file, span, 168 &zonefs_read_iomap_ops); 169 } 170 171 const struct address_space_operations zonefs_file_aops = { 172 .read_folio = zonefs_read_folio, 173 .readahead = zonefs_readahead, 174 .writepages = zonefs_writepages, 175 .dirty_folio = filemap_dirty_folio, 176 .release_folio = iomap_release_folio, 177 .invalidate_folio = iomap_invalidate_folio, 178 .migrate_folio = filemap_migrate_folio, 179 .is_partially_uptodate = iomap_is_partially_uptodate, 180 .error_remove_page = generic_error_remove_page, 181 .direct_IO = noop_direct_IO, 182 .swap_activate = zonefs_swap_activate, 183 }; 184 185 int zonefs_file_truncate(struct inode *inode, loff_t isize) 186 { 187 struct zonefs_inode_info *zi = ZONEFS_I(inode); 188 loff_t old_isize; 189 enum req_op op; 190 int ret = 0; 191 192 /* 193 * Only sequential zone files can be truncated and truncation is allowed 194 * only down to a 0 size, which is equivalent to a zone reset, and to 195 * the maximum file size, which is equivalent to a zone finish. 196 */ 197 if (!zonefs_zone_is_seq(zi)) 198 return -EPERM; 199 200 if (!isize) 201 op = REQ_OP_ZONE_RESET; 202 else if (isize == zi->i_max_size) 203 op = REQ_OP_ZONE_FINISH; 204 else 205 return -EPERM; 206 207 inode_dio_wait(inode); 208 209 /* Serialize against page faults */ 210 filemap_invalidate_lock(inode->i_mapping); 211 212 /* Serialize against zonefs_iomap_begin() */ 213 mutex_lock(&zi->i_truncate_mutex); 214 215 old_isize = i_size_read(inode); 216 if (isize == old_isize) 217 goto unlock; 218 219 ret = zonefs_zone_mgmt(inode, op); 220 if (ret) 221 goto unlock; 222 223 /* 224 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 225 * take care of open zones. 226 */ 227 if (zi->i_flags & ZONEFS_ZONE_OPEN) { 228 /* 229 * Truncating a zone to EMPTY or FULL is the equivalent of 230 * closing the zone. For a truncation to 0, we need to 231 * re-open the zone to ensure new writes can be processed. 232 * For a truncation to the maximum file size, the zone is 233 * closed and writes cannot be accepted anymore, so clear 234 * the open flag. 235 */ 236 if (!isize) 237 ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 238 else 239 zi->i_flags &= ~ZONEFS_ZONE_OPEN; 240 } 241 242 zonefs_update_stats(inode, isize); 243 truncate_setsize(inode, isize); 244 zi->i_wpoffset = isize; 245 zonefs_account_active(inode); 246 247 unlock: 248 mutex_unlock(&zi->i_truncate_mutex); 249 filemap_invalidate_unlock(inode->i_mapping); 250 251 return ret; 252 } 253 254 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, 255 int datasync) 256 { 257 struct inode *inode = file_inode(file); 258 int ret = 0; 259 260 if (unlikely(IS_IMMUTABLE(inode))) 261 return -EPERM; 262 263 /* 264 * Since only direct writes are allowed in sequential files, page cache 265 * flush is needed only for conventional zone files. 266 */ 267 if (zonefs_inode_is_cnv(inode)) 268 ret = file_write_and_wait_range(file, start, end); 269 if (!ret) 270 ret = blkdev_issue_flush(inode->i_sb->s_bdev); 271 272 if (ret) 273 zonefs_io_error(inode, true); 274 275 return ret; 276 } 277 278 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) 279 { 280 struct inode *inode = file_inode(vmf->vma->vm_file); 281 vm_fault_t ret; 282 283 if (unlikely(IS_IMMUTABLE(inode))) 284 return VM_FAULT_SIGBUS; 285 286 /* 287 * Sanity check: only conventional zone files can have shared 288 * writeable mappings. 289 */ 290 if (zonefs_inode_is_seq(inode)) 291 return VM_FAULT_NOPAGE; 292 293 sb_start_pagefault(inode->i_sb); 294 file_update_time(vmf->vma->vm_file); 295 296 /* Serialize against truncates */ 297 filemap_invalidate_lock_shared(inode->i_mapping); 298 ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); 299 filemap_invalidate_unlock_shared(inode->i_mapping); 300 301 sb_end_pagefault(inode->i_sb); 302 return ret; 303 } 304 305 static const struct vm_operations_struct zonefs_file_vm_ops = { 306 .fault = filemap_fault, 307 .map_pages = filemap_map_pages, 308 .page_mkwrite = zonefs_filemap_page_mkwrite, 309 }; 310 311 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) 312 { 313 /* 314 * Conventional zones accept random writes, so their files can support 315 * shared writable mappings. For sequential zone files, only read 316 * mappings are possible since there are no guarantees for write 317 * ordering between msync() and page cache writeback. 318 */ 319 if (zonefs_inode_is_seq(file_inode(file)) && 320 (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 321 return -EINVAL; 322 323 file_accessed(file); 324 vma->vm_ops = &zonefs_file_vm_ops; 325 326 return 0; 327 } 328 329 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) 330 { 331 loff_t isize = i_size_read(file_inode(file)); 332 333 /* 334 * Seeks are limited to below the zone size for conventional zones 335 * and below the zone write pointer for sequential zones. In both 336 * cases, this limit is the inode size. 337 */ 338 return generic_file_llseek_size(file, offset, whence, isize, isize); 339 } 340 341 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, 342 int error, unsigned int flags) 343 { 344 struct inode *inode = file_inode(iocb->ki_filp); 345 struct zonefs_inode_info *zi = ZONEFS_I(inode); 346 347 if (error) { 348 zonefs_io_error(inode, true); 349 return error; 350 } 351 352 if (size && zonefs_zone_is_seq(zi)) { 353 /* 354 * Note that we may be seeing completions out of order, 355 * but that is not a problem since a write completed 356 * successfully necessarily means that all preceding writes 357 * were also successful. So we can safely increase the inode 358 * size to the write end location. 359 */ 360 mutex_lock(&zi->i_truncate_mutex); 361 if (i_size_read(inode) < iocb->ki_pos + size) { 362 zonefs_update_stats(inode, iocb->ki_pos + size); 363 zonefs_i_size_write(inode, iocb->ki_pos + size); 364 } 365 mutex_unlock(&zi->i_truncate_mutex); 366 } 367 368 return 0; 369 } 370 371 static const struct iomap_dio_ops zonefs_write_dio_ops = { 372 .end_io = zonefs_file_write_dio_end_io, 373 }; 374 375 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) 376 { 377 struct inode *inode = file_inode(iocb->ki_filp); 378 struct zonefs_inode_info *zi = ZONEFS_I(inode); 379 struct block_device *bdev = inode->i_sb->s_bdev; 380 unsigned int max = bdev_max_zone_append_sectors(bdev); 381 struct bio *bio; 382 ssize_t size; 383 int nr_pages; 384 ssize_t ret; 385 386 max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); 387 iov_iter_truncate(from, max); 388 389 nr_pages = iov_iter_npages(from, BIO_MAX_VECS); 390 if (!nr_pages) 391 return 0; 392 393 bio = bio_alloc(bdev, nr_pages, 394 REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); 395 bio->bi_iter.bi_sector = zi->i_zsector; 396 bio->bi_ioprio = iocb->ki_ioprio; 397 if (iocb_is_dsync(iocb)) 398 bio->bi_opf |= REQ_FUA; 399 400 ret = bio_iov_iter_get_pages(bio, from); 401 if (unlikely(ret)) 402 goto out_release; 403 404 size = bio->bi_iter.bi_size; 405 task_io_account_write(size); 406 407 if (iocb->ki_flags & IOCB_HIPRI) 408 bio_set_polled(bio, iocb); 409 410 ret = submit_bio_wait(bio); 411 412 /* 413 * If the file zone was written underneath the file system, the zone 414 * write pointer may not be where we expect it to be, but the zone 415 * append write can still succeed. So check manually that we wrote where 416 * we intended to, that is, at zi->i_wpoffset. 417 */ 418 if (!ret) { 419 sector_t wpsector = 420 zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); 421 422 if (bio->bi_iter.bi_sector != wpsector) { 423 zonefs_warn(inode->i_sb, 424 "Corrupted write pointer %llu for zone at %llu\n", 425 wpsector, zi->i_zsector); 426 ret = -EIO; 427 } 428 } 429 430 zonefs_file_write_dio_end_io(iocb, size, ret, 0); 431 trace_zonefs_file_dio_append(inode, size, ret); 432 433 out_release: 434 bio_release_pages(bio, false); 435 bio_put(bio); 436 437 if (ret >= 0) { 438 iocb->ki_pos += size; 439 return size; 440 } 441 442 return ret; 443 } 444 445 /* 446 * Do not exceed the LFS limits nor the file zone size. If pos is under the 447 * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 448 */ 449 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, 450 loff_t count) 451 { 452 struct inode *inode = file_inode(file); 453 struct zonefs_inode_info *zi = ZONEFS_I(inode); 454 loff_t limit = rlimit(RLIMIT_FSIZE); 455 loff_t max_size = zi->i_max_size; 456 457 if (limit != RLIM_INFINITY) { 458 if (pos >= limit) { 459 send_sig(SIGXFSZ, current, 0); 460 return -EFBIG; 461 } 462 count = min(count, limit - pos); 463 } 464 465 if (!(file->f_flags & O_LARGEFILE)) 466 max_size = min_t(loff_t, MAX_NON_LFS, max_size); 467 468 if (unlikely(pos >= max_size)) 469 return -EFBIG; 470 471 return min(count, max_size - pos); 472 } 473 474 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) 475 { 476 struct file *file = iocb->ki_filp; 477 struct inode *inode = file_inode(file); 478 struct zonefs_inode_info *zi = ZONEFS_I(inode); 479 loff_t count; 480 481 if (IS_SWAPFILE(inode)) 482 return -ETXTBSY; 483 484 if (!iov_iter_count(from)) 485 return 0; 486 487 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 488 return -EINVAL; 489 490 if (iocb->ki_flags & IOCB_APPEND) { 491 if (zonefs_zone_is_cnv(zi)) 492 return -EINVAL; 493 mutex_lock(&zi->i_truncate_mutex); 494 iocb->ki_pos = zi->i_wpoffset; 495 mutex_unlock(&zi->i_truncate_mutex); 496 } 497 498 count = zonefs_write_check_limits(file, iocb->ki_pos, 499 iov_iter_count(from)); 500 if (count < 0) 501 return count; 502 503 iov_iter_truncate(from, count); 504 return iov_iter_count(from); 505 } 506 507 /* 508 * Handle direct writes. For sequential zone files, this is the only possible 509 * write path. For these files, check that the user is issuing writes 510 * sequentially from the end of the file. This code assumes that the block layer 511 * delivers write requests to the device in sequential order. This is always the 512 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE 513 * elevator feature is being used (e.g. mq-deadline). The block layer always 514 * automatically select such an elevator for zoned block devices during the 515 * device initialization. 516 */ 517 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) 518 { 519 struct inode *inode = file_inode(iocb->ki_filp); 520 struct zonefs_inode_info *zi = ZONEFS_I(inode); 521 struct super_block *sb = inode->i_sb; 522 bool sync = is_sync_kiocb(iocb); 523 bool append = false; 524 ssize_t ret, count; 525 526 /* 527 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT 528 * as this can cause write reordering (e.g. the first aio gets EAGAIN 529 * on the inode lock but the second goes through but is now unaligned). 530 */ 531 if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) 532 return -EOPNOTSUPP; 533 534 if (iocb->ki_flags & IOCB_NOWAIT) { 535 if (!inode_trylock(inode)) 536 return -EAGAIN; 537 } else { 538 inode_lock(inode); 539 } 540 541 count = zonefs_write_checks(iocb, from); 542 if (count <= 0) { 543 ret = count; 544 goto inode_unlock; 545 } 546 547 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 548 ret = -EINVAL; 549 goto inode_unlock; 550 } 551 552 /* Enforce sequential writes (append only) in sequential zones */ 553 if (zonefs_zone_is_seq(zi)) { 554 mutex_lock(&zi->i_truncate_mutex); 555 if (iocb->ki_pos != zi->i_wpoffset) { 556 mutex_unlock(&zi->i_truncate_mutex); 557 ret = -EINVAL; 558 goto inode_unlock; 559 } 560 mutex_unlock(&zi->i_truncate_mutex); 561 append = sync; 562 } 563 564 if (append) 565 ret = zonefs_file_dio_append(iocb, from); 566 else 567 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, 568 &zonefs_write_dio_ops, 0, NULL, 0); 569 if (zonefs_zone_is_seq(zi) && 570 (ret > 0 || ret == -EIOCBQUEUED)) { 571 if (ret > 0) 572 count = ret; 573 574 /* 575 * Update the zone write pointer offset assuming the write 576 * operation succeeded. If it did not, the error recovery path 577 * will correct it. Also do active seq file accounting. 578 */ 579 mutex_lock(&zi->i_truncate_mutex); 580 zi->i_wpoffset += count; 581 zonefs_account_active(inode); 582 mutex_unlock(&zi->i_truncate_mutex); 583 } 584 585 inode_unlock: 586 inode_unlock(inode); 587 588 return ret; 589 } 590 591 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, 592 struct iov_iter *from) 593 { 594 struct inode *inode = file_inode(iocb->ki_filp); 595 ssize_t ret; 596 597 /* 598 * Direct IO writes are mandatory for sequential zone files so that the 599 * write IO issuing order is preserved. 600 */ 601 if (zonefs_inode_is_seq(inode)) 602 return -EIO; 603 604 if (iocb->ki_flags & IOCB_NOWAIT) { 605 if (!inode_trylock(inode)) 606 return -EAGAIN; 607 } else { 608 inode_lock(inode); 609 } 610 611 ret = zonefs_write_checks(iocb, from); 612 if (ret <= 0) 613 goto inode_unlock; 614 615 ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); 616 if (ret > 0) 617 iocb->ki_pos += ret; 618 else if (ret == -EIO) 619 zonefs_io_error(inode, true); 620 621 inode_unlock: 622 inode_unlock(inode); 623 if (ret > 0) 624 ret = generic_write_sync(iocb, ret); 625 626 return ret; 627 } 628 629 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 630 { 631 struct inode *inode = file_inode(iocb->ki_filp); 632 633 if (unlikely(IS_IMMUTABLE(inode))) 634 return -EPERM; 635 636 if (sb_rdonly(inode->i_sb)) 637 return -EROFS; 638 639 /* Write operations beyond the zone size are not allowed */ 640 if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) 641 return -EFBIG; 642 643 if (iocb->ki_flags & IOCB_DIRECT) { 644 ssize_t ret = zonefs_file_dio_write(iocb, from); 645 646 if (ret != -ENOTBLK) 647 return ret; 648 } 649 650 return zonefs_file_buffered_write(iocb, from); 651 } 652 653 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, 654 int error, unsigned int flags) 655 { 656 if (error) { 657 zonefs_io_error(file_inode(iocb->ki_filp), false); 658 return error; 659 } 660 661 return 0; 662 } 663 664 static const struct iomap_dio_ops zonefs_read_dio_ops = { 665 .end_io = zonefs_file_read_dio_end_io, 666 }; 667 668 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 669 { 670 struct inode *inode = file_inode(iocb->ki_filp); 671 struct zonefs_inode_info *zi = ZONEFS_I(inode); 672 struct super_block *sb = inode->i_sb; 673 loff_t isize; 674 ssize_t ret; 675 676 /* Offline zones cannot be read */ 677 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 678 return -EPERM; 679 680 if (iocb->ki_pos >= zi->i_max_size) 681 return 0; 682 683 if (iocb->ki_flags & IOCB_NOWAIT) { 684 if (!inode_trylock_shared(inode)) 685 return -EAGAIN; 686 } else { 687 inode_lock_shared(inode); 688 } 689 690 /* Limit read operations to written data */ 691 mutex_lock(&zi->i_truncate_mutex); 692 isize = i_size_read(inode); 693 if (iocb->ki_pos >= isize) { 694 mutex_unlock(&zi->i_truncate_mutex); 695 ret = 0; 696 goto inode_unlock; 697 } 698 iov_iter_truncate(to, isize - iocb->ki_pos); 699 mutex_unlock(&zi->i_truncate_mutex); 700 701 if (iocb->ki_flags & IOCB_DIRECT) { 702 size_t count = iov_iter_count(to); 703 704 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 705 ret = -EINVAL; 706 goto inode_unlock; 707 } 708 file_accessed(iocb->ki_filp); 709 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, 710 &zonefs_read_dio_ops, 0, NULL, 0); 711 } else { 712 ret = generic_file_read_iter(iocb, to); 713 if (ret == -EIO) 714 zonefs_io_error(inode, false); 715 } 716 717 inode_unlock: 718 inode_unlock_shared(inode); 719 720 return ret; 721 } 722 723 /* 724 * Write open accounting is done only for sequential files. 725 */ 726 static inline bool zonefs_seq_file_need_wro(struct inode *inode, 727 struct file *file) 728 { 729 if (zonefs_inode_is_cnv(inode)) 730 return false; 731 732 if (!(file->f_mode & FMODE_WRITE)) 733 return false; 734 735 return true; 736 } 737 738 static int zonefs_seq_file_write_open(struct inode *inode) 739 { 740 struct zonefs_inode_info *zi = ZONEFS_I(inode); 741 int ret = 0; 742 743 mutex_lock(&zi->i_truncate_mutex); 744 745 if (!zi->i_wr_refcnt) { 746 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 747 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); 748 749 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 750 751 if (sbi->s_max_wro_seq_files 752 && wro > sbi->s_max_wro_seq_files) { 753 atomic_dec(&sbi->s_wro_seq_files); 754 ret = -EBUSY; 755 goto unlock; 756 } 757 758 if (i_size_read(inode) < zi->i_max_size) { 759 ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 760 if (ret) { 761 atomic_dec(&sbi->s_wro_seq_files); 762 goto unlock; 763 } 764 zi->i_flags |= ZONEFS_ZONE_OPEN; 765 zonefs_account_active(inode); 766 } 767 } 768 } 769 770 zi->i_wr_refcnt++; 771 772 unlock: 773 mutex_unlock(&zi->i_truncate_mutex); 774 775 return ret; 776 } 777 778 static int zonefs_file_open(struct inode *inode, struct file *file) 779 { 780 int ret; 781 782 ret = generic_file_open(inode, file); 783 if (ret) 784 return ret; 785 786 if (zonefs_seq_file_need_wro(inode, file)) 787 return zonefs_seq_file_write_open(inode); 788 789 return 0; 790 } 791 792 static void zonefs_seq_file_write_close(struct inode *inode) 793 { 794 struct zonefs_inode_info *zi = ZONEFS_I(inode); 795 struct super_block *sb = inode->i_sb; 796 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 797 int ret = 0; 798 799 mutex_lock(&zi->i_truncate_mutex); 800 801 zi->i_wr_refcnt--; 802 if (zi->i_wr_refcnt) 803 goto unlock; 804 805 /* 806 * The file zone may not be open anymore (e.g. the file was truncated to 807 * its maximum size or it was fully written). For this case, we only 808 * need to decrement the write open count. 809 */ 810 if (zi->i_flags & ZONEFS_ZONE_OPEN) { 811 ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 812 if (ret) { 813 __zonefs_io_error(inode, false); 814 /* 815 * Leaving zones explicitly open may lead to a state 816 * where most zones cannot be written (zone resources 817 * exhausted). So take preventive action by remounting 818 * read-only. 819 */ 820 if (zi->i_flags & ZONEFS_ZONE_OPEN && 821 !(sb->s_flags & SB_RDONLY)) { 822 zonefs_warn(sb, 823 "closing zone at %llu failed %d\n", 824 zi->i_zsector, ret); 825 zonefs_warn(sb, 826 "remounting filesystem read-only\n"); 827 sb->s_flags |= SB_RDONLY; 828 } 829 goto unlock; 830 } 831 832 zi->i_flags &= ~ZONEFS_ZONE_OPEN; 833 zonefs_account_active(inode); 834 } 835 836 atomic_dec(&sbi->s_wro_seq_files); 837 838 unlock: 839 mutex_unlock(&zi->i_truncate_mutex); 840 } 841 842 static int zonefs_file_release(struct inode *inode, struct file *file) 843 { 844 /* 845 * If we explicitly open a zone we must close it again as well, but the 846 * zone management operation can fail (either due to an IO error or as 847 * the zone has gone offline or read-only). Make sure we don't fail the 848 * close(2) for user-space. 849 */ 850 if (zonefs_seq_file_need_wro(inode, file)) 851 zonefs_seq_file_write_close(inode); 852 853 return 0; 854 } 855 856 const struct file_operations zonefs_file_operations = { 857 .open = zonefs_file_open, 858 .release = zonefs_file_release, 859 .fsync = zonefs_file_fsync, 860 .mmap = zonefs_file_mmap, 861 .llseek = zonefs_file_llseek, 862 .read_iter = zonefs_file_read_iter, 863 .write_iter = zonefs_file_write_iter, 864 .splice_read = generic_file_splice_read, 865 .splice_write = iter_file_splice_write, 866 .iopoll = iocb_bio_iopoll, 867 }; 868