1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2019 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/magic.h> 10 #include <linux/iomap.h> 11 #include <linux/init.h> 12 #include <linux/slab.h> 13 #include <linux/blkdev.h> 14 #include <linux/statfs.h> 15 #include <linux/writeback.h> 16 #include <linux/quotaops.h> 17 #include <linux/seq_file.h> 18 #include <linux/parser.h> 19 #include <linux/uio.h> 20 #include <linux/mman.h> 21 #include <linux/sched/mm.h> 22 #include <linux/crc32.h> 23 #include <linux/task_io_accounting_ops.h> 24 25 #include "zonefs.h" 26 27 #define CREATE_TRACE_POINTS 28 #include "trace.h" 29 30 /* 31 * Get the name of a zone group directory. 32 */ 33 static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) 34 { 35 switch (ztype) { 36 case ZONEFS_ZTYPE_CNV: 37 return "cnv"; 38 case ZONEFS_ZTYPE_SEQ: 39 return "seq"; 40 default: 41 WARN_ON_ONCE(1); 42 return "???"; 43 } 44 } 45 46 /* 47 * Manage the active zone count. 48 */ 49 static void zonefs_account_active(struct super_block *sb, 50 struct zonefs_zone *z) 51 { 52 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 53 54 if (zonefs_zone_is_cnv(z)) 55 return; 56 57 /* 58 * For zones that transitioned to the offline or readonly condition, 59 * we only need to clear the active state. 60 */ 61 if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) 62 goto out; 63 64 /* 65 * If the zone is active, that is, if it is explicitly open or 66 * partially written, check if it was already accounted as active. 67 */ 68 if ((z->z_flags & ZONEFS_ZONE_OPEN) || 69 (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { 70 if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { 71 z->z_flags |= ZONEFS_ZONE_ACTIVE; 72 atomic_inc(&sbi->s_active_seq_files); 73 } 74 return; 75 } 76 77 out: 78 /* The zone is not active. 
If it was, update the active count */ 79 if (z->z_flags & ZONEFS_ZONE_ACTIVE) { 80 z->z_flags &= ~ZONEFS_ZONE_ACTIVE; 81 atomic_dec(&sbi->s_active_seq_files); 82 } 83 } 84 85 /* 86 * Manage the active zone count. Called with zi->i_truncate_mutex held. 87 */ 88 void zonefs_inode_account_active(struct inode *inode) 89 { 90 lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); 91 92 return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); 93 } 94 95 /* 96 * Execute a zone management operation. 97 */ 98 static int zonefs_zone_mgmt(struct super_block *sb, 99 struct zonefs_zone *z, enum req_op op) 100 { 101 int ret; 102 103 /* 104 * With ZNS drives, closing an explicitly open zone that has not been 105 * written will change the zone state to "closed", that is, the zone 106 * will remain active. Since this can then cause failure of explicit 107 * open operation on other zones if the drive active zone resources 108 * are exceeded, make sure that the zone does not remain active by 109 * resetting it. 110 */ 111 if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) 112 op = REQ_OP_ZONE_RESET; 113 114 trace_zonefs_zone_mgmt(sb, z, op); 115 ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, 116 z->z_size >> SECTOR_SHIFT, GFP_NOFS); 117 if (ret) { 118 zonefs_err(sb, 119 "Zone management operation %s at %llu failed %d\n", 120 blk_op_str(op), z->z_sector, ret); 121 return ret; 122 } 123 124 return 0; 125 } 126 127 int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) 128 { 129 lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); 130 131 return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); 132 } 133 134 void zonefs_i_size_write(struct inode *inode, loff_t isize) 135 { 136 struct zonefs_zone *z = zonefs_inode_zone(inode); 137 138 i_size_write(inode, isize); 139 140 /* 141 * A full zone is no longer open/active and does not need 142 * explicit closing. 
143 */ 144 if (isize >= z->z_capacity) { 145 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 146 147 if (z->z_flags & ZONEFS_ZONE_ACTIVE) 148 atomic_dec(&sbi->s_active_seq_files); 149 z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); 150 } 151 } 152 153 void zonefs_update_stats(struct inode *inode, loff_t new_isize) 154 { 155 struct super_block *sb = inode->i_sb; 156 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 157 loff_t old_isize = i_size_read(inode); 158 loff_t nr_blocks; 159 160 if (new_isize == old_isize) 161 return; 162 163 spin_lock(&sbi->s_lock); 164 165 /* 166 * This may be called for an update after an IO error. 167 * So beware of the values seen. 168 */ 169 if (new_isize < old_isize) { 170 nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits; 171 if (sbi->s_used_blocks > nr_blocks) 172 sbi->s_used_blocks -= nr_blocks; 173 else 174 sbi->s_used_blocks = 0; 175 } else { 176 sbi->s_used_blocks += 177 (new_isize - old_isize) >> sb->s_blocksize_bits; 178 if (sbi->s_used_blocks > sbi->s_blocks) 179 sbi->s_used_blocks = sbi->s_blocks; 180 } 181 182 spin_unlock(&sbi->s_lock); 183 } 184 185 /* 186 * Check a zone condition. Return the amount of written (and still readable) 187 * data in the zone. 188 */ 189 static loff_t zonefs_check_zone_condition(struct super_block *sb, 190 struct zonefs_zone *z, 191 struct blk_zone *zone) 192 { 193 switch (zone->cond) { 194 case BLK_ZONE_COND_OFFLINE: 195 zonefs_warn(sb, "Zone %llu: offline zone\n", 196 z->z_sector); 197 z->z_flags |= ZONEFS_ZONE_OFFLINE; 198 return 0; 199 case BLK_ZONE_COND_READONLY: 200 /* 201 * The write pointer of read-only zones is invalid, so we cannot 202 * determine the zone wpoffset (inode size). We thus keep the 203 * zone wpoffset as is, which leads to an empty file 204 * (wpoffset == 0) on mount. For a runtime error, this keeps 205 * the inode size as it was when last updated so that the user 206 * can recover data. 
207 */ 208 zonefs_warn(sb, "Zone %llu: read-only zone\n", 209 z->z_sector); 210 z->z_flags |= ZONEFS_ZONE_READONLY; 211 if (zonefs_zone_is_cnv(z)) 212 return z->z_capacity; 213 return z->z_wpoffset; 214 case BLK_ZONE_COND_FULL: 215 /* The write pointer of full zones is invalid. */ 216 return z->z_capacity; 217 default: 218 if (zonefs_zone_is_cnv(z)) 219 return z->z_capacity; 220 return (zone->wp - zone->start) << SECTOR_SHIFT; 221 } 222 } 223 224 /* 225 * Check a zone condition and adjust its inode access permissions for 226 * offline and readonly zones. 227 */ 228 static void zonefs_inode_update_mode(struct inode *inode) 229 { 230 struct zonefs_zone *z = zonefs_inode_zone(inode); 231 232 if (z->z_flags & ZONEFS_ZONE_OFFLINE) { 233 /* Offline zones cannot be read nor written */ 234 inode->i_flags |= S_IMMUTABLE; 235 inode->i_mode &= ~0777; 236 } else if (z->z_flags & ZONEFS_ZONE_READONLY) { 237 /* Readonly zones cannot be written */ 238 inode->i_flags |= S_IMMUTABLE; 239 if (z->z_flags & ZONEFS_ZONE_INIT_MODE) 240 inode->i_mode &= ~0777; 241 else 242 inode->i_mode &= ~0222; 243 } 244 245 z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; 246 z->z_mode = inode->i_mode; 247 } 248 249 struct zonefs_ioerr_data { 250 struct inode *inode; 251 bool write; 252 }; 253 254 static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, 255 void *data) 256 { 257 struct zonefs_ioerr_data *err = data; 258 struct inode *inode = err->inode; 259 struct zonefs_zone *z = zonefs_inode_zone(inode); 260 struct super_block *sb = inode->i_sb; 261 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 262 loff_t isize, data_size; 263 264 /* 265 * Check the zone condition: if the zone is not "bad" (offline or 266 * read-only), read errors are simply signaled to the IO issuer as long 267 * as there is no inconsistency between the inode size and the amount of 268 * data writen in the zone (data_size). 
269 */ 270 data_size = zonefs_check_zone_condition(sb, z, zone); 271 isize = i_size_read(inode); 272 if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && 273 !err->write && isize == data_size) 274 return 0; 275 276 /* 277 * At this point, we detected either a bad zone or an inconsistency 278 * between the inode size and the amount of data written in the zone. 279 * For the latter case, the cause may be a write IO error or an external 280 * action on the device. Two error patterns exist: 281 * 1) The inode size is lower than the amount of data in the zone: 282 * a write operation partially failed and data was writen at the end 283 * of the file. This can happen in the case of a large direct IO 284 * needing several BIOs and/or write requests to be processed. 285 * 2) The inode size is larger than the amount of data in the zone: 286 * this can happen with a deferred write error with the use of the 287 * device side write cache after getting successful write IO 288 * completions. Other possibilities are (a) an external corruption, 289 * e.g. an application reset the zone directly, or (b) the device 290 * has a serious problem (e.g. firmware bug). 291 * 292 * In all cases, warn about inode size inconsistency and handle the 293 * IO error according to the zone condition and to the mount options. 294 */ 295 if (zonefs_zone_is_seq(z) && isize != data_size) 296 zonefs_warn(sb, 297 "inode %lu: invalid size %lld (should be %lld)\n", 298 inode->i_ino, isize, data_size); 299 300 /* 301 * First handle bad zones signaled by hardware. The mount options 302 * errors=zone-ro and errors=zone-offline result in changing the 303 * zone condition to read-only and offline respectively, as if the 304 * condition was signaled by the hardware. 
305 */ 306 if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || 307 (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { 308 zonefs_warn(sb, "inode %lu: read/write access disabled\n", 309 inode->i_ino); 310 if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) 311 z->z_flags |= ZONEFS_ZONE_OFFLINE; 312 zonefs_inode_update_mode(inode); 313 data_size = 0; 314 } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || 315 (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { 316 zonefs_warn(sb, "inode %lu: write access disabled\n", 317 inode->i_ino); 318 if (!(z->z_flags & ZONEFS_ZONE_READONLY)) 319 z->z_flags |= ZONEFS_ZONE_READONLY; 320 zonefs_inode_update_mode(inode); 321 data_size = isize; 322 } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && 323 data_size > isize) { 324 /* Do not expose garbage data */ 325 data_size = isize; 326 } 327 328 /* 329 * If the filesystem is mounted with the explicit-open mount option, we 330 * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to 331 * the read-only or offline condition, to avoid attempting an explicit 332 * close of the zone when the inode file is closed. 333 */ 334 if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && 335 (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) 336 z->z_flags &= ~ZONEFS_ZONE_OPEN; 337 338 /* 339 * If error=remount-ro was specified, any error result in remounting 340 * the volume as read-only. 341 */ 342 if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) { 343 zonefs_warn(sb, "remounting filesystem read-only\n"); 344 sb->s_flags |= SB_RDONLY; 345 } 346 347 /* 348 * Update block usage stats and the inode size to prevent access to 349 * invalid data. 350 */ 351 zonefs_update_stats(inode, data_size); 352 zonefs_i_size_write(inode, data_size); 353 z->z_wpoffset = data_size; 354 zonefs_inode_account_active(inode); 355 356 return 0; 357 } 358 359 /* 360 * When an file IO error occurs, check the file zone to see if there is a change 361 * in the zone condition (e.g. 
offline or read-only). For a failed write to a 362 * sequential zone, the zone write pointer position must also be checked to 363 * eventually correct the file size and zonefs inode write pointer offset 364 * (which can be out of sync with the drive due to partial write failures). 365 */ 366 void __zonefs_io_error(struct inode *inode, bool write) 367 { 368 struct zonefs_zone *z = zonefs_inode_zone(inode); 369 struct super_block *sb = inode->i_sb; 370 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 371 unsigned int noio_flag; 372 unsigned int nr_zones = 1; 373 struct zonefs_ioerr_data err = { 374 .inode = inode, 375 .write = write, 376 }; 377 int ret; 378 379 /* 380 * The only files that have more than one zone are conventional zone 381 * files with aggregated conventional zones, for which the inode zone 382 * size is always larger than the device zone size. 383 */ 384 if (z->z_size > bdev_zone_sectors(sb->s_bdev)) 385 nr_zones = z->z_size >> 386 (sbi->s_zone_sectors_shift + SECTOR_SHIFT); 387 388 /* 389 * Memory allocations in blkdev_report_zones() can trigger a memory 390 * reclaim which may in turn cause a recursion into zonefs as well as 391 * struct request allocations for the same device. The former case may 392 * end up in a deadlock on the inode truncate mutex, while the latter 393 * may prevent IO forward progress. Executing the report zones under 394 * the GFP_NOIO context avoids both problems. 
395 */ 396 noio_flag = memalloc_noio_save(); 397 ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, 398 zonefs_io_error_cb, &err); 399 if (ret != nr_zones) 400 zonefs_err(sb, "Get inode %lu zone information failed %d\n", 401 inode->i_ino, ret); 402 memalloc_noio_restore(noio_flag); 403 } 404 405 static struct kmem_cache *zonefs_inode_cachep; 406 407 static struct inode *zonefs_alloc_inode(struct super_block *sb) 408 { 409 struct zonefs_inode_info *zi; 410 411 zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); 412 if (!zi) 413 return NULL; 414 415 inode_init_once(&zi->i_vnode); 416 mutex_init(&zi->i_truncate_mutex); 417 zi->i_wr_refcnt = 0; 418 419 return &zi->i_vnode; 420 } 421 422 static void zonefs_free_inode(struct inode *inode) 423 { 424 kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode)); 425 } 426 427 /* 428 * File system stat. 429 */ 430 static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) 431 { 432 struct super_block *sb = dentry->d_sb; 433 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 434 enum zonefs_ztype t; 435 436 buf->f_type = ZONEFS_MAGIC; 437 buf->f_bsize = sb->s_blocksize; 438 buf->f_namelen = ZONEFS_NAME_MAX; 439 440 spin_lock(&sbi->s_lock); 441 442 buf->f_blocks = sbi->s_blocks; 443 if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) 444 buf->f_bfree = 0; 445 else 446 buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; 447 buf->f_bavail = buf->f_bfree; 448 449 for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { 450 if (sbi->s_zgroup[t].g_nr_zones) 451 buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; 452 } 453 buf->f_ffree = 0; 454 455 spin_unlock(&sbi->s_lock); 456 457 buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); 458 459 return 0; 460 } 461 462 enum { 463 Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, 464 Opt_explicit_open, Opt_err, 465 }; 466 467 static const match_table_t tokens = { 468 { Opt_errors_ro, "errors=remount-ro"}, 469 { Opt_errors_zro, "errors=zone-ro"}, 470 { Opt_errors_zol, "errors=zone-offline"}, 
471 { Opt_errors_repair, "errors=repair"}, 472 { Opt_explicit_open, "explicit-open" }, 473 { Opt_err, NULL} 474 }; 475 476 static int zonefs_parse_options(struct super_block *sb, char *options) 477 { 478 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 479 substring_t args[MAX_OPT_ARGS]; 480 char *p; 481 482 if (!options) 483 return 0; 484 485 while ((p = strsep(&options, ",")) != NULL) { 486 int token; 487 488 if (!*p) 489 continue; 490 491 token = match_token(p, tokens, args); 492 switch (token) { 493 case Opt_errors_ro: 494 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 495 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO; 496 break; 497 case Opt_errors_zro: 498 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 499 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO; 500 break; 501 case Opt_errors_zol: 502 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 503 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL; 504 break; 505 case Opt_errors_repair: 506 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 507 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; 508 break; 509 case Opt_explicit_open: 510 sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; 511 break; 512 default: 513 return -EINVAL; 514 } 515 } 516 517 return 0; 518 } 519 520 static int zonefs_show_options(struct seq_file *seq, struct dentry *root) 521 { 522 struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb); 523 524 if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) 525 seq_puts(seq, ",errors=remount-ro"); 526 if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) 527 seq_puts(seq, ",errors=zone-ro"); 528 if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) 529 seq_puts(seq, ",errors=zone-offline"); 530 if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) 531 seq_puts(seq, ",errors=repair"); 532 533 return 0; 534 } 535 536 static int zonefs_remount(struct super_block *sb, int *flags, char *data) 537 { 538 sync_filesystem(sb); 539 540 return zonefs_parse_options(sb, data); 541 } 542 543 static int zonefs_inode_setattr(struct mnt_idmap 
*idmap, 544 struct dentry *dentry, struct iattr *iattr) 545 { 546 struct inode *inode = d_inode(dentry); 547 int ret; 548 549 if (unlikely(IS_IMMUTABLE(inode))) 550 return -EPERM; 551 552 ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); 553 if (ret) 554 return ret; 555 556 /* 557 * Since files and directories cannot be created nor deleted, do not 558 * allow setting any write attributes on the sub-directories grouping 559 * files by zone type. 560 */ 561 if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && 562 (iattr->ia_mode & 0222)) 563 return -EPERM; 564 565 if (((iattr->ia_valid & ATTR_UID) && 566 !uid_eq(iattr->ia_uid, inode->i_uid)) || 567 ((iattr->ia_valid & ATTR_GID) && 568 !gid_eq(iattr->ia_gid, inode->i_gid))) { 569 ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); 570 if (ret) 571 return ret; 572 } 573 574 if (iattr->ia_valid & ATTR_SIZE) { 575 ret = zonefs_file_truncate(inode, iattr->ia_size); 576 if (ret) 577 return ret; 578 } 579 580 setattr_copy(&nop_mnt_idmap, inode, iattr); 581 582 if (S_ISREG(inode->i_mode)) { 583 struct zonefs_zone *z = zonefs_inode_zone(inode); 584 585 z->z_mode = inode->i_mode; 586 z->z_uid = inode->i_uid; 587 z->z_gid = inode->i_gid; 588 } 589 590 return 0; 591 } 592 593 static const struct inode_operations zonefs_file_inode_operations = { 594 .setattr = zonefs_inode_setattr, 595 }; 596 597 static long zonefs_fname_to_fno(const struct qstr *fname) 598 { 599 const char *name = fname->name; 600 unsigned int len = fname->len; 601 long fno = 0, shift = 1; 602 const char *rname; 603 char c = *name; 604 unsigned int i; 605 606 /* 607 * File names are always a base-10 number string without any 608 * leading 0s. 
609 */ 610 if (!isdigit(c)) 611 return -ENOENT; 612 613 if (len > 1 && c == '0') 614 return -ENOENT; 615 616 if (len == 1) 617 return c - '0'; 618 619 for (i = 0, rname = name + len - 1; i < len; i++, rname--) { 620 c = *rname; 621 if (!isdigit(c)) 622 return -ENOENT; 623 fno += (c - '0') * shift; 624 shift *= 10; 625 } 626 627 return fno; 628 } 629 630 static struct inode *zonefs_get_file_inode(struct inode *dir, 631 struct dentry *dentry) 632 { 633 struct zonefs_zone_group *zgroup = dir->i_private; 634 struct super_block *sb = dir->i_sb; 635 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 636 struct zonefs_zone *z; 637 struct inode *inode; 638 ino_t ino; 639 long fno; 640 641 /* Get the file number from the file name */ 642 fno = zonefs_fname_to_fno(&dentry->d_name); 643 if (fno < 0) 644 return ERR_PTR(fno); 645 646 if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) 647 return ERR_PTR(-ENOENT); 648 649 z = &zgroup->g_zones[fno]; 650 ino = z->z_sector >> sbi->s_zone_sectors_shift; 651 inode = iget_locked(sb, ino); 652 if (!inode) 653 return ERR_PTR(-ENOMEM); 654 if (!(inode->i_state & I_NEW)) { 655 WARN_ON_ONCE(inode->i_private != z); 656 return inode; 657 } 658 659 inode->i_ino = ino; 660 inode->i_mode = z->z_mode; 661 inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; 662 inode->i_uid = z->z_uid; 663 inode->i_gid = z->z_gid; 664 inode->i_size = z->z_wpoffset; 665 inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; 666 inode->i_private = z; 667 668 inode->i_op = &zonefs_file_inode_operations; 669 inode->i_fop = &zonefs_file_operations; 670 inode->i_mapping->a_ops = &zonefs_file_aops; 671 672 /* Update the inode access rights depending on the zone condition */ 673 zonefs_inode_update_mode(inode); 674 675 unlock_new_inode(inode); 676 677 return inode; 678 } 679 680 static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, 681 enum zonefs_ztype ztype) 682 { 683 struct inode *root = d_inode(sb->s_root); 684 struct zonefs_sb_info *sbi = 
ZONEFS_SB(sb); 685 struct inode *inode; 686 ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; 687 688 inode = iget_locked(sb, ino); 689 if (!inode) 690 return ERR_PTR(-ENOMEM); 691 if (!(inode->i_state & I_NEW)) 692 return inode; 693 694 inode->i_ino = ino; 695 inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); 696 inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; 697 inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; 698 inode->i_private = &sbi->s_zgroup[ztype]; 699 set_nlink(inode, 2); 700 701 inode->i_op = &zonefs_dir_inode_operations; 702 inode->i_fop = &zonefs_dir_operations; 703 704 unlock_new_inode(inode); 705 706 return inode; 707 } 708 709 710 static struct inode *zonefs_get_dir_inode(struct inode *dir, 711 struct dentry *dentry) 712 { 713 struct super_block *sb = dir->i_sb; 714 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 715 const char *name = dentry->d_name.name; 716 enum zonefs_ztype ztype; 717 718 /* 719 * We only need to check for the "seq" directory and 720 * the "cnv" directory if we have conventional zones. 
721 */ 722 if (dentry->d_name.len != 3) 723 return ERR_PTR(-ENOENT); 724 725 for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { 726 if (sbi->s_zgroup[ztype].g_nr_zones && 727 memcmp(name, zonefs_zgroup_name(ztype), 3) == 0) 728 break; 729 } 730 if (ztype == ZONEFS_ZTYPE_MAX) 731 return ERR_PTR(-ENOENT); 732 733 return zonefs_get_zgroup_inode(sb, ztype); 734 } 735 736 static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, 737 unsigned int flags) 738 { 739 struct inode *inode; 740 741 if (dentry->d_name.len > ZONEFS_NAME_MAX) 742 return ERR_PTR(-ENAMETOOLONG); 743 744 if (dir == d_inode(dir->i_sb->s_root)) 745 inode = zonefs_get_dir_inode(dir, dentry); 746 else 747 inode = zonefs_get_file_inode(dir, dentry); 748 if (IS_ERR(inode)) 749 return ERR_CAST(inode); 750 751 return d_splice_alias(inode, dentry); 752 } 753 754 static int zonefs_readdir_root(struct file *file, struct dir_context *ctx) 755 { 756 struct inode *inode = file_inode(file); 757 struct super_block *sb = inode->i_sb; 758 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 759 enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV; 760 ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1; 761 762 if (ctx->pos >= inode->i_size) 763 return 0; 764 765 if (!dir_emit_dots(file, ctx)) 766 return 0; 767 768 if (ctx->pos == 2) { 769 if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) 770 ztype = ZONEFS_ZTYPE_SEQ; 771 772 if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, 773 base_ino + ztype, DT_DIR)) 774 return 0; 775 ctx->pos++; 776 } 777 778 if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) { 779 ztype = ZONEFS_ZTYPE_SEQ; 780 if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, 781 base_ino + ztype, DT_DIR)) 782 return 0; 783 ctx->pos++; 784 } 785 786 return 0; 787 } 788 789 static int zonefs_readdir_zgroup(struct file *file, 790 struct dir_context *ctx) 791 { 792 struct inode *inode = file_inode(file); 793 struct zonefs_zone_group *zgroup = inode->i_private; 794 struct super_block *sb = inode->i_sb; 795 struct 
zonefs_sb_info *sbi = ZONEFS_SB(sb); 796 struct zonefs_zone *z; 797 int fname_len; 798 char *fname; 799 ino_t ino; 800 int f; 801 802 /* 803 * The size of zone group directories is equal to the number 804 * of zone files in the group and does note include the "." and 805 * ".." entries. Hence the "+ 2" here. 806 */ 807 if (ctx->pos >= inode->i_size + 2) 808 return 0; 809 810 if (!dir_emit_dots(file, ctx)) 811 return 0; 812 813 fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); 814 if (!fname) 815 return -ENOMEM; 816 817 for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) { 818 z = &zgroup->g_zones[f]; 819 ino = z->z_sector >> sbi->s_zone_sectors_shift; 820 fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f); 821 if (!dir_emit(ctx, fname, fname_len, ino, DT_REG)) 822 break; 823 ctx->pos++; 824 } 825 826 kfree(fname); 827 828 return 0; 829 } 830 831 static int zonefs_readdir(struct file *file, struct dir_context *ctx) 832 { 833 struct inode *inode = file_inode(file); 834 835 if (inode == d_inode(inode->i_sb->s_root)) 836 return zonefs_readdir_root(file, ctx); 837 838 return zonefs_readdir_zgroup(file, ctx); 839 } 840 841 const struct inode_operations zonefs_dir_inode_operations = { 842 .lookup = zonefs_lookup, 843 .setattr = zonefs_inode_setattr, 844 }; 845 846 const struct file_operations zonefs_dir_operations = { 847 .llseek = generic_file_llseek, 848 .read = generic_read_dir, 849 .iterate_shared = zonefs_readdir, 850 }; 851 852 struct zonefs_zone_data { 853 struct super_block *sb; 854 unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; 855 sector_t cnv_zone_start; 856 struct blk_zone *zones; 857 }; 858 859 static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, 860 void *data) 861 { 862 struct zonefs_zone_data *zd = data; 863 struct super_block *sb = zd->sb; 864 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 865 866 /* 867 * We do not care about the first zone: it contains the super block 868 * and not exposed as a file. 
869 */ 870 if (!idx) 871 return 0; 872 873 /* 874 * Count the number of zones that will be exposed as files. 875 * For sequential zones, we always have as many files as zones. 876 * FOr conventional zones, the number of files depends on if we have 877 * conventional zones aggregation enabled. 878 */ 879 switch (zone->type) { 880 case BLK_ZONE_TYPE_CONVENTIONAL: 881 if (sbi->s_features & ZONEFS_F_AGGRCNV) { 882 /* One file per set of contiguous conventional zones */ 883 if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || 884 zone->start != zd->cnv_zone_start) 885 sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; 886 zd->cnv_zone_start = zone->start + zone->len; 887 } else { 888 /* One file per zone */ 889 sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; 890 } 891 break; 892 case BLK_ZONE_TYPE_SEQWRITE_REQ: 893 case BLK_ZONE_TYPE_SEQWRITE_PREF: 894 sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; 895 break; 896 default: 897 zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", 898 zone->type); 899 return -EIO; 900 } 901 902 memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); 903 904 return 0; 905 } 906 907 static int zonefs_get_zone_info(struct zonefs_zone_data *zd) 908 { 909 struct block_device *bdev = zd->sb->s_bdev; 910 int ret; 911 912 zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), 913 GFP_KERNEL); 914 if (!zd->zones) 915 return -ENOMEM; 916 917 /* Get zones information from the device */ 918 ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, 919 zonefs_get_zone_info_cb, zd); 920 if (ret < 0) { 921 zonefs_err(zd->sb, "Zone report failed %d\n", ret); 922 return ret; 923 } 924 925 if (ret != bdev_nr_zones(bdev)) { 926 zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", 927 ret, bdev_nr_zones(bdev)); 928 return -EIO; 929 } 930 931 return 0; 932 } 933 934 static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) 935 { 936 kvfree(zd->zones); 937 } 938 939 /* 940 * Create a zone group and populate it with zone files. 
*/
/*
 * zonefs_init_zgroup() - initialize the zone group @ztype of @sb.
 *
 * Allocates the group zone array and fills one entry per file of the group
 * from the device zone report saved in @zd. Returns 0 on success (including
 * for an empty group), -ENOMEM if the zone array cannot be allocated,
 * -EINVAL if the zone report is inconsistent with the counted number of
 * zones or zone size, or the error of a failed zone close operation. On
 * error, the zone array is not freed here.
 */
static int zonefs_init_zgroup(struct super_block *sb,
			      struct zonefs_zone_data *zd,
			      enum zonefs_ztype ztype)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
	struct blk_zone *zone, *next, *end;
	struct zonefs_zone *z;
	unsigned int n = 0;
	int ret;

	/* Allocate the zone group. If it is empty, we have nothing to do. */
	if (!zgroup->g_nr_zones)
		return 0;

	zgroup->g_zones = kvcalloc(zgroup->g_nr_zones,
				   sizeof(struct zonefs_zone), GFP_KERNEL);
	if (!zgroup->g_zones)
		return -ENOMEM;

	/*
	 * Initialize the zone groups using the device zone information.
	 * We always skip the first zone as it contains the super block
	 * and is not used to back a file.
	 */
	end = zd->zones + bdev_nr_zones(sb->s_bdev);
	for (zone = &zd->zones[1]; zone < end; zone = next) {

		/* next may be advanced further below by zone aggregation */
		next = zone + 1;
		if (zonefs_zone_type(zone) != ztype)
			continue;

		/* More zones of this type than counted from the zone report */
		if (WARN_ON_ONCE(n >= zgroup->g_nr_zones))
			return -EINVAL;

		/*
		 * For conventional zones, contiguous zones can be aggregated
		 * together to form larger files. Note that this overwrites the
		 * length of the first zone of the set of contiguous zones
		 * aggregated together. If one offline or read-only zone is
		 * found, assume that all zones aggregated have the same
		 * condition.
		 */
		if (ztype == ZONEFS_ZTYPE_CNV &&
		    (sbi->s_features & ZONEFS_F_AGGRCNV)) {
			for (; next < end; next++) {
				if (zonefs_zone_type(next) != ztype)
					break;
				zone->len += next->len;
				zone->capacity += next->capacity;
				/* Offline takes precedence over read-only */
				if (next->cond == BLK_ZONE_COND_READONLY &&
				    zone->cond != BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_READONLY;
				else if (next->cond == BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_OFFLINE;
			}
		}

		z = &zgroup->g_zones[n];
		if (ztype == ZONEFS_ZTYPE_CNV)
			z->z_flags |= ZONEFS_ZONE_CNV;
		z->z_sector = zone->start;
		z->z_size = zone->len << SECTOR_SHIFT;
		/* Only aggregated files may be larger than a device zone */
		if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
		    !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
			zonefs_err(sb,
				"Invalid zone size %llu (device zone sectors %llu)\n",
				z->z_size,
				bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
			return -EINVAL;
		}

		z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE,
				      zone->capacity << SECTOR_SHIFT);
		z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone);

		z->z_mode = S_IFREG | sbi->s_perm;
		z->z_uid = sbi->s_uid;
		z->z_gid = sbi->s_gid;

		/*
		 * Let zonefs_inode_update_mode() know that we will need
		 * special initialization of the inode mode the first time
		 * it is accessed.
		 */
		z->z_flags |= ZONEFS_ZONE_INIT_MODE;

		/* Update the file system wide limits and block counts */
		sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes);
		sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits;
		sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits;

		/*
		 * For sequential zones, make sure that any open zone is closed
		 * first to ensure that the initial number of open zones is 0,
		 * in sync with the open zone accounting done when the mount
		 * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
		 */
		if (ztype == ZONEFS_ZTYPE_SEQ &&
		    (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
		     zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
			ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE);
			if (ret)
				return ret;
		}

		zonefs_account_active(sb, z);

		n++;
	}

	/* Fewer zones of this type than counted from the zone report */
	if (WARN_ON_ONCE(n != zgroup->g_nr_zones))
		return -EINVAL;

	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
		    zonefs_zgroup_name(ztype),
		    zgroup->g_nr_zones,
		    zgroup->g_nr_zones > 1 ? "s" : "");

	return 0;
}

/*
 * Free the zone arrays of all zone groups. Safe to call when the super block
 * information is not set and when a group was never allocated.
 */
static void zonefs_free_zgroups(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	enum zonefs_ztype ztype;

	if (!sbi)
		return;

	for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
		kvfree(sbi->s_zgroup[ztype].g_zones);
		/* Clear the pointer so that a second call is harmless */
		sbi->s_zgroup[ztype].g_zones = NULL;
	}
}

/*
 * Get the device zone report and use it to initialize all zone groups.
 * The zone report copy is always freed before returning. On error, any zone
 * group already allocated is freed too and the error code is returned.
 */
static int zonefs_init_zgroups(struct super_block *sb)
{
	struct zonefs_zone_data zd;
	enum zonefs_ztype ztype;
	int ret;

	/* First get the device zone information */
	memset(&zd, 0, sizeof(struct zonefs_zone_data));
	zd.sb = sb;
	ret = zonefs_get_zone_info(&zd);
	if (ret)
		goto cleanup;

	/* Allocate and initialize the zone groups */
	for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
		ret = zonefs_init_zgroup(sb, &zd, ztype);
		if (ret) {
			zonefs_info(sb,
				    "Zone group \"%s\" initialization failed\n",
				    zonefs_zgroup_name(ztype));
			break;
		}
	}

cleanup:
	zonefs_free_zone_info(&zd);
	if (ret)
		zonefs_free_zgroups(sb);

	return ret;
}

/*
 * Read super block information from the device.
1114 */ 1115 static int zonefs_read_super(struct super_block *sb) 1116 { 1117 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 1118 struct zonefs_super *super; 1119 u32 crc, stored_crc; 1120 struct page *page; 1121 struct bio_vec bio_vec; 1122 struct bio bio; 1123 int ret; 1124 1125 page = alloc_page(GFP_KERNEL); 1126 if (!page) 1127 return -ENOMEM; 1128 1129 bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); 1130 bio.bi_iter.bi_sector = 0; 1131 __bio_add_page(&bio, page, PAGE_SIZE, 0); 1132 1133 ret = submit_bio_wait(&bio); 1134 if (ret) 1135 goto free_page; 1136 1137 super = page_address(page); 1138 1139 ret = -EINVAL; 1140 if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) 1141 goto free_page; 1142 1143 stored_crc = le32_to_cpu(super->s_crc); 1144 super->s_crc = 0; 1145 crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super)); 1146 if (crc != stored_crc) { 1147 zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", 1148 crc, stored_crc); 1149 goto free_page; 1150 } 1151 1152 sbi->s_features = le64_to_cpu(super->s_features); 1153 if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { 1154 zonefs_err(sb, "Unknown features set 0x%llx\n", 1155 sbi->s_features); 1156 goto free_page; 1157 } 1158 1159 if (sbi->s_features & ZONEFS_F_UID) { 1160 sbi->s_uid = make_kuid(current_user_ns(), 1161 le32_to_cpu(super->s_uid)); 1162 if (!uid_valid(sbi->s_uid)) { 1163 zonefs_err(sb, "Invalid UID feature\n"); 1164 goto free_page; 1165 } 1166 } 1167 1168 if (sbi->s_features & ZONEFS_F_GID) { 1169 sbi->s_gid = make_kgid(current_user_ns(), 1170 le32_to_cpu(super->s_gid)); 1171 if (!gid_valid(sbi->s_gid)) { 1172 zonefs_err(sb, "Invalid GID feature\n"); 1173 goto free_page; 1174 } 1175 } 1176 1177 if (sbi->s_features & ZONEFS_F_PERM) 1178 sbi->s_perm = le32_to_cpu(super->s_perm); 1179 1180 if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { 1181 zonefs_err(sb, "Reserved area is being used\n"); 1182 goto free_page; 1183 } 1184 1185 import_uuid(&sbi->s_uuid, 
super->s_uuid); 1186 ret = 0; 1187 1188 free_page: 1189 __free_page(page); 1190 1191 return ret; 1192 } 1193 1194 static const struct super_operations zonefs_sops = { 1195 .alloc_inode = zonefs_alloc_inode, 1196 .free_inode = zonefs_free_inode, 1197 .statfs = zonefs_statfs, 1198 .remount_fs = zonefs_remount, 1199 .show_options = zonefs_show_options, 1200 }; 1201 1202 static int zonefs_get_zgroup_inodes(struct super_block *sb) 1203 { 1204 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 1205 struct inode *dir_inode; 1206 enum zonefs_ztype ztype; 1207 1208 for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { 1209 if (!sbi->s_zgroup[ztype].g_nr_zones) 1210 continue; 1211 1212 dir_inode = zonefs_get_zgroup_inode(sb, ztype); 1213 if (IS_ERR(dir_inode)) 1214 return PTR_ERR(dir_inode); 1215 1216 sbi->s_zgroup[ztype].g_inode = dir_inode; 1217 } 1218 1219 return 0; 1220 } 1221 1222 static void zonefs_release_zgroup_inodes(struct super_block *sb) 1223 { 1224 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 1225 enum zonefs_ztype ztype; 1226 1227 if (!sbi) 1228 return; 1229 1230 for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { 1231 if (sbi->s_zgroup[ztype].g_inode) { 1232 iput(sbi->s_zgroup[ztype].g_inode); 1233 sbi->s_zgroup[ztype].g_inode = NULL; 1234 } 1235 } 1236 } 1237 1238 /* 1239 * Check that the device is zoned. If it is, get the list of zones and create 1240 * sub-directories and files according to the device zone configuration and 1241 * format options. 
1242 */ 1243 static int zonefs_fill_super(struct super_block *sb, void *data, int silent) 1244 { 1245 struct zonefs_sb_info *sbi; 1246 struct inode *inode; 1247 enum zonefs_ztype ztype; 1248 int ret; 1249 1250 if (!bdev_is_zoned(sb->s_bdev)) { 1251 zonefs_err(sb, "Not a zoned block device\n"); 1252 return -EINVAL; 1253 } 1254 1255 /* 1256 * Initialize super block information: the maximum file size is updated 1257 * when the zone files are created so that the format option 1258 * ZONEFS_F_AGGRCNV which increases the maximum file size of a file 1259 * beyond the zone size is taken into account. 1260 */ 1261 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1262 if (!sbi) 1263 return -ENOMEM; 1264 1265 spin_lock_init(&sbi->s_lock); 1266 sb->s_fs_info = sbi; 1267 sb->s_magic = ZONEFS_MAGIC; 1268 sb->s_maxbytes = 0; 1269 sb->s_op = &zonefs_sops; 1270 sb->s_time_gran = 1; 1271 1272 /* 1273 * The block size is set to the device zone write granularity to ensure 1274 * that write operations are always aligned according to the device 1275 * interface constraints. 
1276 */ 1277 sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev)); 1278 sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); 1279 sbi->s_uid = GLOBAL_ROOT_UID; 1280 sbi->s_gid = GLOBAL_ROOT_GID; 1281 sbi->s_perm = 0640; 1282 sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; 1283 1284 atomic_set(&sbi->s_wro_seq_files, 0); 1285 sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); 1286 atomic_set(&sbi->s_active_seq_files, 0); 1287 sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); 1288 1289 ret = zonefs_read_super(sb); 1290 if (ret) 1291 return ret; 1292 1293 ret = zonefs_parse_options(sb, data); 1294 if (ret) 1295 return ret; 1296 1297 zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); 1298 1299 if (!sbi->s_max_wro_seq_files && 1300 !sbi->s_max_active_seq_files && 1301 sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 1302 zonefs_info(sb, 1303 "No open and active zone limits. Ignoring explicit_open mount option\n"); 1304 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; 1305 } 1306 1307 /* Initialize the zone groups */ 1308 ret = zonefs_init_zgroups(sb); 1309 if (ret) 1310 goto cleanup; 1311 1312 /* Create the root directory inode */ 1313 ret = -ENOMEM; 1314 inode = new_inode(sb); 1315 if (!inode) 1316 goto cleanup; 1317 1318 inode->i_ino = bdev_nr_zones(sb->s_bdev); 1319 inode->i_mode = S_IFDIR | 0555; 1320 inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); 1321 inode->i_op = &zonefs_dir_inode_operations; 1322 inode->i_fop = &zonefs_dir_operations; 1323 inode->i_size = 2; 1324 set_nlink(inode, 2); 1325 for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { 1326 if (sbi->s_zgroup[ztype].g_nr_zones) { 1327 inc_nlink(inode); 1328 inode->i_size++; 1329 } 1330 } 1331 1332 sb->s_root = d_make_root(inode); 1333 if (!sb->s_root) 1334 goto cleanup; 1335 1336 /* 1337 * Take a reference on the zone groups directory inodes 1338 * to keep them in the inode cache. 
1339 */ 1340 ret = zonefs_get_zgroup_inodes(sb); 1341 if (ret) 1342 goto cleanup; 1343 1344 ret = zonefs_sysfs_register(sb); 1345 if (ret) 1346 goto cleanup; 1347 1348 return 0; 1349 1350 cleanup: 1351 zonefs_release_zgroup_inodes(sb); 1352 zonefs_free_zgroups(sb); 1353 1354 return ret; 1355 } 1356 1357 static struct dentry *zonefs_mount(struct file_system_type *fs_type, 1358 int flags, const char *dev_name, void *data) 1359 { 1360 return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super); 1361 } 1362 1363 static void zonefs_kill_super(struct super_block *sb) 1364 { 1365 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 1366 1367 /* Release the reference on the zone group directory inodes */ 1368 zonefs_release_zgroup_inodes(sb); 1369 1370 kill_block_super(sb); 1371 1372 zonefs_sysfs_unregister(sb); 1373 zonefs_free_zgroups(sb); 1374 kfree(sbi); 1375 } 1376 1377 /* 1378 * File system definition and registration. 1379 */ 1380 static struct file_system_type zonefs_type = { 1381 .owner = THIS_MODULE, 1382 .name = "zonefs", 1383 .mount = zonefs_mount, 1384 .kill_sb = zonefs_kill_super, 1385 .fs_flags = FS_REQUIRES_DEV, 1386 }; 1387 1388 static int __init zonefs_init_inodecache(void) 1389 { 1390 zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache", 1391 sizeof(struct zonefs_inode_info), 0, 1392 (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), 1393 NULL); 1394 if (zonefs_inode_cachep == NULL) 1395 return -ENOMEM; 1396 return 0; 1397 } 1398 1399 static void zonefs_destroy_inodecache(void) 1400 { 1401 /* 1402 * Make sure all delayed rcu free inodes are flushed before we 1403 * destroy the inode cache. 
1404 */ 1405 rcu_barrier(); 1406 kmem_cache_destroy(zonefs_inode_cachep); 1407 } 1408 1409 static int __init zonefs_init(void) 1410 { 1411 int ret; 1412 1413 BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); 1414 1415 ret = zonefs_file_bioset_init(); 1416 if (ret) 1417 return ret; 1418 1419 ret = zonefs_init_inodecache(); 1420 if (ret) 1421 goto destroy_bioset; 1422 1423 ret = zonefs_sysfs_init(); 1424 if (ret) 1425 goto destroy_inodecache; 1426 1427 ret = register_filesystem(&zonefs_type); 1428 if (ret) 1429 goto sysfs_exit; 1430 1431 return 0; 1432 1433 sysfs_exit: 1434 zonefs_sysfs_exit(); 1435 destroy_inodecache: 1436 zonefs_destroy_inodecache(); 1437 destroy_bioset: 1438 zonefs_file_bioset_exit(); 1439 1440 return ret; 1441 } 1442 1443 static void __exit zonefs_exit(void) 1444 { 1445 unregister_filesystem(&zonefs_type); 1446 zonefs_sysfs_exit(); 1447 zonefs_destroy_inodecache(); 1448 zonefs_file_bioset_exit(); 1449 } 1450 1451 MODULE_AUTHOR("Damien Le Moal"); 1452 MODULE_DESCRIPTION("Zone file system for zoned block devices"); 1453 MODULE_LICENSE("GPL"); 1454 MODULE_ALIAS_FS("zonefs"); 1455 module_init(zonefs_init); 1456 module_exit(zonefs_exit); 1457