/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"

STATIC void	xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
STATIC int	xfs_uuid_mount(xfs_mount_t *);
STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
STATIC void	xfs_unmountfs_wait(xfs_mount_t *);


#ifdef HAVE_PERCPU_SB
STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
						int, int);
STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
						int64_t, int);
STATIC int	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);

#else

#define xfs_icsb_destroy_counters(mp)			do { } while (0)
#define xfs_icsb_balance_counter(mp, a, b, c)		do { } while (0)
#define xfs_icsb_sync_counters(mp)			do { } while (0)
#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)

#endif

static const struct {
	short offset;
	short type;	/* 0 = integer
			 * 1 = binary / string (no translation)
			 */
} xfs_sb_info[] = {
	{ offsetof(xfs_sb_t, sb_magicnum), 0 },
	{ offsetof(xfs_sb_t, sb_blocksize), 0 },
	{ offsetof(xfs_sb_t, sb_dblocks), 0 },
	{ offsetof(xfs_sb_t, sb_rblocks), 0 },
	{ offsetof(xfs_sb_t, sb_rextents), 0 },
	{ offsetof(xfs_sb_t, sb_uuid), 1 },
	{ offsetof(xfs_sb_t, sb_logstart), 0 },
	{ offsetof(xfs_sb_t, sb_rootino), 0 },
	{ offsetof(xfs_sb_t, sb_rbmino), 0 },
	{ offsetof(xfs_sb_t, sb_rsumino), 0 },
	{ offsetof(xfs_sb_t, sb_rextsize), 0 },
	{ offsetof(xfs_sb_t, sb_agblocks), 0 },
	{ offsetof(xfs_sb_t, sb_agcount), 0 },
	{ offsetof(xfs_sb_t, sb_rbmblocks), 0 },
	{ offsetof(xfs_sb_t, sb_logblocks), 0 },
	{ offsetof(xfs_sb_t, sb_versionnum), 0 },
	{ offsetof(xfs_sb_t, sb_sectsize), 0 },
	{ offsetof(xfs_sb_t, sb_inodesize), 0 },
	{ offsetof(xfs_sb_t, sb_inopblock), 0 },
	{ offsetof(xfs_sb_t, sb_fname[0]), 1 },
	{ offsetof(xfs_sb_t, sb_blocklog), 0 },
	{ offsetof(xfs_sb_t, sb_sectlog), 0 },
	{ offsetof(xfs_sb_t, sb_inodelog), 0 },
	{ offsetof(xfs_sb_t, sb_inopblog), 0 },
	{ offsetof(xfs_sb_t, sb_agblklog), 0 },
	{ offsetof(xfs_sb_t, sb_rextslog), 0 },
	{ offsetof(xfs_sb_t, sb_inprogress), 0 },
	{ offsetof(xfs_sb_t, sb_imax_pct), 0 },
	{ offsetof(xfs_sb_t, sb_icount), 0 },
	{ offsetof(xfs_sb_t, sb_ifree), 0 },
	{ offsetof(xfs_sb_t, sb_fdblocks), 0 },
	{ offsetof(xfs_sb_t, sb_frextents), 0 },
	{ offsetof(xfs_sb_t, sb_uquotino), 0 },
	{ offsetof(xfs_sb_t, sb_gquotino), 0 },
	{ offsetof(xfs_sb_t, sb_qflags), 0 },
	{ offsetof(xfs_sb_t, sb_flags), 0 },
	{ offsetof(xfs_sb_t, sb_shared_vn), 0 },
	{ offsetof(xfs_sb_t, sb_inoalignmt), 0 },
	{ offsetof(xfs_sb_t, sb_unit), 0 },
	{ offsetof(xfs_sb_t, sb_width), 0 },
	{ offsetof(xfs_sb_t, sb_dirblklog), 0 },
	{ offsetof(xfs_sb_t, sb_logsectlog), 0 },
	{ offsetof(xfs_sb_t, sb_logsectsize), 0 },
	{ offsetof(xfs_sb_t, sb_logsunit), 0 },
	{ offsetof(xfs_sb_t, sb_features2), 0 },
	{ sizeof(xfs_sb_t), 0 }
};
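/*
 * Illustrative sketch (not part of the original source): the table above
 * drives xfs_xlatesb() below.  Because the entries are listed in on-disk
 * order and terminated by { sizeof(xfs_sb_t), 0 }, the byte size of any
 * field f falls out of consecutive offsets:
 *
 *	int first = xfs_sb_info[f].offset;
 *	int size  = xfs_sb_info[f + 1].offset - first;
 *
 * For XFS_SBS_MAGICNUM (a __uint32_t) this yields size == 4, which selects
 * the 4-byte INT_XLATE() case in xfs_xlatesb().
 */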
/*
 * Return a pointer to an initialized xfs_mount structure.
 */
xfs_mount_t *
xfs_mount_init(void)
{
	xfs_mount_t *mp;

	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);

	if (xfs_icsb_init_counters(mp)) {
		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
	}

	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
	spinlock_init(&mp->m_sb_lock, "xfs_sb");
	mutex_init(&mp->m_ilock);
	initnsema(&mp->m_growlock, 1, "xfs_grow");
	/*
	 * Initialize the AIL.
	 */
	xfs_trans_ail_init(mp);

	atomic_set(&mp->m_active_trans, 0);

	return mp;
}

/*
 * Free up the resources associated with a mount structure.  Assume that
 * the structure was initially zeroed, so we can tell which fields got
 * initialized.
 */
void
xfs_mount_free(
	xfs_mount_t	*mp,
	int		remove_bhv)
{
	if (mp->m_ihash)
		xfs_ihash_free(mp);
	if (mp->m_chash)
		xfs_chash_free(mp);

	if (mp->m_perag) {
		int	agno;

		for (agno = 0; agno < mp->m_maxagi; agno++)
			if (mp->m_perag[agno].pagb_list)
				kmem_free(mp->m_perag[agno].pagb_list,
					  sizeof(xfs_perag_busy_t) *
						XFS_PAGB_NUM_SLOTS);
		kmem_free(mp->m_perag,
			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
	}

	AIL_LOCK_DESTROY(&mp->m_ail_lock);
	spinlock_destroy(&mp->m_sb_lock);
	mutex_destroy(&mp->m_ilock);
	freesema(&mp->m_growlock);
	if (mp->m_quotainfo)
		XFS_QM_DONE(mp);

	if (mp->m_fsname != NULL)
		kmem_free(mp->m_fsname, mp->m_fsname_len);
	if (mp->m_rtname != NULL)
		kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
	if (mp->m_logname != NULL)
		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);

	if (remove_bhv) {
		struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);

		bhv_remove_all_vfsops(vfsp, 0);
		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
	}

	xfs_icsb_destroy_counters(mp);
	kmem_free(mp, sizeof(xfs_mount_t));
}
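/*
 * Illustrative sketch (not part of the original source): the expected
 * lifecycle pairs the two routines above, with xfs_mountfs() and
 * xfs_unmountfs() in between.  A hypothetical caller might do, roughly:
 *
 *	mp = xfs_mount_init();
 *	error = xfs_mountfs(vfsp, mp, mfsi_flags);
 *	if (error)
 *		xfs_mount_free(mp, 1);	// mount failed; free directly
 *	...
 *	xfs_unmountfs(mp, cr);		// calls xfs_mount_free() itself
 */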
/*
 * Check size of device based on the (data/realtime) block count.
 * Note: this check is used by the growfs code as well as mount.
 */
int
xfs_sb_validate_fsb_count(
	xfs_sb_t	*sbp,
	__uint64_t	nblocks)
{
	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
	ASSERT(sbp->sb_blocklog >= BBSHIFT);

#if XFS_BIG_BLKNOS	/* Limited by ULONG_MAX of page cache index */
	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
		return E2BIG;
#else			/* Limited by UINT_MAX of sectors */
	if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
		return E2BIG;
#endif
	return 0;
}
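/*
 * Worked example (hypothetical geometry, not part of the original source):
 * with 4096-byte blocks (sb_blocklog == 12) and 4096-byte pages
 * (PAGE_CACHE_SHIFT == 12), the XFS_BIG_BLKNOS check reduces to
 * "nblocks > ULONG_MAX", i.e. one page cache index per filesystem block.
 * Without XFS_BIG_BLKNOS the limit is the 32-bit sector count:
 * nblocks << (12 - BBSHIFT) == nblocks * 8 sectors must not exceed
 * UINT_MAX, capping the device at roughly 2TB of 512-byte sectors.
 */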
/*
 * Check the validity of the SB found.
 */
STATIC int
xfs_mount_validate_sb(
	xfs_mount_t	*mp,
	xfs_sb_t	*sbp,
	int		flags)
{
	/*
	 * If the log device and data device have the
	 * same device number, the log is internal.
	 * Consequently, the sb_logstart should be non-zero.  If
	 * we have a zero sb_logstart in this case, we may be trying to mount
	 * a volume filesystem in a non-volume manner.
	 */
	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		xfs_fs_mount_cmn_err(flags, "bad magic number");
		return XFS_ERROR(EWRONGFS);
	}

	if (!XFS_SB_GOOD_VERSION(sbp)) {
		xfs_fs_mount_cmn_err(flags, "bad version");
		return XFS_ERROR(EWRONGFS);
	}

	if (unlikely(
	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
		xfs_fs_mount_cmn_err(flags,
			"filesystem is marked as having an external log; "
			"specify logdev on the\nmount command line.");
		return XFS_ERROR(EINVAL);
	}

	if (unlikely(
	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
		xfs_fs_mount_cmn_err(flags,
			"filesystem is marked as having an internal log; "
			"do not specify logdev on\nthe mount command line.");
		return XFS_ERROR(EINVAL);
	}

	/*
	 * More sanity checking.  These were stolen directly from
	 * xfs_repair.
	 */
	if (unlikely(
	    sbp->sb_agcount <= 0					||
	    sbp->sb_sectsize < XFS_MIN_SECTORSIZE			||
	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
		xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Sanity check AG count, size fields against data size field
	 */
	if (unlikely(
	    sbp->sb_dblocks == 0 ||
	    sbp->sb_dblocks >
	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
		xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed");
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
		xfs_fs_mount_cmn_err(flags,
			"file system too large to be mounted on this system.");
		return XFS_ERROR(E2BIG);
	}

	if (unlikely(sbp->sb_inprogress)) {
		xfs_fs_mount_cmn_err(flags, "file system busy");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Version 1 directory format has never worked on Linux.
	 */
	if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) {
		xfs_fs_mount_cmn_err(flags,
			"file system using version 1 directory format");
		return XFS_ERROR(ENOSYS);
	}

	/*
	 * Until this is fixed only page-sized or smaller data blocks work.
	 */
	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
		xfs_fs_mount_cmn_err(flags,
			"file system with blocksize %d bytes",
			sbp->sb_blocksize);
		xfs_fs_mount_cmn_err(flags,
			"only pagesize (%ld) or less will currently work.",
			PAGE_SIZE);
		return XFS_ERROR(ENOSYS);
	}

	return 0;
}
xfs_agnumber_t
xfs_initialize_perag(
	bhv_vfs_t	*vfs,
	xfs_mount_t	*mp,
	xfs_agnumber_t	agcount)
{
	xfs_agnumber_t	index, max_metadata;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;
	xfs_ino_t	ino;
	xfs_sb_t	*sbp = &mp->m_sb;
	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;

	/* Check to see if the filesystem can overflow 32 bit inodes */
	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);

	/* Clear the mount flag if no inode can overflow 32 bits
	 * on this filesystem, or if specifically requested.
	 */
	if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) {
		mp->m_flags |= XFS_MOUNT_32BITINODES;
	} else {
		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
	}

	/* If we can overflow then setup the ag headers accordingly */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		/* Calculate how much should be reserved for inodes to
		 * meet the max inode percentage.
		 */
		if (mp->m_maxicount) {
			__uint64_t	icount;

			icount = sbp->sb_dblocks * sbp->sb_imax_pct;
			do_div(icount, 100);
			icount += sbp->sb_agblocks - 1;
			do_div(icount, sbp->sb_agblocks);
			max_metadata = icount;
		} else {
			max_metadata = agcount;
		}
		for (index = 0; index < agcount; index++) {
			ino = XFS_AGINO_TO_INO(mp, index, agino);
			if (ino > max_inum) {
				index++;
				break;
			}

			/* This ag is preferred for inodes */
			pag = &mp->m_perag[index];
			pag->pagi_inodeok = 1;
			if (index < max_metadata)
				pag->pagf_metadata = 1;
		}
	} else {
		/* Setup default behavior for smaller filesystems */
		for (index = 0; index < agcount; index++) {
			pag = &mp->m_perag[index];
			pag->pagi_inodeok = 1;
		}
	}
	return index;
}

/*
 * xfs_xlatesb
 *
 *	data	- on disk version of sb
 *	sb	- a superblock
 *	dir	- conversion direction: <0 - convert sb to buf
 *				        >0 - convert buf to sb
 *	fields	- which fields to copy (bitmask)
 */
void
xfs_xlatesb(
	void		*data,
	xfs_sb_t	*sb,
	int		dir,
	__int64_t	fields)
{
	xfs_caddr_t	buf_ptr;
	xfs_caddr_t	mem_ptr;
	xfs_sb_field_t	f;
	int		first;
	int		size;

	ASSERT(dir);
	ASSERT(fields);

	if (!fields)
		return;

	buf_ptr = (xfs_caddr_t)data;
	mem_ptr = (xfs_caddr_t)sb;

	while (fields) {
		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
		first = xfs_sb_info[f].offset;
		size = xfs_sb_info[f + 1].offset - first;

		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);

		if (size == 1 || xfs_sb_info[f].type == 1) {
			if (dir > 0) {
				memcpy(mem_ptr + first, buf_ptr + first, size);
			} else {
				memcpy(buf_ptr + first, mem_ptr + first, size);
			}
		} else {
			switch (size) {
			case 2:
				INT_XLATE(*(__uint16_t*)(buf_ptr+first),
					  *(__uint16_t*)(mem_ptr+first),
					  dir, ARCH_CONVERT);
				break;
			case 4:
				INT_XLATE(*(__uint32_t*)(buf_ptr+first),
					  *(__uint32_t*)(mem_ptr+first),
					  dir, ARCH_CONVERT);
				break;
			case 8:
				INT_XLATE(*(__uint64_t*)(buf_ptr+first),
					  *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT);
				break;
			default:
				ASSERT(0);
			}
		}

		fields &= ~(1LL << f);
	}
}
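/*
 * Illustrative sketch (not part of the original source): translating a
 * single field from the in-core superblock into an on-disk buffer uses a
 * negative direction and a one-bit field mask, e.g.:
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), &mp->m_sb, -1, XFS_SB_UUID);
 *
 * while xfs_readsb() below goes the other way, with dir > 0 and
 * XFS_SB_ALL_BITS, to pull every field out of the just-read buffer.
 */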
/*
 * xfs_readsb
 *
 * Does the initial read of the superblock.
 */
int
xfs_readsb(xfs_mount_t *mp, int flags)
{
	unsigned int	sector_size;
	unsigned int	extra_flags;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;
	int		error;

	ASSERT(mp->m_sb_bp == NULL);
	ASSERT(mp->m_ddev_targp != NULL);

	/*
	 * Allocate a (locked) buffer to hold the superblock.
	 * This will be kept around at all times to optimize
	 * access to the superblock.
	 */
	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;

	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
				BTOBB(sector_size), extra_flags);
	if (!bp || XFS_BUF_ISERROR(bp)) {
		xfs_fs_mount_cmn_err(flags, "SB read failed");
		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
		goto fail;
	}
	ASSERT(XFS_BUF_ISBUSY(bp));
	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);

	/*
	 * Initialize the mount structure from the superblock.
	 * But first do some basic consistency checking.
	 */
	sbp = XFS_BUF_TO_SBP(bp);
	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);

	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
	if (error) {
		xfs_fs_mount_cmn_err(flags, "SB validate failed");
		goto fail;
	}

	/*
	 * We must be able to do sector-sized and sector-aligned IO.
	 */
	if (sector_size > mp->m_sb.sb_sectsize) {
		xfs_fs_mount_cmn_err(flags,
			"device supports only %u byte sectors (not %u)",
			sector_size, mp->m_sb.sb_sectsize);
		error = ENOSYS;
		goto fail;
	}

	/*
	 * If device sector size is smaller than the superblock size,
	 * re-read the superblock so the buffer is correctly sized.
	 */
	if (sector_size < mp->m_sb.sb_sectsize) {
		XFS_BUF_UNMANAGE(bp);
		xfs_buf_relse(bp);
		sector_size = mp->m_sb.sb_sectsize;
		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
					BTOBB(sector_size), extra_flags);
		if (!bp || XFS_BUF_ISERROR(bp)) {
			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
			goto fail;
		}
		ASSERT(XFS_BUF_ISBUSY(bp));
		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
	}

	/* Initialize per-cpu counters */
	xfs_icsb_reinit_counters(mp);

	mp->m_sb_bp = bp;
	xfs_buf_relse(bp);
	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
	return 0;

 fail:
	if (bp) {
		XFS_BUF_UNMANAGE(bp);
		xfs_buf_relse(bp);
	}
	return error;
}
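/*
 * Worked example (hypothetical devices, not part of the original source):
 * if the underlying device reports 512-byte sectors but the filesystem was
 * made with sb_sectsize == 4096, the first read above returns a 512-byte
 * buffer; the re-read path then releases it and reads a correctly sized
 * 4096-byte buffer.  The reverse case (device sectors larger than
 * sb_sectsize) fails with ENOSYS, since sector-aligned I/O is impossible.
 */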
/*
 * xfs_mount_common
 *
 * Mount initialization code establishing various mount
 * fields from the superblock associated with the given
 * mount structure
 */
STATIC void
xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
{
	int	i;

	mp->m_agfrotor = mp->m_agirotor = 0;
	spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
	mp->m_maxagi = mp->m_sb.sb_agcount;
	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	mp->m_litino = sbp->sb_inodesize -
		((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
	mp->m_blockmask = sbp->sb_blocksize - 1;
	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
	mp->m_blockwmask = mp->m_blockwsize - 1;
	INIT_LIST_HEAD(&mp->m_del_inodes);

	/*
	 * Setup for attributes, in case they get created.
	 * This value is for inodes getting attributes for the first time,
	 * the per-inode value is for old attribute values.
	 */
	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
	switch (sbp->sb_inodesize) {
	case 256:
		mp->m_attroffset = XFS_LITINO(mp) -
				   XFS_BMDR_SPACE_CALC(MINABTPTRS);
		break;
	case 512:
	case 1024:
	case 2048:
		mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
		break;
	default:
		ASSERT(0);
	}
	ASSERT(mp->m_attroffset < XFS_LITINO(mp));

	for (i = 0; i < 2; i++) {
		mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_alloc, i == 0);
		mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_alloc, i == 0);
	}
	for (i = 0; i < 2; i++) {
		mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_bmbt, i == 0);
		mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_bmbt, i == 0);
	}
	for (i = 0; i < 2; i++) {
		mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_inobt, i == 0);
		mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_inobt, i == 0);
	}

	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
					sbp->sb_inopblock);
	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
}
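/*
 * Worked example (hypothetical geometry, not part of the original source):
 * for a filesystem with 4096-byte blocks (sb_blocklog == 12) and 512-byte
 * inodes (sb_inodelog == 9, hence sb_inopblog == 3):
 *
 *	m_blkbit_log = 12 + XFS_NBBYLOG (3) = 15   bits per block
 *	m_blkbb_log  = 12 - BBSHIFT (9)     = 3    BBs per block
 *	m_agino_log  = sb_inopblog + sb_agblklog   bits in an AG inode number
 *
 * so with sb_agblklog == 20 (up to 2^20 blocks per AG), AG-relative inode
 * numbers fit in 23 bits.
 */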
/*
 * xfs_initialize_perag_data
 *
 * Read in each per-ag structure so we can count up the number of
 * allocated inodes, free inodes and used filesystem blocks as this
 * information is no longer persistent in the superblock. Once we have
 * this information, write it into the in-core superblock structure.
 */
STATIC int
xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
{
	xfs_agnumber_t	index;
	xfs_perag_t	*pag;
	xfs_sb_t	*sbp = &mp->m_sb;
	uint64_t	ifree = 0;
	uint64_t	ialloc = 0;
	uint64_t	bfree = 0;
	uint64_t	bfreelst = 0;
	uint64_t	btree = 0;
	int		error;
	int		s;

	for (index = 0; index < agcount; index++) {
		/*
		 * read the agf, then the agi. This gets us
		 * all the information we need and populates the
		 * per-ag structures for us.
		 */
		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
		if (error)
			return error;

		error = xfs_ialloc_pagi_init(mp, NULL, index);
		if (error)
			return error;
		pag = &mp->m_perag[index];
		ifree += pag->pagi_freecount;
		ialloc += pag->pagi_count;
		bfree += pag->pagf_freeblks;
		bfreelst += pag->pagf_flcount;
		btree += pag->pagf_btreeblks;
	}
	/*
	 * Overwrite incore superblock counters with just-read data
	 */
	s = XFS_SB_LOCK(mp);
	sbp->sb_ifree = ifree;
	sbp->sb_icount = ialloc;
	sbp->sb_fdblocks = bfree + bfreelst + btree;
	XFS_SB_UNLOCK(mp, s);

	/* Fixup the per-cpu counters as well. */
	xfs_icsb_reinit_counters(mp);

	return 0;
}
/*
 * xfs_mountfs
 *
 * This function does the following on an initial mount of a file system:
 *	- reads the superblock from disk and init the mount struct
 *	- if we're a 32-bit kernel, do a size check on the superblock
 *		so we don't mount terabyte filesystems
 *	- init mount struct realtime fields
 *	- allocate inode hash table for fs
 *	- init directory manager
 *	- perform recovery and init the log manager
 */
int
xfs_mountfs(
	bhv_vfs_t	*vfsp,
	xfs_mount_t	*mp,
	int		mfsi_flags)
{
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp = &(mp->m_sb);
	xfs_inode_t	*rip;
	bhv_vnode_t	*rvp = NULL;
	int		readio_log, writeio_log;
	xfs_daddr_t	d;
	__uint64_t	resblks;
	__int64_t	update_flags;
	uint		quotamount, quotaflags;
	int		agno;
	int		uuid_mounted = 0;
	int		error = 0;

	if (mp->m_sb_bp == NULL) {
		if ((error = xfs_readsb(mp, mfsi_flags))) {
			return error;
		}
	}
	xfs_mount_common(mp, sbp);

	/*
	 * Check if sb_agblocks is aligned at stripe boundary
	 * If sb_agblocks is NOT aligned turn off m_dalign since
	 * allocator alignment is within an ag, therefore ag has
	 * to be aligned at stripe boundary.
	 */
	update_flags = 0LL;
	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
		/*
		 * If stripe unit and stripe width are not multiples
		 * of the fs blocksize turn off alignment.
		 */
		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
			if (mp->m_flags & XFS_MOUNT_RETERR) {
				cmn_err(CE_WARN,
					"XFS: alignment check 1 failed");
				error = XFS_ERROR(EINVAL);
				goto error1;
			}
			mp->m_dalign = mp->m_swidth = 0;
		} else {
			/*
			 * Convert the stripe unit and width to FSBs.
			 */
			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
				if (mp->m_flags & XFS_MOUNT_RETERR) {
					error = XFS_ERROR(EINVAL);
					goto error1;
				}
				xfs_fs_cmn_err(CE_WARN, mp,
"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
					mp->m_dalign, mp->m_swidth,
					sbp->sb_agblocks);

				mp->m_dalign = 0;
				mp->m_swidth = 0;
			} else if (mp->m_dalign) {
				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
			} else {
				if (mp->m_flags & XFS_MOUNT_RETERR) {
					xfs_fs_cmn_err(CE_WARN, mp,
"stripe alignment turned off: sunit(%d) less than bsize(%d)",
						mp->m_dalign,
						mp->m_blockmask + 1);
					error = XFS_ERROR(EINVAL);
					goto error1;
				}
				mp->m_swidth = 0;
			}
		}

		/*
		 * Update superblock with new values
		 * and log changes
		 */
		if (XFS_SB_VERSION_HASDALIGN(sbp)) {
			if (sbp->sb_unit != mp->m_dalign) {
				sbp->sb_unit = mp->m_dalign;
				update_flags |= XFS_SB_UNIT;
			}
			if (sbp->sb_width != mp->m_swidth) {
				sbp->sb_width = mp->m_swidth;
				update_flags |= XFS_SB_WIDTH;
			}
		}
	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
		    XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
		mp->m_dalign = sbp->sb_unit;
		mp->m_swidth = sbp->sb_width;
	}

	xfs_alloc_compute_maxlevels(mp);
	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
	xfs_ialloc_compute_maxlevels(mp);
	if (sbp->sb_imax_pct) {
		__uint64_t	icount;

		/* Make sure the maximum inode count is a multiple of the
		 * units we allocate inodes in.
		 */

		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, mp->m_ialloc_blks);
		mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
				   sbp->sb_inopblog;
	} else
		mp->m_maxicount = 0;

	mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);

	/*
	 * XFS uses the uuid from the superblock as the unique
	 * identifier for fsid.  We can not use the uuid from the volume
	 * since a single partition filesystem is identical to a single
	 * partition volume/filesystem.
	 */
	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
	    (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
		__uint64_t	ret64;
		if (xfs_uuid_mount(mp)) {
			error = XFS_ERROR(EINVAL);
			goto error1;
		}
		uuid_mounted = 1;
		ret64 = uuid_hash64(&sbp->sb_uuid);
		memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64));
	}

	/*
	 * Set the default minimum read and write sizes unless
	 * already specified in a mount option.
	 * We use smaller I/O sizes when the file system
	 * is being used for NFS service (wsync mount option).
	 */
	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
		if (mp->m_flags & XFS_MOUNT_WSYNC) {
			readio_log = XFS_WSYNC_READIO_LOG;
			writeio_log = XFS_WSYNC_WRITEIO_LOG;
		} else {
			readio_log = XFS_READIO_LOG_LARGE;
			writeio_log = XFS_WRITEIO_LOG_LARGE;
		}
	} else {
		readio_log = mp->m_readio_log;
		writeio_log = mp->m_writeio_log;
	}

	/*
	 * Set the number of readahead buffers to use based on
	 * physical memory size.
	 */
	if (xfs_physmem <= 4096)		/* <= 16MB */
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
	else if (xfs_physmem <= 8192)		/* <= 32MB */
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
	else
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
	if (sbp->sb_blocklog > readio_log) {
		mp->m_readio_log = sbp->sb_blocklog;
	} else {
		mp->m_readio_log = readio_log;
	}
	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
	if (sbp->sb_blocklog > writeio_log) {
		mp->m_writeio_log = sbp->sb_blocklog;
	} else {
		mp->m_writeio_log = writeio_log;
	}
	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);

	/*
	 * Set the inode cluster size based on the physical memory
	 * size.  This may still be overridden by the file system
	 * block size if it is larger than the chosen cluster size.
	 */
	if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
		mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
	} else {
		mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
	}
	/*
	 * Set whether we're using inode alignment.
	 */
	if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
	    mp->m_sb.sb_inoalignmt >=
	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
	else
		mp->m_inoalign_mask = 0;
	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && mp->m_inoalign_mask &&
	    !(mp->m_dalign & mp->m_inoalign_mask))
		mp->m_sinoalign = mp->m_dalign;
	else
		mp->m_sinoalign = 0;
	/*
	 * Check that the data (and log if separate) are an ok size.
	 */
	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
		cmn_err(CE_WARN, "XFS: size check 1 failed");
		error = XFS_ERROR(E2BIG);
		goto error1;
	}
	error = xfs_read_buf(mp, mp->m_ddev_targp,
			     d - XFS_FSS_TO_BB(mp, 1),
			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
	if (!error) {
		xfs_buf_relse(bp);
	} else {
		cmn_err(CE_WARN, "XFS: size check 2 failed");
		if (error == ENOSPC) {
			error = XFS_ERROR(E2BIG);
		}
		goto error1;
	}

	if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
	    mp->m_logdev_targp != mp->m_ddev_targp) {
		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
			cmn_err(CE_WARN, "XFS: size check 3 failed");
			error = XFS_ERROR(E2BIG);
			goto error1;
		}
		error = xfs_read_buf(mp, mp->m_logdev_targp,
				     d - XFS_FSB_TO_BB(mp, 1),
				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
		if (!error) {
			xfs_buf_relse(bp);
		} else {
			cmn_err(CE_WARN, "XFS: size check 3 failed");
			if (error == ENOSPC) {
				error = XFS_ERROR(E2BIG);
			}
			goto error1;
		}
	}

	/*
	 * Initialize realtime fields in the mount structure
	 */
	if ((error = xfs_rtmount_init(mp))) {
		cmn_err(CE_WARN, "XFS: RT mount failed");
		goto error1;
	}

	/*
	 * For client case we are done now
	 */
	if (mfsi_flags & XFS_MFSI_CLIENT) {
		return 0;
	}

	/*
	 * Copies the low order bits of the timestamp and the randomly
	 * set "sequence" number out of a UUID.
	 */
	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);

	/*
	 * The vfs structure needs to have a file system independent
	 * way of checking for the invariant file system ID.  Since it
	 * can't look at mount structures it has a pointer to the data
	 * in the mount structure.
	 *
	 * File systems that don't support user level file handles (i.e.
	 * all of them except for XFS) will leave vfs_altfsid as NULL.
	 */
	vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
	mp->m_dmevmask = 0;	/* not persistent; set after each mount */

	xfs_dir_mount(mp);

	/*
	 * Initialize the attribute manager's entries.
	 */
	mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;

	/*
	 * Initialize the precomputed transaction reservations values.
	 */
	xfs_trans_init(mp);

	/*
	 * Allocate and initialize the inode hash table for this
	 * file system.
	 */
	xfs_ihash_init(mp);
	xfs_chash_init(mp);

	/*
	 * Allocate and initialize the per-ag data.
	 */
	init_rwsem(&mp->m_peraglock);
	mp->m_perag =
		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);

	mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount);

	/*
	 * log's mount-time initialization. Perform 1st part recovery if needed
	 */
	if (likely(sbp->sb_logblocks > 0)) {	/* check for volume case */
		error = xfs_log_mount(mp, mp->m_logdev_targp,
				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
		if (error) {
			cmn_err(CE_WARN, "XFS: log mount failed");
			goto error2;
		}
	} else {	/* No log has been defined */
		cmn_err(CE_WARN, "XFS: no log defined");
		XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
		error = XFS_ERROR(EFSCORRUPTED);
		goto error2;
	}

	/*
	 * Now the log is mounted, we know if it was an unclean shutdown or
	 * not. If it was, then with the first phase of recovery complete, we
	 * have consistent AG blocks on disk. We have not recovered EFIs yet,
	 * but they are recovered transactionally in the second recovery phase
	 * later.
	 *
	 * Hence we can safely re-initialise incore superblock counters from
	 * the per-ag data. These may not be correct if the filesystem was not
	 * cleanly unmounted, so we need to wait for recovery to finish before
	 * doing this.
	 *
	 * If the filesystem was cleanly unmounted, then we can trust the
	 * values in the superblock to be correct and we don't need to do
	 * anything here.
	 *
	 * If we are currently making the filesystem, the initialisation will
	 * fail as the perag data is in an undefined state.
	 */

	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
	    !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
	    !mp->m_sb.sb_inprogress) {
		error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
		if (error) {
			goto error2;
		}
	}
	/*
	 * Get and sanity-check the root inode.
	 * Save the pointer to it in the mount structure.
	 */
	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
	if (error) {
		cmn_err(CE_WARN, "XFS: failed to read root inode");
		goto error3;
	}

	ASSERT(rip != NULL);
	rvp = XFS_ITOV(rip);

	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
		cmn_err(CE_WARN, "XFS: corrupted root inode");
		cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
			XFS_BUFTARG_NAME(mp->m_ddev_targp),
			(unsigned long long)rip->i_ino);
		xfs_iunlock(rip, XFS_ILOCK_EXCL);
		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
				 mp);
		error = XFS_ERROR(EFSCORRUPTED);
		goto error4;
	}
	mp->m_rootip = rip;	/* save it */

	xfs_iunlock(rip, XFS_ILOCK_EXCL);
	/*
	 * Initialize realtime inode pointers in the mount structure
	 */
	if ((error = xfs_rtmount_inodes(mp))) {
		/*
		 * Free up the root inode.
		 */
		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
		goto error4;
	}

	/*
	 * If fs is not mounted readonly, then update the superblock
	 * unit and width changes.
	 */
	if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
		xfs_mount_log_sbunit(mp, update_flags);

	/*
	 * Initialise the XFS quota management subsystem for this mount
	 */
	if ((error = XFS_QM_INIT(mp, &quotamount, &quotaflags)))
		goto error4;

	/*
	 * Finish recovering the file system.  This part needed to be
	 * delayed until after the root and real-time bitmap inodes
	 * were consistently read in.
	 */
	error = xfs_log_mount_finish(mp, mfsi_flags);
	if (error) {
		cmn_err(CE_WARN, "XFS: log mount finish failed");
		goto error4;
	}

	/*
	 * Complete the quota initialisation, post-log-replay component.
	 */
	if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
		goto error4;

	/*
	 * Now we are mounted, reserve a small amount of unused space for
	 * privileged transactions. This is needed so that transaction
	 * space required for critical operations can dip into this pool
	 * when at ENOSPC. This is needed for operations like create with
	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
	 * are not allowed to use this reserved space.
	 *
	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
	 * This may drive us straight to ENOSPC on mount, but that implies
	 * we were already there on the last unmount.
	 */
	resblks = mp->m_sb.sb_dblocks;
	do_div(resblks, 20);
	resblks = min_t(__uint64_t, resblks, 1024);
	xfs_reserve_blocks(mp, &resblks, NULL);

	return 0;

 error4:
	/*
	 * Free up the root inode.
	 */
	VN_RELE(rvp);
 error3:
	xfs_log_unmount_dealloc(mp);
 error2:
	xfs_ihash_free(mp);
	xfs_chash_free(mp);
	for (agno = 0; agno < sbp->sb_agcount; agno++)
		if (mp->m_perag[agno].pagb_list)
			kmem_free(mp->m_perag[agno].pagb_list,
			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
	mp->m_perag = NULL;
	/* FALLTHROUGH */
 error1:
	if (uuid_mounted)
		xfs_uuid_unmount(mp);
	xfs_freesb(mp);
	return error;
}
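/*
 * Worked example (not part of the original source) for the reserve-pool
 * sizing near the end of xfs_mountfs() above: with sb_dblocks == 1048576,
 * dblocks / 20 == 52428, so resblks = min(52428, 1024) = 1024 blocks.
 * The 5% figure only wins on filesystems smaller than 20480 blocks.
 */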
1235 * 1236 * We don't bother correcting this elsewhere for lazy superblock 1237 * counting because on mount of an unclean filesystem we reconstruct the 1238 * correct counter value and this is irrelevant. 1239 * 1240 * For non-lazy counter filesystems, this doesn't matter at all because 1241 * we only every apply deltas to the superblock and hence the incore 1242 * value does not matter.... 1243 */ 1244 resblks = 0; 1245 xfs_reserve_blocks(mp, &resblks, NULL); 1246 1247 xfs_log_sbcount(mp, 1); 1248 xfs_unmountfs_writesb(mp); 1249 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1250 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1251 1252 xfs_freesb(mp); 1253 1254 /* 1255 * All inodes from this mount point should be freed. 1256 */ 1257 ASSERT(mp->m_inodes == NULL); 1258 1259 xfs_unmountfs_close(mp, cr); 1260 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1261 xfs_uuid_unmount(mp); 1262 1263 #if defined(DEBUG) || defined(INDUCE_IO_ERROR) 1264 /* 1265 * clear all error tags on this filesystem 1266 */ 1267 memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t)); 1268 xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0); 1269 #endif 1270 XFS_IODONE(vfsp); 1271 xfs_mount_free(mp, 1); 1272 return 0; 1273 } 1274 1275 void 1276 xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) 1277 { 1278 if (mp->m_logdev_targp != mp->m_ddev_targp) 1279 xfs_free_buftarg(mp->m_logdev_targp, 1); 1280 if (mp->m_rtdev_targp) 1281 xfs_free_buftarg(mp->m_rtdev_targp, 1); 1282 xfs_free_buftarg(mp->m_ddev_targp, 0); 1283 } 1284 1285 STATIC void 1286 xfs_unmountfs_wait(xfs_mount_t *mp) 1287 { 1288 if (mp->m_logdev_targp != mp->m_ddev_targp) 1289 xfs_wait_buftarg(mp->m_logdev_targp); 1290 if (mp->m_rtdev_targp) 1291 xfs_wait_buftarg(mp->m_rtdev_targp); 1292 xfs_wait_buftarg(mp->m_ddev_targp); 1293 } 1294 1295 int 1296 xfs_fs_writable(xfs_mount_t *mp) 1297 { 1298 bhv_vfs_t *vfsp = XFS_MTOVFS(mp); 1299 1300 return !(vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || 1301 (vfsp->vfs_flag & VFS_RDONLY)); 1302 } 1303 1304 /* 1305 * xfs_log_sbcount 1306 * 1307 * Called either periodically to keep the on disk superblock values 1308 * roughly up to date or from unmount to make sure the values are 1309 * correct on a clean unmount. 1310 * 1311 * Note this code can be called during the process of freezing, so 1312 * we may need to use the transaction allocator which does not not 1313 * block when the transaction subsystem is in its frozen state. 1314 */ 1315 int 1316 xfs_log_sbcount( 1317 xfs_mount_t *mp, 1318 uint sync) 1319 { 1320 xfs_trans_t *tp; 1321 int error; 1322 1323 if (!xfs_fs_writable(mp)) 1324 return 0; 1325 1326 xfs_icsb_sync_counters(mp); 1327 1328 /* 1329 * we don't need to do this if we are updating the superblock 1330 * counters on every modification. 1331 */ 1332 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1333 return 0; 1334 1335 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); 1336 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, 1337 XFS_DEFAULT_LOG_COUNT); 1338 if (error) { 1339 xfs_trans_cancel(tp, 0); 1340 return error; 1341 } 1342 1343 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); 1344 if (sync) 1345 xfs_trans_set_sync(tp); 1346 xfs_trans_commit(tp, 0); 1347 1348 return 0; 1349 } 1350 1351 int 1352 xfs_unmountfs_writesb(xfs_mount_t *mp) 1353 { 1354 xfs_buf_t *sbp; 1355 xfs_sb_t *sb; 1356 int error = 0; 1357 1358 /* 1359 * skip superblock write if fs is read-only, or 1360 * if we are doing a forced umount. 
int
xfs_unmountfs_writesb(xfs_mount_t *mp)
{
	xfs_buf_t	*sbp;
	xfs_sb_t	*sb;
	int		error = 0;

	/*
	 * skip superblock write if fs is read-only, or
	 * if we are doing a forced umount.
	 */
	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
		XFS_FORCED_SHUTDOWN(mp))) {

		sbp = xfs_getsb(mp, 0);
		sb = XFS_BUF_TO_SBP(sbp);

		/*
		 * mark shared-readonly if desired
		 */
		if (mp->m_mk_sharedro) {
			if (!(sb->sb_flags & XFS_SBF_READONLY))
				sb->sb_flags |= XFS_SBF_READONLY;
			if (!XFS_SB_VERSION_HASSHARED(sb))
				XFS_SB_VERSION_ADDSHARED(sb);
			xfs_fs_cmn_err(CE_NOTE, mp,
				"Unmounting, marking shared read-only");
		}

		XFS_BUF_UNDONE(sbp);
		XFS_BUF_UNREAD(sbp);
		XFS_BUF_UNDELAYWRITE(sbp);
		XFS_BUF_WRITE(sbp);
		XFS_BUF_UNASYNC(sbp);
		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
		xfsbdstrat(mp, sbp);
		/* Nevermind errors we might get here. */
		error = xfs_iowait(sbp);
		if (error)
			xfs_ioerror_alert("xfs_unmountfs_writesb",
					  mp, sbp, XFS_BUF_ADDR(sbp));
		if (error && mp->m_mk_sharedro)
			xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
		xfs_buf_relse(sbp);
	}
	return error;
}

/*
 * xfs_mod_sb() can be used to copy arbitrary changes to the
 * in-core superblock into the superblock buffer to be logged.
 * It does not provide the higher level of locking that is
 * needed to protect the in-core superblock from concurrent
 * access.
 */
void
xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
{
	xfs_buf_t	*bp;
	int		first;
	int		last;
	xfs_mount_t	*mp;
	xfs_sb_t	*sbp;
	xfs_sb_field_t	f;

	ASSERT(fields);
	if (!fields)
		return;
	mp = tp->t_mountp;
	bp = xfs_trans_getsb(tp, mp, 0);
	sbp = XFS_BUF_TO_SBP(bp);
	first = sizeof(xfs_sb_t);
	last = 0;

	/* translate/copy */

	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);

	/* find modified range */

	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
	first = xfs_sb_info[f].offset;

	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
	last = xfs_sb_info[f + 1].offset - 1;

	xfs_trans_log_buf(tp, bp, first, last);
}
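/*
 * Worked example (not part of the original source) for the range logic in
 * xfs_mod_sb() above: with fields == (XFS_SB_ICOUNT | XFS_SB_IFREE), the
 * lowest set bit selects sb_icount, so first = offsetof(xfs_sb_t, sb_icount);
 * the highest selects sb_ifree, so last = offsetof(xfs_sb_t, sb_fdblocks) - 1,
 * i.e. exactly the 16 bytes covering the two 64-bit counters get logged.
 */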
/*
 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
 * a delta to a specified field in the in-core superblock.  Simply
 * switch on the field indicated and apply the delta to that field.
 * Fields are not allowed to dip below zero, so if the delta would
 * do this do not apply it and return EINVAL.
 *
 * The SB_LOCK must be held when this routine is called.
 */
int
xfs_mod_incore_sb_unlocked(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int64_t		delta,
	int		rsvd)
{
	int		scounter;	/* short counter for 32 bit fields */
	long long	lcounter;	/* long counter for 64 bit fields */
	long long	res_used, rem;

	/*
	 * With the in-core superblock spin lock held, switch
	 * on the indicated field.  Apply the delta to the
	 * proper field.  If the field's value would dip below
	 * 0, then do not apply the delta and return EINVAL.
	 */
	switch (field) {
	case XFS_SBS_ICOUNT:
		lcounter = (long long)mp->m_sb.sb_icount;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_icount = lcounter;
		return 0;
	case XFS_SBS_IFREE:
		lcounter = (long long)mp->m_sb.sb_ifree;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_ifree = lcounter;
		return 0;
	case XFS_SBS_FDBLOCKS:
		lcounter = (long long)
			mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);

		if (delta > 0) {		/* Putting blocks back */
			if (res_used > delta) {
				mp->m_resblks_avail += delta;
			} else {
				rem = delta - res_used;
				mp->m_resblks_avail = mp->m_resblks;
				lcounter += rem;
			}
		} else {			/* Taking blocks away */

			lcounter += delta;

			/*
			 * If we're out of blocks, use any available reserved
			 * blocks if we're allowed to.
			 */

			if (lcounter < 0) {
				if (rsvd) {
					lcounter = (long long)mp->m_resblks_avail + delta;
					if (lcounter < 0) {
						return XFS_ERROR(ENOSPC);
					}
					mp->m_resblks_avail = lcounter;
					return 0;
				} else {	/* not reserved */
					return XFS_ERROR(ENOSPC);
				}
			}
		}

		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
		return 0;
	case XFS_SBS_FREXTENTS:
		lcounter = (long long)mp->m_sb.sb_frextents;
		lcounter += delta;
		if (lcounter < 0) {
			return XFS_ERROR(ENOSPC);
		}
		mp->m_sb.sb_frextents = lcounter;
		return 0;
	case XFS_SBS_DBLOCKS:
		lcounter = (long long)mp->m_sb.sb_dblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_dblocks = lcounter;
		return 0;
	case XFS_SBS_AGCOUNT:
		scounter = mp->m_sb.sb_agcount;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_agcount = scounter;
		return 0;
	case XFS_SBS_IMAX_PCT:
		scounter = mp->m_sb.sb_imax_pct;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_imax_pct = scounter;
		return 0;
	case XFS_SBS_REXTSIZE:
		scounter = mp->m_sb.sb_rextsize;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextsize = scounter;
		return 0;
	case XFS_SBS_RBMBLOCKS:
		scounter = mp->m_sb.sb_rbmblocks;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rbmblocks = scounter;
		return 0;
	case XFS_SBS_RBLOCKS:
		lcounter = (long long)mp->m_sb.sb_rblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rblocks = lcounter;
		return 0;
	case XFS_SBS_REXTENTS:
		lcounter = (long long)mp->m_sb.sb_rextents;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextents = lcounter;
		return 0;
	case XFS_SBS_REXTSLOG:
		scounter = mp->m_sb.sb_rextslog;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextslog = scounter;
		return 0;
	default:
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}
}
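/*
 * Worked example (not part of the original source) for the XFS_SBS_FDBLOCKS
 * case above: suppose m_resblks == 1024 and m_resblks_avail == 1000, so
 * res_used == 24.  Freeing 100 blocks (delta == +100) first refills the
 * reserve pool (m_resblks_avail -> 1024) and adds the remaining 76 to the
 * free count.  Taking blocks away with rsvd set may dip into
 * m_resblks_avail once the free count would go negative; without rsvd the
 * caller just gets ENOSPC.
 */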
/*
 * xfs_mod_incore_sb() is used to change a field in the in-core
 * superblock structure by the specified delta.  This modification
 * is protected by the SB_LOCK.  Just use the xfs_mod_incore_sb_unlocked()
 * routine to do the work.
 */
int
xfs_mod_incore_sb(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int64_t		delta,
	int		rsvd)
{
	unsigned long	s;
	int		status;

	/* check for per-cpu counters */
	switch (field) {
#ifdef HAVE_PERCPU_SB
	case XFS_SBS_ICOUNT:
	case XFS_SBS_IFREE:
	case XFS_SBS_FDBLOCKS:
		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
			status = xfs_icsb_modify_counters(mp, field,
							delta, rsvd);
			break;
		}
		/* FALLTHROUGH */
#endif
	default:
		s = XFS_SB_LOCK(mp);
		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
		XFS_SB_UNLOCK(mp, s);
		break;
	}

	return status;
}

/*
 * xfs_mod_incore_sb_batch() is used to change more than one field
 * in the in-core superblock structure at a time.  This modification
 * is protected by a lock internal to this module.  The fields and
 * changes to those fields are specified in the array of xfs_mod_sb
 * structures passed in.
 *
 * Either all of the specified deltas will be applied or none of
 * them will.  If any modified field dips below 0, then all modifications
 * will be backed out and EINVAL will be returned.
 */
int
xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
{
	unsigned long	s;
	int		status = 0;
	xfs_mod_sb_t	*msbp;

	/*
	 * Loop through the array of mod structures and apply each
	 * individually.  If any fail, then back out all those
	 * which have already been applied.  Do all of this within
	 * the scope of the SB_LOCK so that all of the changes will
	 * be atomic.
	 */
	s = XFS_SB_LOCK(mp);
	msbp = &msb[0];
	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
		/*
		 * Apply the delta at index n.  If it fails, break
		 * from the loop so we'll fall into the undo loop
		 * below.
		 */
		switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
		case XFS_SBS_ICOUNT:
		case XFS_SBS_IFREE:
		case XFS_SBS_FDBLOCKS:
			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
				XFS_SB_UNLOCK(mp, s);
				status = xfs_icsb_modify_counters(mp,
							msbp->msb_field,
							msbp->msb_delta, rsvd);
				s = XFS_SB_LOCK(mp);
				break;
			}
			/* FALLTHROUGH */
#endif
		default:
			status = xfs_mod_incore_sb_unlocked(mp,
						msbp->msb_field,
						msbp->msb_delta, rsvd);
			break;
		}

		if (status != 0) {
			break;
		}
	}

	/*
	 * If we didn't complete the loop above, then back out
	 * any changes made to the superblock.  If you add code
	 * between the loop above and here, make sure that you
	 * preserve the value of status. Loop back until
	 * we step below the beginning of the array.  Make sure
	 * we don't touch anything back there.
	 */
	if (status != 0) {
		msbp--;
		while (msbp >= msb) {
			switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
			case XFS_SBS_ICOUNT:
			case XFS_SBS_IFREE:
			case XFS_SBS_FDBLOCKS:
				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
					XFS_SB_UNLOCK(mp, s);
					status = xfs_icsb_modify_counters(mp,
							msbp->msb_field,
							-(msbp->msb_delta),
							rsvd);
					s = XFS_SB_LOCK(mp);
					break;
				}
				/* FALLTHROUGH */
#endif
			default:
				status = xfs_mod_incore_sb_unlocked(mp,
							msbp->msb_field,
							-(msbp->msb_delta),
							rsvd);
				break;
			}
			ASSERT(status == 0);
			msbp--;
		}
	}
	XFS_SB_UNLOCK(mp, s);
	return status;
}
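/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller applying two counter deltas atomically with the routine above:
 *
 *	xfs_mod_sb_t	msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_ICOUNT;
 *	msb[0].msb_delta = 64;		// e.g. a new inode chunk
 *	msb[1].msb_field = XFS_SBS_IFREE;
 *	msb[1].msb_delta = 64;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
 *
 * Per the comment above the function, if either delta would drive its
 * field below zero, both are backed out and an error is returned.
 */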
/*
 * xfs_getsb() is called to obtain the buffer for the superblock.
 * The buffer is returned locked and read in from disk.
 * The buffer should be released with a call to xfs_brelse().
 *
 * If the flags parameter is BUF_TRYLOCK, then we'll only return
 * the superblock buffer if it can be locked without sleeping.
 * If it can't then we'll return NULL.
 */
xfs_buf_t *
xfs_getsb(
	xfs_mount_t	*mp,
	int		flags)
{
	xfs_buf_t	*bp;

	ASSERT(mp->m_sb_bp != NULL);
	bp = mp->m_sb_bp;
	if (flags & XFS_BUF_TRYLOCK) {
		if (!XFS_BUF_CPSEMA(bp)) {
			return NULL;
		}
	} else {
		XFS_BUF_PSEMA(bp, PRIBIO);
	}
	XFS_BUF_HOLD(bp);
	ASSERT(XFS_BUF_ISDONE(bp));
	return bp;
}

/*
 * Used to free the superblock along various error paths.
 */
void
xfs_freesb(
	xfs_mount_t	*mp)
{
	xfs_buf_t	*bp;

	/*
	 * Use xfs_getsb() so that the buffer will be locked
	 * when we call xfs_buf_relse().
	 */
	bp = xfs_getsb(mp, 0);
	XFS_BUF_UNMANAGE(bp);
	xfs_buf_relse(bp);
	mp->m_sb_bp = NULL;
}

/*
 * See if the UUID is unique among mounted XFS filesystems.
 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
 */
STATIC int
xfs_uuid_mount(
	xfs_mount_t	*mp)
{
	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
		cmn_err(CE_WARN,
			"XFS: Filesystem %s has nil UUID - can't mount",
			mp->m_fsname);
		return -1;
	}
	if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
		cmn_err(CE_WARN,
			"XFS: Filesystem %s has duplicate UUID - can't mount",
			mp->m_fsname);
		return -1;
	}
	return 0;
}

/*
 * Remove filesystem from the UUID table.
 */
STATIC void
xfs_uuid_unmount(
	xfs_mount_t	*mp)
{
	uuid_table_remove(&mp->m_sb.sb_uuid);
}

/*
 * Used to log changes to the superblock unit and width fields which could
 * be altered by the mount options. Only the first superblock is updated.
 */
STATIC void
xfs_mount_log_sbunit(
	xfs_mount_t	*mp,
	__int64_t	fields)
{
	xfs_trans_t	*tp;

	ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
				XFS_DEFAULT_LOG_COUNT)) {
		xfs_trans_cancel(tp, 0);
		return;
	}
	xfs_mod_sb(tp, fields);
	xfs_trans_commit(tp, 0);
}


#ifdef HAVE_PERCPU_SB
/*
 * Per-cpu incore superblock counters
 *
 * Simple concept, difficult implementation
 *
 * Basically, replace the incore superblock counters with a distributed per cpu
 * counter for contended fields (e.g. free block count).
 *
 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
 * hence needs to be accurately read when we are running low on space. Hence
 * there is a method to enable and disable the per-cpu counters based on how
 * much "stuff" is available in them.
 *
 * Basically, a counter is enabled if there is enough free resource to justify
 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
 * ENOSPC), then we disable the counters to synchronise all callers and
 * re-distribute the available resources.
 *
 * If, once we redistributed the available resources, we still get a failure,
 * we disable the per-cpu counter and go through the slow path.
 *
 * The slow path is the current xfs_mod_incore_sb() function.  This means that
 * when we disable a per-cpu counter, we need to drain its resources back to
 * the global superblock. We do this after disabling the counter to prevent
 * more threads from queueing up on the counter.
 *
 * Essentially, this means that we still need a lock in the fast path to enable
 * synchronisation between the global counters and the per-cpu counters. This
 * is not a problem because the lock will be local to a CPU almost all the time
 * and have little contention except when we get to ENOSPC conditions.
 *
 * Basically, this lock becomes a barrier that enables us to lock out the fast
 * path while we do things like enabling and disabling counters and
 * synchronising the counters.
 *
 * Locking rules:
 *
 *	1. XFS_SB_LOCK() before picking up per-cpu locks
 *	2. per-cpu locks always picked up via for_each_online_cpu() order
 *	3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
 *	4. modifying per-cpu counters requires holding per-cpu lock
 *	5. modifying global counters requires holding XFS_SB_LOCK
 *	6. enabling or disabling a counter requires holding the XFS_SB_LOCK
 *	   and _none_ of the per-cpu locks.
 *
 * Disabled counters are only ever re-enabled by a balance operation
 * that results in more free resources per CPU than a given threshold.
 * To ensure counters don't remain disabled, they are rebalanced when
 * the global resource goes above a higher threshold (i.e. some hysteresis
 * is present to prevent thrashing).
 */
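/*
 * Illustrative sketch (not part of the original source) of the fast path
 * described above, roughly how xfs_icsb_modify_counters() (declared at the
 * top of this file) behaves; error handling and rebalancing are omitted:
 *
 *	preempt_disable();
 *	icsbp = this CPU's xfs_icsb_cnts_t from mp->m_sb_cnts;
 *	xfs_icsb_lock_cntr(icsbp);
 *	if (counter is disabled)
 *		-> unlock and fall back to xfs_mod_incore_sb() slow path
 *	apply delta to icsbp->icsb_{icount,ifree,fdblocks};
 *	if (the result would go negative)
 *		-> unlock, rebalance or disable the counter, then retry
 *	xfs_icsb_unlock_cntr(icsbp);
 *	preempt_enable();
 */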

#ifdef CONFIG_HOTPLUG_CPU
/*
 * hot-plug CPU notifier support.
 *
 * We need a notifier per filesystem as we need to be able to identify
 * the filesystem to balance the counters out. This is achieved by
 * having a notifier block embedded in the xfs_mount_t and doing pointer
 * magic to get the mount pointer from the notifier block address.
 */
STATIC int
xfs_icsb_cpu_notify(
        struct notifier_block *nfb,
        unsigned long action,
        void *hcpu)
{
        xfs_icsb_cnts_t *cntp;
        xfs_mount_t     *mp;
        int             s;

        mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
        cntp = (xfs_icsb_cnts_t *)
                        per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                /* Easy Case - initialize the area and locks, and
                 * then rebalance when online does everything else for us. */
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                xfs_icsb_lock(mp);
                xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
                xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
                xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
                xfs_icsb_unlock(mp);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /* Disable all the counters, then fold the dead cpu's
                 * count into the total on the global superblock and
                 * re-enable the counters. */
                xfs_icsb_lock(mp);
                s = XFS_SB_LOCK(mp);
                xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
                xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
                xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);

                mp->m_sb.sb_icount += cntp->icsb_icount;
                mp->m_sb.sb_ifree += cntp->icsb_ifree;
                mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;

                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));

                xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT,
                                         XFS_ICSB_SB_LOCKED, 0);
                xfs_icsb_balance_counter(mp, XFS_SBS_IFREE,
                                         XFS_ICSB_SB_LOCKED, 0);
                xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS,
                                         XFS_ICSB_SB_LOCKED, 0);
                XFS_SB_UNLOCK(mp, s);
                xfs_icsb_unlock(mp);
                break;
        }

        return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

int
xfs_icsb_init_counters(
        xfs_mount_t     *mp)
{
        xfs_icsb_cnts_t *cntp;
        int             i;

        mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
        if (mp->m_sb_cnts == NULL)
                return -ENOMEM;

#ifdef CONFIG_HOTPLUG_CPU
        mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
        mp->m_icsb_notifier.priority = 0;
        register_hotcpu_notifier(&mp->m_icsb_notifier);
#endif /* CONFIG_HOTPLUG_CPU */

        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
        }

        mutex_init(&mp->m_icsb_mutex);

        /*
         * start with all counters disabled so that the
         * initial balance kicks us off correctly
         */
        mp->m_icsb_counters = -1;
        return 0;
}

void
xfs_icsb_reinit_counters(
        xfs_mount_t     *mp)
{
        xfs_icsb_lock(mp);
        /*
         * start with all counters disabled so that the
         * initial balance kicks us off correctly
         */
        mp->m_icsb_counters = -1;
        xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
        xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
        xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
        xfs_icsb_unlock(mp);
}

STATIC void
xfs_icsb_destroy_counters(
        xfs_mount_t     *mp)
{
        if (mp->m_sb_cnts) {
                unregister_hotcpu_notifier(&mp->m_icsb_notifier);
                free_percpu(mp->m_sb_cnts);
        }
        mutex_destroy(&mp->m_icsb_mutex);
}
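/*
 * Illustrative sketch (editor's addition, never compiled): the expected
 * life cycle of the per-cpu counters for a hypothetical mount/unmount
 * path.  The function name and error handling are made up for
 * illustration only.
 */
#if 0
STATIC int
example_counter_lifecycle(
        xfs_mount_t     *mp)
{
        int     error;

        error = xfs_icsb_init_counters(mp);     /* at mount-struct init */
        if (error)
                return error;   /* caller falls back to global counters */

        /* ... read the on-disk superblock into mp->m_sb ... */

        xfs_icsb_reinit_counters(mp);   /* distribute sb counts per cpu */

        /* ... and at unmount: */
        xfs_icsb_destroy_counters(mp);
        return 0;
}
#endif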

STATIC_INLINE void
xfs_icsb_lock_cntr(
        xfs_icsb_cnts_t *icsbp)
{
        while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
                ndelay(1000);
        }
}

STATIC_INLINE void
xfs_icsb_unlock_cntr(
        xfs_icsb_cnts_t *icsbp)
{
        clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
}


STATIC_INLINE void
xfs_icsb_lock_all_counters(
        xfs_mount_t     *mp)
{
        xfs_icsb_cnts_t *cntp;
        int             i;

        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                xfs_icsb_lock_cntr(cntp);
        }
}

STATIC_INLINE void
xfs_icsb_unlock_all_counters(
        xfs_mount_t     *mp)
{
        xfs_icsb_cnts_t *cntp;
        int             i;

        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                xfs_icsb_unlock_cntr(cntp);
        }
}

STATIC void
xfs_icsb_count(
        xfs_mount_t     *mp,
        xfs_icsb_cnts_t *cnt,
        int             flags)
{
        xfs_icsb_cnts_t *cntp;
        int             i;

        memset(cnt, 0, sizeof(xfs_icsb_cnts_t));

        if (!(flags & XFS_ICSB_LAZY_COUNT))
                xfs_icsb_lock_all_counters(mp);

        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                cnt->icsb_icount += cntp->icsb_icount;
                cnt->icsb_ifree += cntp->icsb_ifree;
                cnt->icsb_fdblocks += cntp->icsb_fdblocks;
        }

        if (!(flags & XFS_ICSB_LAZY_COUNT))
                xfs_icsb_unlock_all_counters(mp);
}
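/*
 * Illustrative sketch (editor's addition, never compiled): the two ways
 * a hypothetical caller can sum the per-cpu counters.  With
 * XFS_ICSB_LAZY_COUNT the per-cpu locks are skipped, so the total may
 * race with concurrent modifications - acceptable for approximate
 * statistics.  With flags of 0 every per-cpu lock is taken and the
 * total is exact.
 */
#if 0
        xfs_icsb_cnts_t cnt;

        xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); /* fast, approximate */
        xfs_icsb_count(mp, &cnt, 0);                    /* locked, exact */
#endif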

STATIC int
xfs_icsb_counter_disabled(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field)
{
        ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
        return test_bit(field, &mp->m_icsb_counters);
}

STATIC int
xfs_icsb_disable_counter(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field)
{
        xfs_icsb_cnts_t cnt;

        ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

        /*
         * If we are already disabled, then there is nothing to do
         * here. We check before locking all the counters to avoid
         * the expensive lock operation when being called in the
         * slow path and the counter is already disabled. This is
         * safe because the only time we set or clear this state is under
         * the m_icsb_mutex.
         */
        if (xfs_icsb_counter_disabled(mp, field))
                return 0;

        xfs_icsb_lock_all_counters(mp);
        if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
                /* drain back to superblock */

                xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
                switch (field) {
                case XFS_SBS_ICOUNT:
                        mp->m_sb.sb_icount = cnt.icsb_icount;
                        break;
                case XFS_SBS_IFREE:
                        mp->m_sb.sb_ifree = cnt.icsb_ifree;
                        break;
                case XFS_SBS_FDBLOCKS:
                        mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
                        break;
                default:
                        BUG();
                }
        }

        xfs_icsb_unlock_all_counters(mp);

        return 0;
}

STATIC void
xfs_icsb_enable_counter(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
        uint64_t        count,
        uint64_t        resid)
{
        xfs_icsb_cnts_t *cntp;
        int             i;

        ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

        xfs_icsb_lock_all_counters(mp);
        for_each_online_cpu(i) {
                cntp = per_cpu_ptr(mp->m_sb_cnts, i);
                switch (field) {
                case XFS_SBS_ICOUNT:
                        cntp->icsb_icount = count + resid;
                        break;
                case XFS_SBS_IFREE:
                        cntp->icsb_ifree = count + resid;
                        break;
                case XFS_SBS_FDBLOCKS:
                        cntp->icsb_fdblocks = count + resid;
                        break;
                default:
                        BUG();
                        break;
                }
                resid = 0;
        }
        clear_bit(field, &mp->m_icsb_counters);
        xfs_icsb_unlock_all_counters(mp);
}

void
xfs_icsb_sync_counters_flags(
        xfs_mount_t     *mp,
        int             flags)
{
        xfs_icsb_cnts_t cnt;
        int             s;

        /* lock the incore superblock unless the caller already holds it */
        if ((flags & XFS_ICSB_SB_LOCKED) == 0)
                s = XFS_SB_LOCK(mp);

        xfs_icsb_count(mp, &cnt, flags);

        /* fold the per-cpu totals into the mp->m_sb fields */
        if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
                mp->m_sb.sb_icount = cnt.icsb_icount;
        if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
                mp->m_sb.sb_ifree = cnt.icsb_ifree;
        if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
                mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;

        if ((flags & XFS_ICSB_SB_LOCKED) == 0)
                XFS_SB_UNLOCK(mp, s);
}

/*
 * Accurate update of per-cpu counters to incore superblock
 */
STATIC void
xfs_icsb_sync_counters(
        xfs_mount_t     *mp)
{
        xfs_icsb_sync_counters_flags(mp, 0);
}

/*
 * Balance and enable/disable counters as necessary.
 *
 * Thresholds for re-enabling counters are somewhat magic.  Inode counts are
 * chosen to match a single on-disk inode allocation chunk per CPU, and the
 * free block count needs to be far enough from zero that we won't thrash
 * when we get near ENOSPC. We also need to supply a minimum we require per
 * cpu to prevent looping endlessly when xfs_alloc_space asks for more than
 * will be distributed to a single CPU but each CPU has enough blocks to be
 * reenabled.
 *
 * Note that we can be called when counters are already disabled.
 * xfs_icsb_disable_counter() optimises the counter locking in this case to
 * prevent locking every per-cpu counter needlessly.
 */
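/*
 * Worked example (editor's addition): with 4 CPUs online and
 * mp->m_sb.sb_fdblocks == 1023, the balance below computes
 * count = 1023 / 4 = 255 and resid = 1023 % 4 = 3; do_div() divides
 * in place and returns the remainder.  Each CPU's counter is set to
 * 255 and the first CPU also receives the residual 3, so the
 * distributed counters still sum to 1023.  Had the per-CPU share
 * fallen below max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)), the
 * counter would have been left disabled instead.
 */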

#define XFS_ICSB_INO_CNTR_REENABLE      (uint64_t)64
#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
                (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
STATIC void
xfs_icsb_balance_counter(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
        int             flags,
        int             min_per_cpu)
{
        uint64_t        count, resid;
        int             weight = num_online_cpus();
        int             s;
        uint64_t        min = (uint64_t)min_per_cpu;

        if (!(flags & XFS_ICSB_SB_LOCKED))
                s = XFS_SB_LOCK(mp);

        /* disable counter and sync counter */
        xfs_icsb_disable_counter(mp, field);

        /* update counters - first CPU gets residual */
        switch (field) {
        case XFS_SBS_ICOUNT:
                count = mp->m_sb.sb_icount;
                resid = do_div(count, weight);
                if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
                        goto out;
                break;
        case XFS_SBS_IFREE:
                count = mp->m_sb.sb_ifree;
                resid = do_div(count, weight);
                if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
                        goto out;
                break;
        case XFS_SBS_FDBLOCKS:
                count = mp->m_sb.sb_fdblocks;
                resid = do_div(count, weight);
                if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
                        goto out;
                break;
        default:
                BUG();
                count = resid = 0;      /* quiet, gcc */
                break;
        }

        xfs_icsb_enable_counter(mp, field, count, resid);
out:
        if (!(flags & XFS_ICSB_SB_LOCKED))
                XFS_SB_UNLOCK(mp, s);
}
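/*
 * Illustrative sketch (editor's addition, never compiled): the two ways
 * min_per_cpu is used by callers in this file.  Routine rebalancing
 * (e.g. when a CPU comes online) passes 0 and lets the REENABLE
 * thresholds alone decide; the retry path after a per-cpu counter runs
 * dry passes the delta it failed to apply, so the counter is only
 * re-enabled when each CPU's share clears that amount as well.
 * 'needed' is a hypothetical stand-in for such a delta.
 */
#if 0
        xfs_icsb_lock(mp);
        xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);      /* routine */
        xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, needed); /* retry */
        xfs_icsb_unlock(mp);
#endif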

int
xfs_icsb_modify_counters(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
        int64_t         delta,
        int             rsvd)
{
        xfs_icsb_cnts_t *icsbp;
        long long       lcounter;       /* long counter for 64 bit fields */
        int             cpu, ret = 0, s;

        might_sleep();
again:
        cpu = get_cpu();
        icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);

        /*
         * if the counter is disabled, go to slow path
         */
        if (unlikely(xfs_icsb_counter_disabled(mp, field)))
                goto slow_path;
        xfs_icsb_lock_cntr(icsbp);
        if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
                xfs_icsb_unlock_cntr(icsbp);
                goto slow_path;
        }

        switch (field) {
        case XFS_SBS_ICOUNT:
                lcounter = icsbp->icsb_icount;
                lcounter += delta;
                if (unlikely(lcounter < 0))
                        goto balance_counter;
                icsbp->icsb_icount = lcounter;
                break;

        case XFS_SBS_IFREE:
                lcounter = icsbp->icsb_ifree;
                lcounter += delta;
                if (unlikely(lcounter < 0))
                        goto balance_counter;
                icsbp->icsb_ifree = lcounter;
                break;

        case XFS_SBS_FDBLOCKS:
                BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);

                lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
                lcounter += delta;
                if (unlikely(lcounter < 0))
                        goto balance_counter;
                icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
                break;
        default:
                BUG();
                break;
        }
        xfs_icsb_unlock_cntr(icsbp);
        put_cpu();
        return 0;

slow_path:
        put_cpu();

        /*
         * serialise with a mutex so we don't burn lots of cpu on
         * the superblock lock. We still need to hold the superblock
         * lock, however, when we modify the global structures.
         */
        xfs_icsb_lock(mp);

        /*
         * Now running atomically.
         *
         * If the counter is enabled, someone has beaten us to rebalancing.
         * Drop the lock and try again in the fast path....
         */
        if (!(xfs_icsb_counter_disabled(mp, field))) {
                xfs_icsb_unlock(mp);
                goto again;
        }

        /*
         * The counter is currently disabled. Because we are
         * running atomically here, we know a rebalance cannot
         * be in progress. Hence we can go straight to operating
         * on the global superblock. We do not call xfs_mod_incore_sb()
         * here even though we need to get the SB_LOCK. Doing so
         * will cause us to re-enter this function and deadlock.
         * Hence we get the SB_LOCK ourselves and then call
         * xfs_mod_incore_sb_unlocked() as the unlocked path operates
         * directly on the global counters.
         */
        s = XFS_SB_LOCK(mp);
        ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
        XFS_SB_UNLOCK(mp, s);

        /*
         * Now that we've modified the global superblock, we
         * may be able to re-enable the distributed counters
         * (e.g. lots of space just got freed). After that
         * we are done.
         */
        if (ret != ENOSPC)
                xfs_icsb_balance_counter(mp, field, 0, 0);
        xfs_icsb_unlock(mp);
        return ret;

balance_counter:
        xfs_icsb_unlock_cntr(icsbp);
        put_cpu();

        /*
         * We may have multiple threads here if multiple per-cpu
         * counters run dry at the same time. This will mean we can
         * do more balances than strictly necessary but it is not
         * the common slowpath case.
         */
        xfs_icsb_lock(mp);

        /*
         * running atomically.
         *
         * This will leave the counter in the correct state for future
         * accesses. After the rebalance, we simply try again and our retry
         * will either succeed through the fast path or slow path without
         * another balance operation being required.
         */
        xfs_icsb_balance_counter(mp, field, 0, delta);
        xfs_icsb_unlock(mp);
        goto again;
}

#endif /* HAVE_PERCPU_SB */
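/*
 * Illustrative sketch (editor's addition, never compiled; only
 * meaningful on HAVE_PERCPU_SB builds): a hypothetical caller
 * consuming 'nblocks' free blocks.  A negative delta allocates,
 * a positive delta frees; the rsvd flag is simply forwarded to
 * xfs_mod_incore_sb_unlocked() on the slow path.  Note that ENOSPC
 * is returned as a positive errno here, matching the check in
 * xfs_icsb_modify_counters() above.
 */
#if 0
        int     error;

        error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
                                         -(int64_t)nblocks, rsvd);
        if (error == ENOSPC)
                /* back out the reservation and fail the allocation */;
#endif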