1 /* 2 * Copyright (c) 2006-2007 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_bmap_btree.h" 20 #include "xfs_inum.h" 21 #include "xfs_dinode.h" 22 #include "xfs_inode.h" 23 #include "xfs_ag.h" 24 #include "xfs_log.h" 25 #include "xfs_trans.h" 26 #include "xfs_sb.h" 27 #include "xfs_mount.h" 28 #include "xfs_bmap.h" 29 #include "xfs_alloc.h" 30 #include "xfs_utils.h" 31 #include "xfs_mru_cache.h" 32 #include "xfs_filestream.h" 33 #include "xfs_trace.h" 34 35 #ifdef XFS_FILESTREAMS_TRACE 36 37 ktrace_t *xfs_filestreams_trace_buf; 38 39 STATIC void 40 xfs_filestreams_trace( 41 xfs_mount_t *mp, /* mount point */ 42 int type, /* type of trace */ 43 const char *func, /* source function */ 44 int line, /* source line number */ 45 __psunsigned_t arg0, 46 __psunsigned_t arg1, 47 __psunsigned_t arg2, 48 __psunsigned_t arg3, 49 __psunsigned_t arg4, 50 __psunsigned_t arg5) 51 { 52 ktrace_enter(xfs_filestreams_trace_buf, 53 (void *)(__psint_t)(type | (line << 16)), 54 (void *)func, 55 (void *)(__psunsigned_t)current_pid(), 56 (void *)mp, 57 (void *)(__psunsigned_t)arg0, 58 (void *)(__psunsigned_t)arg1, 59 (void *)(__psunsigned_t)arg2, 60 (void *)(__psunsigned_t)arg3, 61 (void *)(__psunsigned_t)arg4, 62 (void *)(__psunsigned_t)arg5, 63 NULL, NULL, NULL, NULL, NULL, NULL); 64 } 65 66 #define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) 67 #define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) 68 #define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) 69 #define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) 70 #define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) 71 #define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) 72 #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ 73 xfs_filestreams_trace(mp, t, __func__, __LINE__, \ 74 (__psunsigned_t)a0, (__psunsigned_t)a1, \ 75 (__psunsigned_t)a2, (__psunsigned_t)a3, \ 76 (__psunsigned_t)a4, (__psunsigned_t)a5) 77 78 #define TRACE_AG_SCAN(mp, ag, ag2) \ 79 TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); 80 #define TRACE_AG_PICK1(mp, max_ag, maxfree) \ 81 TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); 82 #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ 83 TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ 84 cnt, free, scan, flag) 85 #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ 86 TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) 87 #define TRACE_FREE(mp, ip, pip, ag, cnt) \ 88 TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) 89 #define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ 90 TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) 91 #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ 92 TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) 93 #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ 94 TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) 95 #define TRACE_ORPHAN(mp, ip, ag) \ 96 TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); 97 98 99 #else 100 #define TRACE_AG_SCAN(mp, ag, ag2) 101 #define TRACE_AG_PICK1(mp, max_ag, maxfree) 102 #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) 103 #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) 104 #define TRACE_FREE(mp, ip, pip, ag, cnt) 105 #define TRACE_LOOKUP(mp, ip, pip, ag, cnt) 106 #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) 107 #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) 108 #define TRACE_ORPHAN(mp, ip, ag) 109 #endif 110 111 static kmem_zone_t *item_zone; 112 113 /* 114 * Structure for associating a file or a directory with an allocation group. 115 * The parent directory pointer is only needed for files, but since there will 116 * generally be vastly more files than directories in the cache, using the same 117 * data structure simplifies the code with very little memory overhead. 118 */ 119 typedef struct fstrm_item 120 { 121 xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ 122 xfs_inode_t *ip; /* inode self-pointer. */ 123 xfs_inode_t *pip; /* Parent directory inode pointer. */ 124 } fstrm_item_t; 125 126 /* 127 * Allocation group filestream associations are tracked with per-ag atomic 128 * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a 129 * particular AG already has active filestreams associated with it. The mount 130 * point's m_peraglock is used to protect these counters from per-ag array 131 * re-allocation during a growfs operation. When xfs_growfs_data_private() is 132 * about to reallocate the array, it calls xfs_filestream_flush() with the 133 * m_peraglock held in write mode. 134 * 135 * Since xfs_mru_cache_flush() guarantees that all the free functions for all 136 * the cache elements have finished executing before it returns, it's safe for 137 * the free functions to use the atomic counters without m_peraglock protection. 138 * This allows the implementation of xfs_fstrm_free_func() to be agnostic about 139 * whether it was called with the m_peraglock held in read mode, write mode or 140 * not held at all. The race condition this addresses is the following: 141 * 142 * - The work queue scheduler fires and pulls a filestream directory cache 143 * element off the LRU end of the cache for deletion, then gets pre-empted. 144 * - A growfs operation grabs the m_peraglock in write mode, flushes all the 145 * remaining items from the cache and reallocates the mount point's per-ag 146 * array, resetting all the counters to zero. 147 * - The work queue thread resumes and calls the free function for the element 148 * it started cleaning up earlier. In the process it decrements the 149 * filestreams counter for an AG that now has no references. 150 * 151 * With a shrinkfs feature, the above scenario could panic the system. 152 * 153 * All other uses of the following macros should be protected by either the 154 * m_peraglock held in read mode, or the cache's internal locking exposed by the 155 * interval between a call to xfs_mru_cache_lookup() and a call to 156 * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode 157 * when new elements are added to the cache. 158 * 159 * Combined, these locking rules ensure that no associations will ever exist in 160 * the cache that reference per-ag array elements that have since been 161 * reallocated. 162 */ 163 static int 164 xfs_filestream_peek_ag( 165 xfs_mount_t *mp, 166 xfs_agnumber_t agno) 167 { 168 struct xfs_perag *pag; 169 int ret; 170 171 pag = xfs_perag_get(mp, agno); 172 ret = atomic_read(&pag->pagf_fstrms); 173 xfs_perag_put(pag); 174 return ret; 175 } 176 177 static int 178 xfs_filestream_get_ag( 179 xfs_mount_t *mp, 180 xfs_agnumber_t agno) 181 { 182 struct xfs_perag *pag; 183 int ret; 184 185 pag = xfs_perag_get(mp, agno); 186 ret = atomic_inc_return(&pag->pagf_fstrms); 187 xfs_perag_put(pag); 188 return ret; 189 } 190 191 static void 192 xfs_filestream_put_ag( 193 xfs_mount_t *mp, 194 xfs_agnumber_t agno) 195 { 196 struct xfs_perag *pag; 197 198 pag = xfs_perag_get(mp, agno); 199 atomic_dec(&pag->pagf_fstrms); 200 xfs_perag_put(pag); 201 } 202 203 /* 204 * Scan the AGs starting at startag looking for an AG that isn't in use and has 205 * at least minlen blocks free. 206 */ 207 static int 208 _xfs_filestream_pick_ag( 209 xfs_mount_t *mp, 210 xfs_agnumber_t startag, 211 xfs_agnumber_t *agp, 212 int flags, 213 xfs_extlen_t minlen) 214 { 215 int streams, max_streams; 216 int err, trylock, nscan; 217 xfs_extlen_t longest, free, minfree, maxfree = 0; 218 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 219 struct xfs_perag *pag; 220 221 /* 2% of an AG's blocks must be free for it to be chosen. */ 222 minfree = mp->m_sb.sb_agblocks / 50; 223 224 ag = startag; 225 *agp = NULLAGNUMBER; 226 227 /* For the first pass, don't sleep trying to init the per-AG. */ 228 trylock = XFS_ALLOC_FLAG_TRYLOCK; 229 230 for (nscan = 0; 1; nscan++) { 231 pag = xfs_perag_get(mp, ag); 232 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); 233 234 if (!pag->pagf_init) { 235 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); 236 if (err && !trylock) { 237 xfs_perag_put(pag); 238 return err; 239 } 240 } 241 242 /* Might fail sometimes during the 1st pass with trylock set. */ 243 if (!pag->pagf_init) 244 goto next_ag; 245 246 /* Keep track of the AG with the most free blocks. */ 247 if (pag->pagf_freeblks > maxfree) { 248 maxfree = pag->pagf_freeblks; 249 max_streams = atomic_read(&pag->pagf_fstrms); 250 max_ag = ag; 251 } 252 253 /* 254 * The AG reference count does two things: it enforces mutual 255 * exclusion when examining the suitability of an AG in this 256 * loop, and it guards against two filestreams being established 257 * in the same AG as each other. 258 */ 259 if (xfs_filestream_get_ag(mp, ag) > 1) { 260 xfs_filestream_put_ag(mp, ag); 261 goto next_ag; 262 } 263 264 longest = xfs_alloc_longest_free_extent(mp, pag); 265 if (((minlen && longest >= minlen) || 266 (!minlen && pag->pagf_freeblks >= minfree)) && 267 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 268 (flags & XFS_PICK_LOWSPACE))) { 269 270 /* Break out, retaining the reference on the AG. */ 271 free = pag->pagf_freeblks; 272 streams = atomic_read(&pag->pagf_fstrms); 273 xfs_perag_put(pag); 274 *agp = ag; 275 break; 276 } 277 278 /* Drop the reference on this AG, it's not usable. */ 279 xfs_filestream_put_ag(mp, ag); 280 next_ag: 281 xfs_perag_put(pag); 282 /* Move to the next AG, wrapping to AG 0 if necessary. */ 283 if (++ag >= mp->m_sb.sb_agcount) 284 ag = 0; 285 286 /* If a full pass of the AGs hasn't been done yet, continue. */ 287 if (ag != startag) 288 continue; 289 290 /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ 291 if (trylock != 0) { 292 trylock = 0; 293 continue; 294 } 295 296 /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ 297 if (!(flags & XFS_PICK_LOWSPACE)) { 298 flags |= XFS_PICK_LOWSPACE; 299 continue; 300 } 301 302 /* 303 * Take the AG with the most free space, regardless of whether 304 * it's already in use by another filestream. 305 */ 306 if (max_ag != NULLAGNUMBER) { 307 xfs_filestream_get_ag(mp, max_ag); 308 TRACE_AG_PICK1(mp, max_ag, maxfree); 309 streams = max_streams; 310 free = maxfree; 311 *agp = max_ag; 312 break; 313 } 314 315 /* take AG 0 if none matched */ 316 TRACE_AG_PICK1(mp, max_ag, maxfree); 317 *agp = 0; 318 return 0; 319 } 320 321 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); 322 323 return 0; 324 } 325 326 /* 327 * Set the allocation group number for a file or a directory, updating inode 328 * references and per-AG references as appropriate. 329 */ 330 static int 331 _xfs_filestream_update_ag( 332 xfs_inode_t *ip, 333 xfs_inode_t *pip, 334 xfs_agnumber_t ag) 335 { 336 int err = 0; 337 xfs_mount_t *mp; 338 xfs_mru_cache_t *cache; 339 fstrm_item_t *item; 340 xfs_agnumber_t old_ag; 341 xfs_inode_t *old_pip; 342 343 /* 344 * Either ip is a regular file and pip is a directory, or ip is a 345 * directory and pip is NULL. 346 */ 347 ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && 348 S_ISDIR(pip->i_d.di_mode)) || 349 (S_ISDIR(ip->i_d.di_mode) && !pip))); 350 351 mp = ip->i_mount; 352 cache = mp->m_filestream; 353 354 item = xfs_mru_cache_lookup(cache, ip->i_ino); 355 if (item) { 356 ASSERT(item->ip == ip); 357 old_ag = item->ag; 358 item->ag = ag; 359 old_pip = item->pip; 360 item->pip = pip; 361 xfs_mru_cache_done(cache); 362 363 /* 364 * If the AG has changed, drop the old ref and take a new one, 365 * effectively transferring the reference from old to new AG. 366 */ 367 if (ag != old_ag) { 368 xfs_filestream_put_ag(mp, old_ag); 369 xfs_filestream_get_ag(mp, ag); 370 } 371 372 /* 373 * If ip is a file and its pip has changed, drop the old ref and 374 * take a new one. 375 */ 376 if (pip && pip != old_pip) { 377 IRELE(old_pip); 378 IHOLD(pip); 379 } 380 381 TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), 382 ag, xfs_filestream_peek_ag(mp, ag)); 383 return 0; 384 } 385 386 item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); 387 if (!item) 388 return ENOMEM; 389 390 item->ag = ag; 391 item->ip = ip; 392 item->pip = pip; 393 394 err = xfs_mru_cache_insert(cache, ip->i_ino, item); 395 if (err) { 396 kmem_zone_free(item_zone, item); 397 return err; 398 } 399 400 /* Take a reference on the AG. */ 401 xfs_filestream_get_ag(mp, ag); 402 403 /* 404 * Take a reference on the inode itself regardless of whether it's a 405 * regular file or a directory. 406 */ 407 IHOLD(ip); 408 409 /* 410 * In the case of a regular file, take a reference on the parent inode 411 * as well to ensure it remains in-core. 412 */ 413 if (pip) 414 IHOLD(pip); 415 416 TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), 417 ag, xfs_filestream_peek_ag(mp, ag)); 418 419 return 0; 420 } 421 422 /* xfs_fstrm_free_func(): callback for freeing cached stream items. */ 423 STATIC void 424 xfs_fstrm_free_func( 425 unsigned long ino, 426 void *data) 427 { 428 fstrm_item_t *item = (fstrm_item_t *)data; 429 xfs_inode_t *ip = item->ip; 430 431 ASSERT(ip->i_ino == ino); 432 433 xfs_iflags_clear(ip, XFS_IFILESTREAM); 434 435 /* Drop the reference taken on the AG when the item was added. */ 436 xfs_filestream_put_ag(ip->i_mount, item->ag); 437 438 TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, 439 xfs_filestream_peek_ag(ip->i_mount, item->ag)); 440 441 /* 442 * _xfs_filestream_update_ag() always takes a reference on the inode 443 * itself, whether it's a file or a directory. Release it here. 444 * This can result in the inode being freed and so we must 445 * not hold any inode locks when freeing filesstreams objects 446 * otherwise we can deadlock here. 447 */ 448 IRELE(ip); 449 450 /* 451 * In the case of a regular file, _xfs_filestream_update_ag() also 452 * takes a ref on the parent inode to keep it in-core. Release that 453 * too. 454 */ 455 if (item->pip) 456 IRELE(item->pip); 457 458 /* Finally, free the memory allocated for the item. */ 459 kmem_zone_free(item_zone, item); 460 } 461 462 /* 463 * xfs_filestream_init() is called at xfs initialisation time to set up the 464 * memory zone that will be used for filestream data structure allocation. 465 */ 466 int 467 xfs_filestream_init(void) 468 { 469 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); 470 if (!item_zone) 471 return -ENOMEM; 472 473 return 0; 474 } 475 476 /* 477 * xfs_filestream_uninit() is called at xfs termination time to destroy the 478 * memory zone that was used for filestream data structure allocation. 479 */ 480 void 481 xfs_filestream_uninit(void) 482 { 483 kmem_zone_destroy(item_zone); 484 } 485 486 /* 487 * xfs_filestream_mount() is called when a file system is mounted with the 488 * filestream option. It is responsible for allocating the data structures 489 * needed to track the new file system's file streams. 490 */ 491 int 492 xfs_filestream_mount( 493 xfs_mount_t *mp) 494 { 495 int err; 496 unsigned int lifetime, grp_count; 497 498 /* 499 * The filestream timer tunable is currently fixed within the range of 500 * one second to four minutes, with five seconds being the default. The 501 * group count is somewhat arbitrary, but it'd be nice to adhere to the 502 * timer tunable to within about 10 percent. This requires at least 10 503 * groups. 504 */ 505 lifetime = xfs_fstrm_centisecs * 10; 506 grp_count = 10; 507 508 err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, 509 xfs_fstrm_free_func); 510 511 return err; 512 } 513 514 /* 515 * xfs_filestream_unmount() is called when a file system that was mounted with 516 * the filestream option is unmounted. It drains the data structures created 517 * to track the file system's file streams and frees all the memory that was 518 * allocated. 519 */ 520 void 521 xfs_filestream_unmount( 522 xfs_mount_t *mp) 523 { 524 xfs_mru_cache_destroy(mp->m_filestream); 525 } 526 527 /* 528 * Return the AG of the filestream the file or directory belongs to, or 529 * NULLAGNUMBER otherwise. 530 */ 531 xfs_agnumber_t 532 xfs_filestream_lookup_ag( 533 xfs_inode_t *ip) 534 { 535 xfs_mru_cache_t *cache; 536 fstrm_item_t *item; 537 xfs_agnumber_t ag; 538 int ref; 539 540 if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { 541 ASSERT(0); 542 return NULLAGNUMBER; 543 } 544 545 cache = ip->i_mount->m_filestream; 546 item = xfs_mru_cache_lookup(cache, ip->i_ino); 547 if (!item) { 548 TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); 549 return NULLAGNUMBER; 550 } 551 552 ASSERT(ip == item->ip); 553 ag = item->ag; 554 ref = xfs_filestream_peek_ag(ip->i_mount, ag); 555 xfs_mru_cache_done(cache); 556 557 TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); 558 return ag; 559 } 560 561 /* 562 * xfs_filestream_associate() should only be called to associate a regular file 563 * with its parent directory. Calling it with a child directory isn't 564 * appropriate because filestreams don't apply to entire directory hierarchies. 565 * Creating a file in a child directory of an existing filestream directory 566 * starts a new filestream with its own allocation group association. 567 * 568 * Returns < 0 on error, 0 if successful association occurred, > 0 if 569 * we failed to get an association because of locking issues. 570 */ 571 int 572 xfs_filestream_associate( 573 xfs_inode_t *pip, 574 xfs_inode_t *ip) 575 { 576 xfs_mount_t *mp; 577 xfs_mru_cache_t *cache; 578 fstrm_item_t *item; 579 xfs_agnumber_t ag, rotorstep, startag; 580 int err = 0; 581 582 ASSERT(S_ISDIR(pip->i_d.di_mode)); 583 ASSERT(S_ISREG(ip->i_d.di_mode)); 584 if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) 585 return -EINVAL; 586 587 mp = pip->i_mount; 588 cache = mp->m_filestream; 589 590 /* 591 * We have a problem, Houston. 592 * 593 * Taking the iolock here violates inode locking order - we already 594 * hold the ilock. Hence if we block getting this lock we may never 595 * wake. Unfortunately, that means if we can't get the lock, we're 596 * screwed in terms of getting a stream association - we can't spin 597 * waiting for the lock because someone else is waiting on the lock we 598 * hold and we cannot drop that as we are in a transaction here. 599 * 600 * Lucky for us, this inversion is not a problem because it's a 601 * directory inode that we are trying to lock here. 602 * 603 * So, if we can't get the iolock without sleeping then just give up 604 */ 605 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) 606 return 1; 607 608 /* If the parent directory is already in the cache, use its AG. */ 609 item = xfs_mru_cache_lookup(cache, pip->i_ino); 610 if (item) { 611 ASSERT(item->ip == pip); 612 ag = item->ag; 613 xfs_mru_cache_done(cache); 614 615 TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); 616 err = _xfs_filestream_update_ag(ip, pip, ag); 617 618 goto exit; 619 } 620 621 /* 622 * Set the starting AG using the rotor for inode32, otherwise 623 * use the directory inode's AG. 624 */ 625 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 626 rotorstep = xfs_rotorstep; 627 startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; 628 mp->m_agfrotor = (mp->m_agfrotor + 1) % 629 (mp->m_sb.sb_agcount * rotorstep); 630 } else 631 startag = XFS_INO_TO_AGNO(mp, pip->i_ino); 632 633 /* Pick a new AG for the parent inode starting at startag. */ 634 err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); 635 if (err || ag == NULLAGNUMBER) 636 goto exit_did_pick; 637 638 /* Associate the parent inode with the AG. */ 639 err = _xfs_filestream_update_ag(pip, NULL, ag); 640 if (err) 641 goto exit_did_pick; 642 643 /* Associate the file inode with the AG. */ 644 err = _xfs_filestream_update_ag(ip, pip, ag); 645 if (err) 646 goto exit_did_pick; 647 648 TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); 649 650 exit_did_pick: 651 /* 652 * If _xfs_filestream_pick_ag() returned a valid AG, remove the 653 * reference it took on it, since the file and directory will have taken 654 * their own now if they were successfully cached. 655 */ 656 if (ag != NULLAGNUMBER) 657 xfs_filestream_put_ag(mp, ag); 658 659 exit: 660 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 661 return -err; 662 } 663 664 /* 665 * Pick a new allocation group for the current file and its file stream. This 666 * function is called by xfs_bmap_filestreams() with the mount point's per-ag 667 * lock held. 668 */ 669 int 670 xfs_filestream_new_ag( 671 xfs_bmalloca_t *ap, 672 xfs_agnumber_t *agp) 673 { 674 int flags, err; 675 xfs_inode_t *ip, *pip = NULL; 676 xfs_mount_t *mp; 677 xfs_mru_cache_t *cache; 678 xfs_extlen_t minlen; 679 fstrm_item_t *dir, *file; 680 xfs_agnumber_t ag = NULLAGNUMBER; 681 682 ip = ap->ip; 683 mp = ip->i_mount; 684 cache = mp->m_filestream; 685 minlen = ap->length; 686 *agp = NULLAGNUMBER; 687 688 /* 689 * Look for the file in the cache, removing it if it's found. Doing 690 * this allows it to be held across the dir lookup that follows. 691 */ 692 file = xfs_mru_cache_remove(cache, ip->i_ino); 693 if (file) { 694 ASSERT(ip == file->ip); 695 696 /* Save the file's parent inode and old AG number for later. */ 697 pip = file->pip; 698 ag = file->ag; 699 700 /* Look for the file's directory in the cache. */ 701 dir = xfs_mru_cache_lookup(cache, pip->i_ino); 702 if (dir) { 703 ASSERT(pip == dir->ip); 704 705 /* 706 * If the directory has already moved on to a new AG, 707 * use that AG as the new AG for the file. Don't 708 * forget to twiddle the AG refcounts to match the 709 * movement. 710 */ 711 if (dir->ag != file->ag) { 712 xfs_filestream_put_ag(mp, file->ag); 713 xfs_filestream_get_ag(mp, dir->ag); 714 *agp = file->ag = dir->ag; 715 } 716 717 xfs_mru_cache_done(cache); 718 } 719 720 /* 721 * Put the file back in the cache. If this fails, the free 722 * function needs to be called to tidy up in the same way as if 723 * the item had simply expired from the cache. 724 */ 725 err = xfs_mru_cache_insert(cache, ip->i_ino, file); 726 if (err) { 727 xfs_fstrm_free_func(ip->i_ino, file); 728 return err; 729 } 730 731 /* 732 * If the file's AG was moved to the directory's new AG, there's 733 * nothing more to be done. 734 */ 735 if (*agp != NULLAGNUMBER) { 736 TRACE_MOVEAG(mp, ip, pip, 737 ag, xfs_filestream_peek_ag(mp, ag), 738 *agp, xfs_filestream_peek_ag(mp, *agp)); 739 return 0; 740 } 741 } 742 743 /* 744 * If the file's parent directory is known, take its iolock in exclusive 745 * mode to prevent two sibling files from racing each other to migrate 746 * themselves and their parent to different AGs. 747 * 748 * Note that we lock the parent directory iolock inside the child 749 * iolock here. That's fine as we never hold both parent and child 750 * iolock in any other place. This is different from the ilock, 751 * which requires locking of the child after the parent for namespace 752 * operations. 753 */ 754 if (pip) 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); 756 757 /* 758 * A new AG needs to be found for the file. If the file's parent 759 * directory is also known, it will be moved to the new AG as well to 760 * ensure that files created inside it in future use the new AG. 761 */ 762 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; 763 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 764 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); 765 766 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); 767 if (err || *agp == NULLAGNUMBER) 768 goto exit; 769 770 /* 771 * If the file wasn't found in the file cache, then its parent directory 772 * inode isn't known. For this to have happened, the file must either 773 * be pre-existing, or it was created long enough ago that its cache 774 * entry has expired. This isn't the sort of usage that the filestreams 775 * allocator is trying to optimise, so there's no point trying to track 776 * its new AG somehow in the filestream data structures. 777 */ 778 if (!pip) { 779 TRACE_ORPHAN(mp, ip, *agp); 780 goto exit; 781 } 782 783 /* Associate the parent inode with the AG. */ 784 err = _xfs_filestream_update_ag(pip, NULL, *agp); 785 if (err) 786 goto exit; 787 788 /* Associate the file inode with the AG. */ 789 err = _xfs_filestream_update_ag(ip, pip, *agp); 790 if (err) 791 goto exit; 792 793 TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, 794 *agp, xfs_filestream_peek_ag(mp, *agp)); 795 796 exit: 797 /* 798 * If _xfs_filestream_pick_ag() returned a valid AG, remove the 799 * reference it took on it, since the file and directory will have taken 800 * their own now if they were successfully cached. 801 */ 802 if (*agp != NULLAGNUMBER) 803 xfs_filestream_put_ag(mp, *agp); 804 else 805 *agp = 0; 806 807 if (pip) 808 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 809 810 return err; 811 } 812 813 /* 814 * Remove an association between an inode and a filestream object. 815 * Typically this is done on last close of an unlinked file. 816 */ 817 void 818 xfs_filestream_deassociate( 819 xfs_inode_t *ip) 820 { 821 xfs_mru_cache_t *cache = ip->i_mount->m_filestream; 822 823 xfs_mru_cache_delete(cache, ip->i_ino); 824 } 825