// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code.  The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs.  Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code.  Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type.  When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Don't allow staging cursors to be duplicated because they're supposed to be
 * kept private to a single thread.
 */
STATIC struct xfs_btree_cur *
xfs_btree_fakeroot_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	/* Reaching here is a programming error. */
	ASSERT(0);
	return NULL;
}

/*
 * Don't allow block allocation for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 *
 * Bulk loading uses a separate callback to obtain new blocks from a
 * preallocated list, which prevents ENOSPC failures during loading.
 */
STATIC int
xfs_btree_fakeroot_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start_bno,
	union xfs_btree_ptr		*new_bno,
	int				*stat)
{
	/* Reaching here is a programming error. */
	ASSERT(0);
	return -EFSCORRUPTED;
}

/*
 * Don't allow block freeing for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 */
STATIC int
xfs_btree_fakeroot_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	/* Reaching here is a programming error. */
	ASSERT(0);
	return -EFSCORRUPTED;
}

/* Initialize a pointer to the root block from the fakeroot. */
STATIC void
xfs_btree_fakeroot_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xbtree_afakeroot	*afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * AG btrees use a short-form (AG block number) root pointer, which is
	 * why only ptr->s is filled in here.
	 */
	afake = cur->bc_ag.afake;
	ptr->s = cpu_to_be32(afake->af_root);
}

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
 * staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/* Update the btree root information for a per-AG fake root.
*/ 113 STATIC void 114 xfs_btree_afakeroot_set_root( 115 struct xfs_btree_cur *cur, 116 const union xfs_btree_ptr *ptr, 117 int inc) 118 { 119 struct xbtree_afakeroot *afake = cur->bc_ag.afake; 120 121 ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 122 afake->af_root = be32_to_cpu(ptr->s); 123 afake->af_levels += inc; 124 } 125 126 /* 127 * Initialize a AG-rooted btree cursor with the given AG btree fake root. 128 * The btree cursor's bc_ops will be overridden as needed to make the staging 129 * functionality work. 130 */ 131 void 132 xfs_btree_stage_afakeroot( 133 struct xfs_btree_cur *cur, 134 struct xbtree_afakeroot *afake) 135 { 136 struct xfs_btree_ops *nops; 137 138 ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); 139 ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); 140 ASSERT(cur->bc_tp == NULL); 141 142 nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); 143 memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); 144 nops->alloc_block = xfs_btree_fakeroot_alloc_block; 145 nops->free_block = xfs_btree_fakeroot_free_block; 146 nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; 147 nops->set_root = xfs_btree_afakeroot_set_root; 148 nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; 149 150 cur->bc_ag.afake = afake; 151 cur->bc_nlevels = afake->af_levels; 152 cur->bc_ops = nops; 153 cur->bc_flags |= XFS_BTREE_STAGING; 154 } 155 156 /* 157 * Transform an AG-rooted staging btree cursor back into a regular cursor by 158 * substituting a real btree root for the fake one and restoring normal btree 159 * cursor ops. The caller must log the btree root change prior to calling 160 * this. 
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	/* Free the staging ops copy allocated by xfs_btree_stage_afakeroot. */
	kmem_free((void *)cur->bc_ops);
	/* Point the cursor at the real AG header buffer and restore the ops. */
	cur->bc_ag.agbp = agbp;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
 * the staging cursor.  This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.  The btree cursor's bc_ops will be overridden as needed to make the
 * staging functionality work.  If new_ops is not NULL, these new ops will be
 * passed out to the caller for further overriding.
210 */ 211 void 212 xfs_btree_stage_ifakeroot( 213 struct xfs_btree_cur *cur, 214 struct xbtree_ifakeroot *ifake, 215 struct xfs_btree_ops **new_ops) 216 { 217 struct xfs_btree_ops *nops; 218 219 ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); 220 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 221 ASSERT(cur->bc_tp == NULL); 222 223 nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); 224 memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); 225 nops->alloc_block = xfs_btree_fakeroot_alloc_block; 226 nops->free_block = xfs_btree_fakeroot_free_block; 227 nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; 228 nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; 229 230 cur->bc_ino.ifake = ifake; 231 cur->bc_nlevels = ifake->if_levels; 232 cur->bc_ops = nops; 233 cur->bc_flags |= XFS_BTREE_STAGING; 234 235 if (new_ops) 236 *new_ops = nops; 237 } 238 239 /* 240 * Transform an inode-rooted staging btree cursor back into a regular cursor by 241 * substituting a real btree root for the fake one and restoring normal btree 242 * cursor ops. The caller must log the btree root change prior to calling 243 * this. 244 */ 245 void 246 xfs_btree_commit_ifakeroot( 247 struct xfs_btree_cur *cur, 248 struct xfs_trans *tp, 249 int whichfork, 250 const struct xfs_btree_ops *ops) 251 { 252 ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 253 ASSERT(cur->bc_tp == NULL); 254 255 trace_xfs_btree_commit_ifakeroot(cur); 256 257 kmem_free((void *)cur->bc_ops); 258 cur->bc_ino.ifake = NULL; 259 cur->bc_ino.whichfork = whichfork; 260 cur->bc_ops = ops; 261 cur->bc_flags &= ~XFS_BTREE_STAGING; 262 cur->bc_tp = tp; 263 } 264 265 /* 266 * Bulk Loading of Staged Btrees 267 * ============================= 268 * 269 * This interface is used with a staged btree cursor to create a totally new 270 * btree with a large number of records (i.e. more than what would fit in a 271 * single root block). 
When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor.  Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree.  See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block().  Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor.  The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure.  For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree.
For each
 * level, the desired number of items per block is defined as:
 *
 *	desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 *	blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs.  The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 *	npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division.  If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 */
static void
xfs_btree_bload_drop_buf(
	struct list_head	*buffers_list,
	struct xfs_buf		**bpp)
{
	/* Nothing to do if the caller never attached a buffer. */
	if (*bpp == NULL)
		return;

	/* Queueing a just-built buffer should never fail; scream if it does. */
	if (!xfs_buf_delwri_queue(*bpp, buffers_list))
		ASSERT(0);

	/* Drop our reference; the delwri list keeps its own. */
	xfs_buf_relse(*bpp);
	*bpp = NULL;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr	new_ptr;
	struct xfs_buf		*new_bp;
	struct xfs_btree_block	*new_block;
	int			ret;

	/*
	 * The root of an inode-rooted btree lives in the inode fork, not in a
	 * disk block, so it gets special treatment here.
	 */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, nr_this_block, priv);
		ifp->if_broot = kmem_zalloc(new_size, 0);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
				nr_this_block, cur->bc_ino.ip->i_ino,
				cur->bc_flags);

		/* No buffer backs the incore root; ptr is the null sentinel. */
		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
	xfs_btree_bload_drop_buf(buffers_list, bpp);

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_record_fn	get_record,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int		j;
	int			ret;

	/*
	 * Fill the leaf block with records.  The callback stages the next
	 * record in the cursor; init_rec_from_cur copies it into the block.
	 * Note that btree record slots are 1-based.
	 */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_rec	*block_rec;

		ret = get_record(cur, priv);
		if (ret)
			return ret;
		block_rec = xfs_btree_rec_addr(cur, j, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return 0;
}

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.  A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr.  On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
				&child_bp);
		if (ret)
			return ret;

		/* Copy the child's pointer and (derived) key into slot j. */
		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		/* Advance to the child's right sibling for the next slot. */
		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree.  Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	/* An inode fork root block has its own maxrecs computation. */
	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	/* Leave the configured slack unfilled in each block. */
	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb.  The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level.  For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block.  Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks.  Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	/* Clamp for the empty-level case, where nr_this_level could be 0. */
	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 * cursor.  This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks.  Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	/* Walk from the leaf level up, adding a level until the root fits. */
	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1.  Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root.  Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		/* Each block at this level becomes one item a level up. */
		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	/* The inode fork root is not a disk block, so don't count it. */
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl.
 */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases.  blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;
	/* Queue the last leaf buffer for writing. */
	xfs_btree_bload_drop_buf(&buffers_list, &bp);

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		/* The previous level's block count is this level's item count. */
		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;
		xfs_btree_bload_drop_buf(&buffers_list, &bp);
		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
		/* The incore root has no disk pointer and isn't counted. */
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		/* ptr still points at the last (topmost) block we built. */
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	/* Drop any buffers still queued, then release the in-flight buffer. */
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}