// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"

/*
 * Btree magic numbers.
 *
 * Row [0] holds the pre-CRC magics and row [1] the CRC-enabled magics,
 * indexed by xfs_btnum_t.  A zero entry marks a btree type that only
 * exists in the CRC-enabled on-disk format (rmap, refcount).
 */
static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
	  XFS_FIBT_MAGIC, 0 },
	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
	  XFS_REFC_CRC_MAGIC }
};

/* Return the on-disk magic for the given btree type and CRC format. */
uint32_t
xfs_btree_magic(
	int			crc,	/* nonzero selects the CRC magic row */
	xfs_btnum_t		btnum)
{
	uint32_t		magic = xfs_magics[crc][btnum];

	/* Ensure we asked for crc for crc-only magics. */
	ASSERT(magic != 0);
	return magic;
}

/*
 * Check a long-format (64-bit fsblock) sibling pointer.  A negative @level
 * means the cursor level is not known, so only range-check @sibling against
 * the filesystem; otherwise verify it against the cursor at @level + 1.
 * Returns the address of the failing check, or NULL if the pointer is ok.
 */
static xfs_failaddr_t
xfs_btree_check_lblock_siblings(
	struct xfs_mount	*mp,
	struct xfs_btree_cur	*cur,
	int			level,
	xfs_fsblock_t		fsb,	/* block that owns the sibling ptr */
	xfs_fsblock_t		sibling)
{
	if (sibling == NULLFSBLOCK)
		return NULL;
	/* a block must never point at itself */
	if (sibling == fsb)
		return __this_address;
	if (level >= 0) {
		if (!xfs_btree_check_lptr(cur, sibling, level + 1))
			return __this_address;
	} else {
		if (!xfs_verify_fsbno(mp, sibling))
			return __this_address;
	}

	return NULL;
}

/*
 * Check a short-format (32-bit AG-relative) sibling pointer.  Same @level
 * convention as xfs_btree_check_lblock_siblings().
 */
static xfs_failaddr_t
xfs_btree_check_sblock_siblings(
	struct xfs_mount	*mp,
	struct xfs_btree_cur	*cur,
	int			level,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,	/* block that owns the sibling ptr */
	xfs_agblock_t		sibling)
{
	if (sibling == NULLAGBLOCK)
		return NULL;
	/* a block must never point at itself */
	if (sibling == agbno)
		return __this_address;
	if (level >= 0) {
		if (!xfs_btree_check_sptr(cur, sibling, level + 1))
			return __this_address;
	} else {
		if (!xfs_verify_agbno(mp, agno, sibling))
			return __this_address;
	}
	return NULL;
}

/*
 * Check a long btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
xfs_failaddr_t
__xfs_btree_check_lblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,	/* expected bb_level */
	struct xfs_buf		*bp)	/* NULL for blocks with no backing
					 * buffer, e.g. incore roots */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_btnum_t		btnum = cur->bc_btnum;
	int			crc = xfs_has_crc(mp);
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb = NULLFSBLOCK;

	if (crc) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		/*
		 * The self-pointer must match the buffer's disk address;
		 * a bufferless block is expected to record
		 * XFS_BUF_DADDR_NULL.
		 */
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	if (bp)
		fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));

	fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
			be64_to_cpu(block->bb_u.l.bb_leftsib));
	if (!fa)
		fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
				be64_to_cpu(block->bb_u.l.bb_rightsib));
	return fa;
}

/* Check a long btree block header. */
static int
xfs_btree_check_lblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_lblock(cur, block, level, bp);
	/* the error tag lets testing inject corruption failures here */
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * Check a short btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
xfs_failaddr_t
__xfs_btree_check_sblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,	/* expected bb_level */
	struct xfs_buf		*bp)	/* NULL for bufferless blocks */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_btnum_t		btnum = cur->bc_btnum;
	int			crc = xfs_has_crc(mp);
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno = NULLAGBLOCK;
	xfs_agnumber_t		agno = NULLAGNUMBER;

	if (crc) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.s.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	if (bp) {
		agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
		agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
	}

	fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
			be32_to_cpu(block->bb_u.s.bb_leftsib));
	if (!fa)
		fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
				agbno, be32_to_cpu(block->bb_u.s.bb_rightsib));
	return fa;
}

/* Check a short btree block header. */
STATIC int
xfs_btree_check_sblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_sblock(cur, block, level, bp);
	/* the error tag lets testing inject corruption failures here */
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * Debug routine: check that block header is ok.  Dispatches on the cursor's
 * pointer size to the long- or short-format checker.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		return xfs_btree_check_lblock(cur, block, level, bp);
	else
		return xfs_btree_check_sblock(cur, block, level, bp);
}

/* Check that this long pointer is valid and points within the fs.
 */
bool
xfs_btree_check_lptr(
	struct xfs_btree_cur	*cur,
	xfs_fsblock_t		fsbno,
	int			level)
{
	/* level 0 blocks hold records, so nothing may point at them */
	if (level <= 0)
		return false;
	return xfs_verify_fsbno(cur->bc_mp, fsbno);
}

/* Check that this short pointer is valid and points within the AG. */
bool
xfs_btree_check_sptr(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		agbno,
	int			level)
{
	/* level 0 blocks hold records, so nothing may point at them */
	if (level <= 0)
		return false;
	return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
}

/*
 * Check that a given (indexed) btree pointer at a certain level of a
 * btree is valid and doesn't point past where it should.  Logs the
 * corruption and returns -EFSCORRUPTED on failure, 0 if the pointer is ok.
 */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,	/* offset into the on-disk
						 * pointer array */
	int				level)
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
				level))
			return 0;
		/* long-pointer cursors use the bc_ino arm of the union */
		xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_btnum,
				level, index);
	} else {
		if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
				level))
			return 0;
		/* short-pointer cursors use the bc_ag arm of the union */
		xfs_err(cur->bc_mp,
"AG %u: Corrupt btree %d pointer at level %d index %d.",
				cur->bc_ag.pag->pag_agno, cur->bc_btnum,
				level, index);
	}

	return -EFSCORRUPTED;
}

/* Pointer checks are compiled out of non-debug kernels. */
#ifdef DEBUG
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * long-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
 * it into the buffer so recovery knows what the last modification was that made
 * it to disk.
 */
void
xfs_btree_lblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	/* no log item means the buffer was never dirtied in a transaction */
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}

/*
 * Verify the CRC of a long-form btree block.  Returns true when the block
 * passes (or the filesystem has no CRCs at all).
 */
bool
xfs_btree_lblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		/* reject blocks whose LSN the log considers invalid */
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
	}

	return true;
}

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * short-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
 * it into the buffer so recovery knows what the last modification was that made
 * it to disk.
 */
void
xfs_btree_sblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	/* no log item means the buffer was never dirtied in a transaction */
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}

/*
 * Verify the CRC of a short-form btree block.  Returns true when the block
 * passes (or the filesystem has no CRCs at all).
 */
bool
xfs_btree_sblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		/* reject blocks whose LSN the log considers invalid */
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
	}

	return true;
}

/*
 * Free a btree block via the cursor's free_block method, then invalidate
 * the buffer so stale contents are not replayed over the freed block.
 */
static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

/*
 * Delete the btree cursor, releasing all held buffers and the cursor
 * memory itself.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers.  If we're doing
	 * this because of an error, inspect all of the entries in the bc_bufs
	 * array for buffers to be unlocked.  This is because some of the btree
	 * code works from level n down to 0, and if we get an error along the
	 * way we won't have initialized all the entries down to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * A bmap cursor must have handed back all its allocated blocks
	 * unless the filesystem has already shut down.
	 */
	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp));
	/* staging cursors carry a private, heap-allocated ops vector */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
		kmem_free(cur->bc_ops);
	/* short-pointer cursors hold a perag reference */
	if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
		xfs_perag_put(cur->bc_ag.pag);
	kmem_cache_free(cur->bc_cache, cur);
}

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int						/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,		/* input cursor */
	struct xfs_btree_cur	**ncur)		/* output cursor */
{
	struct xfs_buf		*bp;	/* btree block's buffer pointer */
	int			error;	/* error return value */
	int			i;	/* level number of btree block */
	xfs_mount_t		*mp;	/* mount structure for filesystem */
	struct xfs_btree_cur	*new;	/* new cursor value */
	xfs_trans_t		*tp;	/* transaction pointer, can be NULL */

	tp = cur->bc_tp;
	mp = cur->bc_mp;

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
						   xfs_buf_daddr(bp), mp->m_bsize,
						   0, &bp,
						   cur->bc_ops->buf_ops);
			if (error) {
				/*
				 * Tearing down the new cursor releases any
				 * buffers re-acquired on earlier iterations.
				 */
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}

/*
 * XFS btree block layout and addressing:
 *
 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 *
 * The leaf record start with a header then followed by records containing
 * the values.  A non-leaf block also starts with the same header, and
 * then first contains lookup keys followed by an equal number of pointers
 * to the btree blocks at the previous level.
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 * The header is called struct xfs_btree_block for reasons better left unknown
 * and comes in different versions for short (32bit) and long (64bit) block
 * pointers.  The record and key structures are defined by the btree instances
 * and opaque to the btree core.  The block pointers are simple disk endian
 * integers, available in a short (32bit) and long (64bit) variant.
 *
 * The helpers below calculate the offset of a given record, key or pointer
 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 * record, key or pointer (xfs_btree_*_addr).
 * Note that all addressing
 * inside the btree block is done using indices starting at one, not zero!
 *
 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 * overlapping intervals.  In such a tree, records are still sorted lowest to
 * highest and indexed by the smallest key value that refers to the record.
 * However, nodes are different: each pointer has two associated keys -- one
 * indexing the lowest key available in the block(s) below (the same behavior
 * as the key in a regular btree) and another indexing the highest key
 * available in the block(s) below.  Because records are /not/ sorted by the
 * highest key, all leaf block updates require us to compute the highest key
 * that matches any record in the leaf and to recursively update the high keys
 * in the nodes going further up in the tree, if necessary.  Nodes look like
 * this:
 *
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 *
 * To perform an interval query on an overlapped tree, perform the usual
 * depth-first search and use the low and high keys to decide if we can skip
 * that particular node.  If a leaf node is reached, return the records that
 * intersect the interval.  Note that an interval query may return numerous
 * entries.  For a non-overlapped tree, simply search for the record associated
 * with the lowest key and iterate forward until a non-matching record is
 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 * more detail.
 *
 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 * reverse mapping records on a reflink filesystem:
 *
 * 1: +- file A startblock B offset C length D -----------+
 * 2:      +- file E startblock F offset G length H --------------+
 * 3:      +- file I startblock F offset J length K --+
 * 4:                                                        +- file L... --+
 *
 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 * we'd simply increment the length of record 1.  But how do we find the record
 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 * record 3 because the keys are ordered first by startblock.  An interval
 * query would return records 1 and 2 because they both overlap (B+D-1), and
 * from that we can pick out record 1 as the appropriate left neighbor.
 *
 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 * because a record's interval must end before the next record.
 */

/*
 * Return size of the btree block header for this btree instance.
 */
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
			return XFS_BTREE_LBLOCK_CRC_LEN;
		return XFS_BTREE_LBLOCK_LEN;
	}
	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
		return XFS_BTREE_SBLOCK_CRC_LEN;
	return XFS_BTREE_SBLOCK_LEN;
}

/*
 * Return size of btree block pointers for this btree instance.
 */
static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
{
	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
		sizeof(__be64) : sizeof(__be32);
}

/*
 * Calculate offset of the n-th record in a btree block.
 * Indices are 1-based, per the layout description above.
 */
STATIC size_t
xfs_btree_rec_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->rec_len;
}

/*
 * Calculate offset of the n-th key in a btree block.
603 */ 604 STATIC size_t 605 xfs_btree_key_offset( 606 struct xfs_btree_cur *cur, 607 int n) 608 { 609 return xfs_btree_block_len(cur) + 610 (n - 1) * cur->bc_ops->key_len; 611 } 612 613 /* 614 * Calculate offset of the n-th high key in a btree block. 615 */ 616 STATIC size_t 617 xfs_btree_high_key_offset( 618 struct xfs_btree_cur *cur, 619 int n) 620 { 621 return xfs_btree_block_len(cur) + 622 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 623 } 624 625 /* 626 * Calculate offset of the n-th block pointer in a btree block. 627 */ 628 STATIC size_t 629 xfs_btree_ptr_offset( 630 struct xfs_btree_cur *cur, 631 int n, 632 int level) 633 { 634 return xfs_btree_block_len(cur) + 635 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 636 (n - 1) * xfs_btree_ptr_len(cur); 637 } 638 639 /* 640 * Return a pointer to the n-th record in the btree block. 641 */ 642 union xfs_btree_rec * 643 xfs_btree_rec_addr( 644 struct xfs_btree_cur *cur, 645 int n, 646 struct xfs_btree_block *block) 647 { 648 return (union xfs_btree_rec *) 649 ((char *)block + xfs_btree_rec_offset(cur, n)); 650 } 651 652 /* 653 * Return a pointer to the n-th key in the btree block. 654 */ 655 union xfs_btree_key * 656 xfs_btree_key_addr( 657 struct xfs_btree_cur *cur, 658 int n, 659 struct xfs_btree_block *block) 660 { 661 return (union xfs_btree_key *) 662 ((char *)block + xfs_btree_key_offset(cur, n)); 663 } 664 665 /* 666 * Return a pointer to the n-th high key in the btree block. 667 */ 668 union xfs_btree_key * 669 xfs_btree_high_key_addr( 670 struct xfs_btree_cur *cur, 671 int n, 672 struct xfs_btree_block *block) 673 { 674 return (union xfs_btree_key *) 675 ((char *)block + xfs_btree_high_key_offset(cur, n)); 676 } 677 678 /* 679 * Return a pointer to the n-th block pointer in the btree block. 
680 */ 681 union xfs_btree_ptr * 682 xfs_btree_ptr_addr( 683 struct xfs_btree_cur *cur, 684 int n, 685 struct xfs_btree_block *block) 686 { 687 int level = xfs_btree_get_level(block); 688 689 ASSERT(block->bb_level != 0); 690 691 return (union xfs_btree_ptr *) 692 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 693 } 694 695 struct xfs_ifork * 696 xfs_btree_ifork_ptr( 697 struct xfs_btree_cur *cur) 698 { 699 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 700 701 if (cur->bc_flags & XFS_BTREE_STAGING) 702 return cur->bc_ino.ifake->if_fork; 703 return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); 704 } 705 706 /* 707 * Get the root block which is stored in the inode. 708 * 709 * For now this btree implementation assumes the btree root is always 710 * stored in the if_broot field of an inode fork. 711 */ 712 STATIC struct xfs_btree_block * 713 xfs_btree_get_iroot( 714 struct xfs_btree_cur *cur) 715 { 716 struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 717 718 return (struct xfs_btree_block *)ifp->if_broot; 719 } 720 721 /* 722 * Retrieve the block pointer from the cursor at the given level. 723 * This may be an inode btree root or from a buffer. 724 */ 725 struct xfs_btree_block * /* generic btree block pointer */ 726 xfs_btree_get_block( 727 struct xfs_btree_cur *cur, /* btree cursor */ 728 int level, /* level in btree */ 729 struct xfs_buf **bpp) /* buffer containing the block */ 730 { 731 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 732 (level == cur->bc_nlevels - 1)) { 733 *bpp = NULL; 734 return xfs_btree_get_iroot(cur); 735 } 736 737 *bpp = cur->bc_levels[level].bp; 738 return XFS_BUF_TO_BLOCK(*bpp); 739 } 740 741 /* 742 * Change the cursor to point to the first record at the given level. 743 * Other levels are unaffected. 
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to 1, that's the first record/key.
	 */
	cur->bc_levels[level].ptr = 1;
	return 1;
}

/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to numrecs, that's the last record/key.
	 */
	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
	return 1;
}

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 */
void
xfs_btree_offsets(
	uint32_t		fields,		/* bitmask of fields */
	const short		*offsets,	/* table of field offsets */
	int			nbits,		/* number of bits to inspect */
	int			*first,		/* output: first byte offset */
	int			*last)		/* output: last byte offset */
{
	int			i;	/* current bit number */
	uint32_t		imask;	/* mask for current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
		if (imask & fields) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * Find the highest bit, so the last byte offset.
	 *
	 * NOTE(review): this reads offsets[i + 1], one entry past the bit
	 * that was found -- the table presumably carries a terminating
	 * entry giving the end of the last field; confirm against callers.
	 */
	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
		if (imask & fields) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}

/*
 * Get a buffer for the block, return it read in.
 * Long-form addressing.
 */
int
xfs_btree_read_bufl(
	struct xfs_mount	*mp,		/* file system mount point */
	struct xfs_trans	*tp,		/* transaction pointer */
	xfs_fsblock_t		fsbno,		/* file system block number */
	struct xfs_buf		**bpp,		/* buffer for fsbno */
	int			refval,		/* ref count value for buffer */
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;	/* return value */
	xfs_daddr_t		d;	/* real disk block address */
	int			error;

	/* reject out-of-range block numbers before touching the disk */
	if (!xfs_verify_fsbno(mp, fsbno))
		return -EFSCORRUPTED;
	d = XFS_FSB_TO_DADDR(mp, fsbno);
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
				   mp->m_bsize, 0, &bp, ops);
	if (error)
		return error;
	if (bp)
		xfs_buf_set_ref(bp, refval);
	*bpp = bp;
	return 0;
}

/*
 * Read-ahead the block, don't wait for it, don't return a buffer.
 * Long-form addressing.
 */
/* ARGSUSED */
void
xfs_btree_reada_bufl(
	struct xfs_mount	*mp,		/* file system mount point */
	xfs_fsblock_t		fsbno,		/* file system block number */
	xfs_extlen_t		count,		/* count of filesystem blocks */
	const struct xfs_buf_ops *ops)
{
	xfs_daddr_t		d;

	ASSERT(fsbno != NULLFSBLOCK);
	d = XFS_FSB_TO_DADDR(mp, fsbno);
	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}

/*
 * Read-ahead the block, don't wait for it, don't return a buffer.
 * Short-form addressing.
 */
/* ARGSUSED */
void
xfs_btree_reada_bufs(
	struct xfs_mount	*mp,		/* file system mount point */
	xfs_agnumber_t		agno,		/* allocation group number */
	xfs_agblock_t		agbno,		/* allocation group block number */
	xfs_extlen_t		count,		/* count of filesystem blocks */
	const struct xfs_buf_ops *ops)
{
	xfs_daddr_t		d;

	ASSERT(agno != NULLAGNUMBER);
	ASSERT(agbno != NULLAGBLOCK);
	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}

/*
 * Start readahead for the siblings of a long-format block as requested
 * by @lr.  Returns the number of readaheads issued.
 */
STATIC int
xfs_btree_readahead_lblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	int			rval = 0;
	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
				     cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
				     cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

/*
 * Start readahead for the siblings of a short-format block as requested
 * by @lr.  Returns the number of readaheads issued.
 */
STATIC int
xfs_btree_readahead_sblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	int			rval = 0;
	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);


	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
				     left, 1, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
				     right, 1, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

/*
 * Read-ahead btree blocks, at the given level.
 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 */
STATIC int
xfs_btree_readahead(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			lev,		/* level in btree */
	int			lr)		/* left/right bits */
{
	struct xfs_btree_block	*block;

	/*
	 * No readahead needed if we are at the root level and the
	 * btree root is stored in the inode.
	 */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (lev == cur->bc_nlevels - 1))
		return 0;

	/* nothing to do if all requested directions were already done */
	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
		return 0;

	cur->bc_levels[lev].ra |= lr;
	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		return xfs_btree_readahead_lblock(cur, lr, block);
	return xfs_btree_readahead_sblock(cur, lr, block);
}

/*
 * Convert a btree pointer into a disk address, validating the pointer
 * first.  Returns -EFSCORRUPTED for a bad pointer.
 */
STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur	*cur,
	const union xfs_btree_ptr *ptr,
	xfs_daddr_t		*daddr)
{
	xfs_fsblock_t		fsbno;
	xfs_agblock_t		agbno;
	int			error;

	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		fsbno = be64_to_cpu(ptr->l);
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
	} else {
		agbno = be32_to_cpu(ptr->s);
		*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
					  agbno);
	}

	return 0;
}

/*
 * Readahead @count btree blocks at the given @ptr location.
 *
 * We don't need to care about long or short form btrees here as we have a
 * method of converting the ptr directly to a daddr available to us.
 */
STATIC void
xfs_btree_readahead_ptr(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	xfs_extlen_t		count)
{
	xfs_daddr_t		daddr;

	/* a bad pointer just means no readahead; not an error for callers */
	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
		return;
	xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
}

/*
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
STATIC void
xfs_btree_setbuf(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	struct xfs_buf		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	if (cur->bc_levels[lev].bp)
		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
	cur->bc_levels[lev].bp = bp;
	cur->bc_levels[lev].ra = 0;

	/*
	 * A null sibling means there is nothing to read ahead on that side,
	 * so mark that direction as already done.
	 */
	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	}
}

/* Return true if @ptr is the null sentinel for this cursor's format. */
bool
xfs_btree_ptr_is_null(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr)
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		return ptr->l == cpu_to_be64(NULLFSBLOCK);
	else
		return ptr->s == cpu_to_be32(NULLAGBLOCK);
}

/* Set @ptr to the null sentinel for this cursor's format. */
void
xfs_btree_set_ptr_null(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		ptr->l = cpu_to_be64(NULLFSBLOCK);
	else
		ptr->s = cpu_to_be32(NULLAGBLOCK);
}

/*
 * Get/set/init sibling pointers
 */
void
xfs_btree_get_sibling(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_ptr	*ptr,
	int			lr)	/* XFS_BB_LEFTSIB or XFS_BB_RIGHTSIB */
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->l = block->bb_u.l.bb_rightsib;
		else
			ptr->l = block->bb_u.l.bb_leftsib;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->s = block->bb_u.s.bb_rightsib;
		else
			ptr->s = block->bb_u.s.bb_leftsib;
	}
}

void
xfs_btree_set_sibling(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_block		*block,
	const union xfs_btree_ptr	*ptr,
	int				lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.l.bb_rightsib = ptr->l;
		else
			block->bb_u.l.bb_leftsib = ptr->l;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.s.bb_rightsib = ptr->s;
		else
			block->bb_u.s.bb_leftsib = ptr->s;
	}
}

/*
 * Initialize a fresh btree block header.  Sibling pointers start out null;
 * the CRC-only fields (blkno, owner, uuid, lsn) are filled in only on
 * CRC-enabled filesystems.
 */
void
xfs_btree_init_block_int(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*buf,
	xfs_daddr_t		blkno,
	xfs_btnum_t		btnum,
	__u16			level,
	__u16			numrecs,
	__u64			owner,
	unsigned int		flags)	/* XFS_BTREE_LONG_PTRS selects the
					 * long-format header layout */
{
	int			crc = xfs_has_crc(mp);
	__u32			magic = xfs_btree_magic(crc, btnum);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (flags & XFS_BTREE_LONG_PTRS) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		/* owner is a 32 bit value on short blocks */
		__u32 __owner = (__u32)owner;

		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}

/* Initialize the btree block a buffer contains, short-format header. */
void
xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	xfs_btnum_t		btnum,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
				 btnum, level, numrecs, owner, 0);
}

/* Initialize a btree block using the cursor's format, type and owner. */
void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	__u64			owner;

	/*
	 * we can pull the owner from the cursor right now as the different
	 * owners align directly with the pointer size of the btree.  This may
	 * change in future, but is safe for current users of the generic btree
	 * code.
	 */
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		owner = cur->bc_ino.ip->i_ino;
	else
		owner = cur->bc_ag.pag->pag_agno;

	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
				 xfs_buf_daddr(bp), cur->bc_btnum, level,
				 numrecs, owner, cur->bc_flags);
}

/*
 * Return true if ptr is the last record in the btree and
 * we need to track updates to this record.  The decision
 * will be further refined in the update_lastrec method.
1220 */ 1221 STATIC int 1222 xfs_btree_is_lastrec( 1223 struct xfs_btree_cur *cur, 1224 struct xfs_btree_block *block, 1225 int level) 1226 { 1227 union xfs_btree_ptr ptr; 1228 1229 if (level > 0) 1230 return 0; 1231 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) 1232 return 0; 1233 1234 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1235 if (!xfs_btree_ptr_is_null(cur, &ptr)) 1236 return 0; 1237 return 1; 1238 } 1239 1240 STATIC void 1241 xfs_btree_buf_to_ptr( 1242 struct xfs_btree_cur *cur, 1243 struct xfs_buf *bp, 1244 union xfs_btree_ptr *ptr) 1245 { 1246 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1247 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1248 xfs_buf_daddr(bp))); 1249 else { 1250 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1251 xfs_buf_daddr(bp))); 1252 } 1253 } 1254 1255 STATIC void 1256 xfs_btree_set_refs( 1257 struct xfs_btree_cur *cur, 1258 struct xfs_buf *bp) 1259 { 1260 switch (cur->bc_btnum) { 1261 case XFS_BTNUM_BNO: 1262 case XFS_BTNUM_CNT: 1263 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); 1264 break; 1265 case XFS_BTNUM_INO: 1266 case XFS_BTNUM_FINO: 1267 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); 1268 break; 1269 case XFS_BTNUM_BMAP: 1270 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); 1271 break; 1272 case XFS_BTNUM_RMAP: 1273 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); 1274 break; 1275 case XFS_BTNUM_REFC: 1276 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); 1277 break; 1278 default: 1279 ASSERT(0); 1280 } 1281 } 1282 1283 int 1284 xfs_btree_get_buf_block( 1285 struct xfs_btree_cur *cur, 1286 const union xfs_btree_ptr *ptr, 1287 struct xfs_btree_block **block, 1288 struct xfs_buf **bpp) 1289 { 1290 struct xfs_mount *mp = cur->bc_mp; 1291 xfs_daddr_t d; 1292 int error; 1293 1294 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1295 if (error) 1296 return error; 1297 error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, 1298 0, bpp); 1299 if (error) 1300 return error; 1301 1302 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1303 
*block = XFS_BUF_TO_BLOCK(*bpp); 1304 return 0; 1305 } 1306 1307 /* 1308 * Read in the buffer at the given ptr and return the buffer and 1309 * the block pointer within the buffer. 1310 */ 1311 STATIC int 1312 xfs_btree_read_buf_block( 1313 struct xfs_btree_cur *cur, 1314 const union xfs_btree_ptr *ptr, 1315 int flags, 1316 struct xfs_btree_block **block, 1317 struct xfs_buf **bpp) 1318 { 1319 struct xfs_mount *mp = cur->bc_mp; 1320 xfs_daddr_t d; 1321 int error; 1322 1323 /* need to sort out how callers deal with failures first */ 1324 ASSERT(!(flags & XBF_TRYLOCK)); 1325 1326 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1327 if (error) 1328 return error; 1329 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1330 mp->m_bsize, flags, bpp, 1331 cur->bc_ops->buf_ops); 1332 if (error) 1333 return error; 1334 1335 xfs_btree_set_refs(cur, *bpp); 1336 *block = XFS_BUF_TO_BLOCK(*bpp); 1337 return 0; 1338 } 1339 1340 /* 1341 * Copy keys from one btree block to another. 1342 */ 1343 void 1344 xfs_btree_copy_keys( 1345 struct xfs_btree_cur *cur, 1346 union xfs_btree_key *dst_key, 1347 const union xfs_btree_key *src_key, 1348 int numkeys) 1349 { 1350 ASSERT(numkeys >= 0); 1351 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1352 } 1353 1354 /* 1355 * Copy records from one btree block to another. 1356 */ 1357 STATIC void 1358 xfs_btree_copy_recs( 1359 struct xfs_btree_cur *cur, 1360 union xfs_btree_rec *dst_rec, 1361 union xfs_btree_rec *src_rec, 1362 int numrecs) 1363 { 1364 ASSERT(numrecs >= 0); 1365 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1366 } 1367 1368 /* 1369 * Copy block pointers from one btree block to another. 
1370 */ 1371 void 1372 xfs_btree_copy_ptrs( 1373 struct xfs_btree_cur *cur, 1374 union xfs_btree_ptr *dst_ptr, 1375 const union xfs_btree_ptr *src_ptr, 1376 int numptrs) 1377 { 1378 ASSERT(numptrs >= 0); 1379 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); 1380 } 1381 1382 /* 1383 * Shift keys one index left/right inside a single btree block. 1384 */ 1385 STATIC void 1386 xfs_btree_shift_keys( 1387 struct xfs_btree_cur *cur, 1388 union xfs_btree_key *key, 1389 int dir, 1390 int numkeys) 1391 { 1392 char *dst_key; 1393 1394 ASSERT(numkeys >= 0); 1395 ASSERT(dir == 1 || dir == -1); 1396 1397 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1398 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1399 } 1400 1401 /* 1402 * Shift records one index left/right inside a single btree block. 1403 */ 1404 STATIC void 1405 xfs_btree_shift_recs( 1406 struct xfs_btree_cur *cur, 1407 union xfs_btree_rec *rec, 1408 int dir, 1409 int numrecs) 1410 { 1411 char *dst_rec; 1412 1413 ASSERT(numrecs >= 0); 1414 ASSERT(dir == 1 || dir == -1); 1415 1416 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1417 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1418 } 1419 1420 /* 1421 * Shift block pointers one index left/right inside a single btree block. 1422 */ 1423 STATIC void 1424 xfs_btree_shift_ptrs( 1425 struct xfs_btree_cur *cur, 1426 union xfs_btree_ptr *ptr, 1427 int dir, 1428 int numptrs) 1429 { 1430 char *dst_ptr; 1431 1432 ASSERT(numptrs >= 0); 1433 ASSERT(dir == 1 || dir == -1); 1434 1435 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); 1436 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); 1437 } 1438 1439 /* 1440 * Log key values from the btree block. 
1441 */ 1442 STATIC void 1443 xfs_btree_log_keys( 1444 struct xfs_btree_cur *cur, 1445 struct xfs_buf *bp, 1446 int first, 1447 int last) 1448 { 1449 1450 if (bp) { 1451 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1452 xfs_trans_log_buf(cur->bc_tp, bp, 1453 xfs_btree_key_offset(cur, first), 1454 xfs_btree_key_offset(cur, last + 1) - 1); 1455 } else { 1456 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1457 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1458 } 1459 } 1460 1461 /* 1462 * Log record values from the btree block. 1463 */ 1464 void 1465 xfs_btree_log_recs( 1466 struct xfs_btree_cur *cur, 1467 struct xfs_buf *bp, 1468 int first, 1469 int last) 1470 { 1471 1472 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1473 xfs_trans_log_buf(cur->bc_tp, bp, 1474 xfs_btree_rec_offset(cur, first), 1475 xfs_btree_rec_offset(cur, last + 1) - 1); 1476 1477 } 1478 1479 /* 1480 * Log block pointer fields from a btree block (nonleaf). 1481 */ 1482 STATIC void 1483 xfs_btree_log_ptrs( 1484 struct xfs_btree_cur *cur, /* btree cursor */ 1485 struct xfs_buf *bp, /* buffer containing btree block */ 1486 int first, /* index of first pointer to log */ 1487 int last) /* index of last pointer to log */ 1488 { 1489 1490 if (bp) { 1491 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1492 int level = xfs_btree_get_level(block); 1493 1494 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1495 xfs_trans_log_buf(cur->bc_tp, bp, 1496 xfs_btree_ptr_offset(cur, first, level), 1497 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1498 } else { 1499 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1500 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1501 } 1502 1503 } 1504 1505 /* 1506 * Log fields from a btree block header. 1507 */ 1508 void 1509 xfs_btree_log_block( 1510 struct xfs_btree_cur *cur, /* btree cursor */ 1511 struct xfs_buf *bp, /* buffer containing btree block */ 1512 uint32_t fields) /* mask of fields: XFS_BB_... 
*/ 1513 { 1514 int first; /* first byte offset logged */ 1515 int last; /* last byte offset logged */ 1516 static const short soffsets[] = { /* table of offsets (short) */ 1517 offsetof(struct xfs_btree_block, bb_magic), 1518 offsetof(struct xfs_btree_block, bb_level), 1519 offsetof(struct xfs_btree_block, bb_numrecs), 1520 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1521 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1522 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1523 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1524 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1525 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1526 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1527 XFS_BTREE_SBLOCK_CRC_LEN 1528 }; 1529 static const short loffsets[] = { /* table of offsets (long) */ 1530 offsetof(struct xfs_btree_block, bb_magic), 1531 offsetof(struct xfs_btree_block, bb_level), 1532 offsetof(struct xfs_btree_block, bb_numrecs), 1533 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1534 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1535 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1536 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1537 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1538 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1539 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1540 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1541 XFS_BTREE_LBLOCK_CRC_LEN 1542 }; 1543 1544 if (bp) { 1545 int nbits; 1546 1547 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 1548 /* 1549 * We don't log the CRC when updating a btree 1550 * block but instead recreate it during log 1551 * recovery. As the log buffers have checksums 1552 * of their own this is safe and avoids logging a crc 1553 * update in a lot of places. 
1554 */ 1555 if (fields == XFS_BB_ALL_BITS) 1556 fields = XFS_BB_ALL_BITS_CRC; 1557 nbits = XFS_BB_NUM_BITS_CRC; 1558 } else { 1559 nbits = XFS_BB_NUM_BITS; 1560 } 1561 xfs_btree_offsets(fields, 1562 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 1563 loffsets : soffsets, 1564 nbits, &first, &last); 1565 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1566 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1567 } else { 1568 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1569 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1570 } 1571 } 1572 1573 /* 1574 * Increment cursor by one record at the level. 1575 * For nonzero levels the leaf-ward information is untouched. 1576 */ 1577 int /* error */ 1578 xfs_btree_increment( 1579 struct xfs_btree_cur *cur, 1580 int level, 1581 int *stat) /* success/failure */ 1582 { 1583 struct xfs_btree_block *block; 1584 union xfs_btree_ptr ptr; 1585 struct xfs_buf *bp; 1586 int error; /* error return value */ 1587 int lev; 1588 1589 ASSERT(level < cur->bc_nlevels); 1590 1591 /* Read-ahead to the right at this level. */ 1592 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1593 1594 /* Get a pointer to the btree block. */ 1595 block = xfs_btree_get_block(cur, level, &bp); 1596 1597 #ifdef DEBUG 1598 error = xfs_btree_check_block(cur, block, level, bp); 1599 if (error) 1600 goto error0; 1601 #endif 1602 1603 /* We're done if we remain in the block after the increment. */ 1604 if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) 1605 goto out1; 1606 1607 /* Fail if we just went off the right edge of the tree. */ 1608 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1609 if (xfs_btree_ptr_is_null(cur, &ptr)) 1610 goto out0; 1611 1612 XFS_BTREE_STATS_INC(cur, increment); 1613 1614 /* 1615 * March up the tree incrementing pointers. 1616 * Stop when we don't go off the right edge of a block. 
1617 */ 1618 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1619 block = xfs_btree_get_block(cur, lev, &bp); 1620 1621 #ifdef DEBUG 1622 error = xfs_btree_check_block(cur, block, lev, bp); 1623 if (error) 1624 goto error0; 1625 #endif 1626 1627 if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) 1628 break; 1629 1630 /* Read-ahead the right block for the next loop. */ 1631 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1632 } 1633 1634 /* 1635 * If we went off the root then we are either seriously 1636 * confused or have the tree root in an inode. 1637 */ 1638 if (lev == cur->bc_nlevels) { 1639 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1640 goto out0; 1641 ASSERT(0); 1642 error = -EFSCORRUPTED; 1643 goto error0; 1644 } 1645 ASSERT(lev < cur->bc_nlevels); 1646 1647 /* 1648 * Now walk back down the tree, fixing up the cursor's buffer 1649 * pointers and key numbers. 1650 */ 1651 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1652 union xfs_btree_ptr *ptrp; 1653 1654 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1655 --lev; 1656 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1657 if (error) 1658 goto error0; 1659 1660 xfs_btree_setbuf(cur, lev, bp); 1661 cur->bc_levels[lev].ptr = 1; 1662 } 1663 out1: 1664 *stat = 1; 1665 return 0; 1666 1667 out0: 1668 *stat = 0; 1669 return 0; 1670 1671 error0: 1672 return error; 1673 } 1674 1675 /* 1676 * Decrement cursor by one record at the level. 1677 * For nonzero levels the leaf-ward information is untouched. 1678 */ 1679 int /* error */ 1680 xfs_btree_decrement( 1681 struct xfs_btree_cur *cur, 1682 int level, 1683 int *stat) /* success/failure */ 1684 { 1685 struct xfs_btree_block *block; 1686 struct xfs_buf *bp; 1687 int error; /* error return value */ 1688 int lev; 1689 union xfs_btree_ptr ptr; 1690 1691 ASSERT(level < cur->bc_nlevels); 1692 1693 /* Read-ahead to the left at this level. 
*/ 1694 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1695 1696 /* We're done if we remain in the block after the decrement. */ 1697 if (--cur->bc_levels[level].ptr > 0) 1698 goto out1; 1699 1700 /* Get a pointer to the btree block. */ 1701 block = xfs_btree_get_block(cur, level, &bp); 1702 1703 #ifdef DEBUG 1704 error = xfs_btree_check_block(cur, block, level, bp); 1705 if (error) 1706 goto error0; 1707 #endif 1708 1709 /* Fail if we just went off the left edge of the tree. */ 1710 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1711 if (xfs_btree_ptr_is_null(cur, &ptr)) 1712 goto out0; 1713 1714 XFS_BTREE_STATS_INC(cur, decrement); 1715 1716 /* 1717 * March up the tree decrementing pointers. 1718 * Stop when we don't go off the left edge of a block. 1719 */ 1720 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1721 if (--cur->bc_levels[lev].ptr > 0) 1722 break; 1723 /* Read-ahead the left block for the next loop. */ 1724 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1725 } 1726 1727 /* 1728 * If we went off the root then we are seriously confused. 1729 * or the root of the tree is in an inode. 1730 */ 1731 if (lev == cur->bc_nlevels) { 1732 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1733 goto out0; 1734 ASSERT(0); 1735 error = -EFSCORRUPTED; 1736 goto error0; 1737 } 1738 ASSERT(lev < cur->bc_nlevels); 1739 1740 /* 1741 * Now walk back down the tree, fixing up the cursor's buffer 1742 * pointers and key numbers. 
1743 */ 1744 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1745 union xfs_btree_ptr *ptrp; 1746 1747 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1748 --lev; 1749 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1750 if (error) 1751 goto error0; 1752 xfs_btree_setbuf(cur, lev, bp); 1753 cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); 1754 } 1755 out1: 1756 *stat = 1; 1757 return 0; 1758 1759 out0: 1760 *stat = 0; 1761 return 0; 1762 1763 error0: 1764 return error; 1765 } 1766 1767 int 1768 xfs_btree_lookup_get_block( 1769 struct xfs_btree_cur *cur, /* btree cursor */ 1770 int level, /* level in the btree */ 1771 const union xfs_btree_ptr *pp, /* ptr to btree block */ 1772 struct xfs_btree_block **blkp) /* return btree block */ 1773 { 1774 struct xfs_buf *bp; /* buffer pointer for btree block */ 1775 xfs_daddr_t daddr; 1776 int error = 0; 1777 1778 /* special case the root block if in an inode */ 1779 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 1780 (level == cur->bc_nlevels - 1)) { 1781 *blkp = xfs_btree_get_iroot(cur); 1782 return 0; 1783 } 1784 1785 /* 1786 * If the old buffer at this level for the disk address we are 1787 * looking for re-use it. 1788 * 1789 * Otherwise throw it away and get a new one. 1790 */ 1791 bp = cur->bc_levels[level].bp; 1792 error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); 1793 if (error) 1794 return error; 1795 if (bp && xfs_buf_daddr(bp) == daddr) { 1796 *blkp = XFS_BUF_TO_BLOCK(bp); 1797 return 0; 1798 } 1799 1800 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1801 if (error) 1802 return error; 1803 1804 /* Check the inode owner since the verifiers don't. */ 1805 if (xfs_has_crc(cur->bc_mp) && 1806 !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && 1807 (cur->bc_flags & XFS_BTREE_LONG_PTRS) && 1808 be64_to_cpu((*blkp)->bb_u.l.bb_owner) != 1809 cur->bc_ino.ip->i_ino) 1810 goto out_bad; 1811 1812 /* Did we get the level we were looking for? 
*/ 1813 if (be16_to_cpu((*blkp)->bb_level) != level) 1814 goto out_bad; 1815 1816 /* Check that internal nodes have at least one record. */ 1817 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1818 goto out_bad; 1819 1820 xfs_btree_setbuf(cur, level, bp); 1821 return 0; 1822 1823 out_bad: 1824 *blkp = NULL; 1825 xfs_buf_mark_corrupt(bp); 1826 xfs_trans_brelse(cur->bc_tp, bp); 1827 return -EFSCORRUPTED; 1828 } 1829 1830 /* 1831 * Get current search key. For level 0 we don't actually have a key 1832 * structure so we make one up from the record. For all other levels 1833 * we just return the right key. 1834 */ 1835 STATIC union xfs_btree_key * 1836 xfs_lookup_get_search_key( 1837 struct xfs_btree_cur *cur, 1838 int level, 1839 int keyno, 1840 struct xfs_btree_block *block, 1841 union xfs_btree_key *kp) 1842 { 1843 if (level == 0) { 1844 cur->bc_ops->init_key_from_rec(kp, 1845 xfs_btree_rec_addr(cur, keyno, block)); 1846 return kp; 1847 } 1848 1849 return xfs_btree_key_addr(cur, keyno, block); 1850 } 1851 1852 /* 1853 * Lookup the record. The cursor is made to point to it, based on dir. 1854 * stat is set to 0 if can't find any such record, 1 for success. 1855 */ 1856 int /* error */ 1857 xfs_btree_lookup( 1858 struct xfs_btree_cur *cur, /* btree cursor */ 1859 xfs_lookup_t dir, /* <=, ==, or >= */ 1860 int *stat) /* success/failure */ 1861 { 1862 struct xfs_btree_block *block; /* current btree block */ 1863 int64_t diff; /* difference for the current key */ 1864 int error; /* error return value */ 1865 int keyno; /* current key number */ 1866 int level; /* level in the btree */ 1867 union xfs_btree_ptr *pp; /* ptr to btree block */ 1868 union xfs_btree_ptr ptr; /* ptr to btree block */ 1869 1870 XFS_BTREE_STATS_INC(cur, lookup); 1871 1872 /* No such thing as a zero-level tree. 
*/ 1873 if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) 1874 return -EFSCORRUPTED; 1875 1876 block = NULL; 1877 keyno = 0; 1878 1879 /* initialise start pointer from cursor */ 1880 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 1881 pp = &ptr; 1882 1883 /* 1884 * Iterate over each level in the btree, starting at the root. 1885 * For each level above the leaves, find the key we need, based 1886 * on the lookup record, then follow the corresponding block 1887 * pointer down to the next level. 1888 */ 1889 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { 1890 /* Get the block we need to do the lookup on. */ 1891 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 1892 if (error) 1893 goto error0; 1894 1895 if (diff == 0) { 1896 /* 1897 * If we already had a key match at a higher level, we 1898 * know we need to use the first entry in this block. 1899 */ 1900 keyno = 1; 1901 } else { 1902 /* Otherwise search this block. Do a binary search. */ 1903 1904 int high; /* high entry number */ 1905 int low; /* low entry number */ 1906 1907 /* Set low and high entry numbers, 1-based. */ 1908 low = 1; 1909 high = xfs_btree_get_numrecs(block); 1910 if (!high) { 1911 /* Block is empty, must be an empty leaf. */ 1912 if (level != 0 || cur->bc_nlevels != 1) { 1913 XFS_CORRUPTION_ERROR(__func__, 1914 XFS_ERRLEVEL_LOW, 1915 cur->bc_mp, block, 1916 sizeof(*block)); 1917 return -EFSCORRUPTED; 1918 } 1919 1920 cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; 1921 *stat = 0; 1922 return 0; 1923 } 1924 1925 /* Binary search the block. */ 1926 while (low <= high) { 1927 union xfs_btree_key key; 1928 union xfs_btree_key *kp; 1929 1930 XFS_BTREE_STATS_INC(cur, compare); 1931 1932 /* keyno is average of low and high. 
*/ 1933 keyno = (low + high) >> 1; 1934 1935 /* Get current search key */ 1936 kp = xfs_lookup_get_search_key(cur, level, 1937 keyno, block, &key); 1938 1939 /* 1940 * Compute difference to get next direction: 1941 * - less than, move right 1942 * - greater than, move left 1943 * - equal, we're done 1944 */ 1945 diff = cur->bc_ops->key_diff(cur, kp); 1946 if (diff < 0) 1947 low = keyno + 1; 1948 else if (diff > 0) 1949 high = keyno - 1; 1950 else 1951 break; 1952 } 1953 } 1954 1955 /* 1956 * If there are more levels, set up for the next level 1957 * by getting the block number and filling in the cursor. 1958 */ 1959 if (level > 0) { 1960 /* 1961 * If we moved left, need the previous key number, 1962 * unless there isn't one. 1963 */ 1964 if (diff > 0 && --keyno < 1) 1965 keyno = 1; 1966 pp = xfs_btree_ptr_addr(cur, keyno, block); 1967 1968 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 1969 if (error) 1970 goto error0; 1971 1972 cur->bc_levels[level].ptr = keyno; 1973 } 1974 } 1975 1976 /* Done with the search. See if we need to adjust the results. */ 1977 if (dir != XFS_LOOKUP_LE && diff < 0) { 1978 keyno++; 1979 /* 1980 * If ge search and we went off the end of the block, but it's 1981 * not the last block, we're in the wrong block. 1982 */ 1983 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1984 if (dir == XFS_LOOKUP_GE && 1985 keyno > xfs_btree_get_numrecs(block) && 1986 !xfs_btree_ptr_is_null(cur, &ptr)) { 1987 int i; 1988 1989 cur->bc_levels[0].ptr = keyno; 1990 error = xfs_btree_increment(cur, 0, &i); 1991 if (error) 1992 goto error0; 1993 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) 1994 return -EFSCORRUPTED; 1995 *stat = 1; 1996 return 0; 1997 } 1998 } else if (dir == XFS_LOOKUP_LE && diff > 0) 1999 keyno--; 2000 cur->bc_levels[0].ptr = keyno; 2001 2002 /* Return if we succeeded or not. 
*/ 2003 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2004 *stat = 0; 2005 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2006 *stat = 1; 2007 else 2008 *stat = 0; 2009 return 0; 2010 2011 error0: 2012 return error; 2013 } 2014 2015 /* Find the high key storage area from a regular key. */ 2016 union xfs_btree_key * 2017 xfs_btree_high_key_from_key( 2018 struct xfs_btree_cur *cur, 2019 union xfs_btree_key *key) 2020 { 2021 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2022 return (union xfs_btree_key *)((char *)key + 2023 (cur->bc_ops->key_len / 2)); 2024 } 2025 2026 /* Determine the low (and high if overlapped) keys of a leaf block */ 2027 STATIC void 2028 xfs_btree_get_leaf_keys( 2029 struct xfs_btree_cur *cur, 2030 struct xfs_btree_block *block, 2031 union xfs_btree_key *key) 2032 { 2033 union xfs_btree_key max_hkey; 2034 union xfs_btree_key hkey; 2035 union xfs_btree_rec *rec; 2036 union xfs_btree_key *high; 2037 int n; 2038 2039 rec = xfs_btree_rec_addr(cur, 1, block); 2040 cur->bc_ops->init_key_from_rec(key, rec); 2041 2042 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2043 2044 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2045 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2046 rec = xfs_btree_rec_addr(cur, n, block); 2047 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2048 if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) 2049 > 0) 2050 max_hkey = hkey; 2051 } 2052 2053 high = xfs_btree_high_key_from_key(cur, key); 2054 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2055 } 2056 } 2057 2058 /* Determine the low (and high if overlapped) keys of a node block */ 2059 STATIC void 2060 xfs_btree_get_node_keys( 2061 struct xfs_btree_cur *cur, 2062 struct xfs_btree_block *block, 2063 union xfs_btree_key *key) 2064 { 2065 union xfs_btree_key *hkey; 2066 union xfs_btree_key *max_hkey; 2067 union xfs_btree_key *high; 2068 int n; 2069 2070 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2071 memcpy(key, xfs_btree_key_addr(cur, 1, block), 
2072 cur->bc_ops->key_len / 2); 2073 2074 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2075 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2076 hkey = xfs_btree_high_key_addr(cur, n, block); 2077 if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) 2078 max_hkey = hkey; 2079 } 2080 2081 high = xfs_btree_high_key_from_key(cur, key); 2082 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2083 } else { 2084 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2085 cur->bc_ops->key_len); 2086 } 2087 } 2088 2089 /* Derive the keys for any btree block. */ 2090 void 2091 xfs_btree_get_keys( 2092 struct xfs_btree_cur *cur, 2093 struct xfs_btree_block *block, 2094 union xfs_btree_key *key) 2095 { 2096 if (be16_to_cpu(block->bb_level) == 0) 2097 xfs_btree_get_leaf_keys(cur, block, key); 2098 else 2099 xfs_btree_get_node_keys(cur, block, key); 2100 } 2101 2102 /* 2103 * Decide if we need to update the parent keys of a btree block. For 2104 * a standard btree this is only necessary if we're updating the first 2105 * record/key. For an overlapping btree, we must always update the 2106 * keys because the highest key can be in any of the records or keys 2107 * in the block. 2108 */ 2109 static inline bool 2110 xfs_btree_needs_key_update( 2111 struct xfs_btree_cur *cur, 2112 int ptr) 2113 { 2114 return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; 2115 } 2116 2117 /* 2118 * Update the low and high parent keys of the given level, progressing 2119 * towards the root. If force_all is false, stop if the keys for a given 2120 * level do not need updating. 
2121 */ 2122 STATIC int 2123 __xfs_btree_updkeys( 2124 struct xfs_btree_cur *cur, 2125 int level, 2126 struct xfs_btree_block *block, 2127 struct xfs_buf *bp0, 2128 bool force_all) 2129 { 2130 union xfs_btree_key key; /* keys from current level */ 2131 union xfs_btree_key *lkey; /* keys from the next level up */ 2132 union xfs_btree_key *hkey; 2133 union xfs_btree_key *nlkey; /* keys from the next level up */ 2134 union xfs_btree_key *nhkey; 2135 struct xfs_buf *bp; 2136 int ptr; 2137 2138 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2139 2140 /* Exit if there aren't any parent levels to update. */ 2141 if (level + 1 >= cur->bc_nlevels) 2142 return 0; 2143 2144 trace_xfs_btree_updkeys(cur, level, bp0); 2145 2146 lkey = &key; 2147 hkey = xfs_btree_high_key_from_key(cur, lkey); 2148 xfs_btree_get_keys(cur, block, lkey); 2149 for (level++; level < cur->bc_nlevels; level++) { 2150 #ifdef DEBUG 2151 int error; 2152 #endif 2153 block = xfs_btree_get_block(cur, level, &bp); 2154 trace_xfs_btree_updkeys(cur, level, bp); 2155 #ifdef DEBUG 2156 error = xfs_btree_check_block(cur, block, level, bp); 2157 if (error) 2158 return error; 2159 #endif 2160 ptr = cur->bc_levels[level].ptr; 2161 nlkey = xfs_btree_key_addr(cur, ptr, block); 2162 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2163 if (!force_all && 2164 !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || 2165 cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) 2166 break; 2167 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2168 xfs_btree_log_keys(cur, bp, ptr, ptr); 2169 if (level + 1 >= cur->bc_nlevels) 2170 break; 2171 xfs_btree_get_node_keys(cur, block, lkey); 2172 } 2173 2174 return 0; 2175 } 2176 2177 /* Update all the keys from some level in cursor back to the root. 
*/ 2178 STATIC int 2179 xfs_btree_updkeys_force( 2180 struct xfs_btree_cur *cur, 2181 int level) 2182 { 2183 struct xfs_buf *bp; 2184 struct xfs_btree_block *block; 2185 2186 block = xfs_btree_get_block(cur, level, &bp); 2187 return __xfs_btree_updkeys(cur, level, block, bp, true); 2188 } 2189 2190 /* 2191 * Update the parent keys of the given level, progressing towards the root. 2192 */ 2193 STATIC int 2194 xfs_btree_update_keys( 2195 struct xfs_btree_cur *cur, 2196 int level) 2197 { 2198 struct xfs_btree_block *block; 2199 struct xfs_buf *bp; 2200 union xfs_btree_key *kp; 2201 union xfs_btree_key key; 2202 int ptr; 2203 2204 ASSERT(level >= 0); 2205 2206 block = xfs_btree_get_block(cur, level, &bp); 2207 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) 2208 return __xfs_btree_updkeys(cur, level, block, bp, false); 2209 2210 /* 2211 * Go up the tree from this level toward the root. 2212 * At each level, update the key value to the value input. 2213 * Stop when we reach a level where the cursor isn't pointing 2214 * at the first entry in the block. 2215 */ 2216 xfs_btree_get_keys(cur, block, &key); 2217 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2218 #ifdef DEBUG 2219 int error; 2220 #endif 2221 block = xfs_btree_get_block(cur, level, &bp); 2222 #ifdef DEBUG 2223 error = xfs_btree_check_block(cur, block, level, bp); 2224 if (error) 2225 return error; 2226 #endif 2227 ptr = cur->bc_levels[level].ptr; 2228 kp = xfs_btree_key_addr(cur, ptr, block); 2229 xfs_btree_copy_keys(cur, kp, &key, 1); 2230 xfs_btree_log_keys(cur, bp, ptr, ptr); 2231 } 2232 2233 return 0; 2234 } 2235 2236 /* 2237 * Update the record referred to by cur to the value in the 2238 * given record. This either works (return 0) or gets an 2239 * EFSCORRUPTED error. 
2240 */ 2241 int 2242 xfs_btree_update( 2243 struct xfs_btree_cur *cur, 2244 union xfs_btree_rec *rec) 2245 { 2246 struct xfs_btree_block *block; 2247 struct xfs_buf *bp; 2248 int error; 2249 int ptr; 2250 union xfs_btree_rec *rp; 2251 2252 /* Pick up the current block. */ 2253 block = xfs_btree_get_block(cur, 0, &bp); 2254 2255 #ifdef DEBUG 2256 error = xfs_btree_check_block(cur, block, 0, bp); 2257 if (error) 2258 goto error0; 2259 #endif 2260 /* Get the address of the rec to be updated. */ 2261 ptr = cur->bc_levels[0].ptr; 2262 rp = xfs_btree_rec_addr(cur, ptr, block); 2263 2264 /* Fill in the new contents and log them. */ 2265 xfs_btree_copy_recs(cur, rp, rec, 1); 2266 xfs_btree_log_recs(cur, bp, ptr, ptr); 2267 2268 /* 2269 * If we are tracking the last record in the tree and 2270 * we are at the far right edge of the tree, update it. 2271 */ 2272 if (xfs_btree_is_lastrec(cur, block, 0)) { 2273 cur->bc_ops->update_lastrec(cur, block, rec, 2274 ptr, LASTREC_UPDATE); 2275 } 2276 2277 /* Pass new key value up to our parent. */ 2278 if (xfs_btree_needs_key_update(cur, ptr)) { 2279 error = xfs_btree_update_keys(cur, 0); 2280 if (error) 2281 goto error0; 2282 } 2283 2284 return 0; 2285 2286 error0: 2287 return error; 2288 } 2289 2290 /* 2291 * Move 1 record left from cur/level if possible. 2292 * Update cur to reflect the new path. 
 */
STATIC int					/* error */
xfs_btree_lshift(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	int			lrecs;		/* left record count */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
	int			rrecs;		/* right record count */
	union xfs_btree_ptr	lptr;		/* left btree pointer */
	union xfs_btree_key	*rkp = NULL;	/* right btree key */
	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
	int			error;		/* error return value */
	int			i;

	/* An inode-rooted tree has no sibling at the root level. */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1)
		goto out0;

	/* Set up variables for this block as "right". */
	right = xfs_btree_get_block(cur, level, &rbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, right, level, rbp);
	if (error)
		goto error0;
#endif

	/* If we've got no left sibling then we can't shift an entry left. */
	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &lptr))
		goto out0;

	/*
	 * If the cursor entry is the one that would be moved, don't
	 * do it... it's too complicated.
	 */
	if (cur->bc_levels[level].ptr <= 1)
		goto out0;

	/* Set up the left neighbor as "left". */
	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
	if (error)
		goto error0;

	/* If it's full, it can't take another entry. */
	lrecs = xfs_btree_get_numrecs(left);
	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
		goto out0;

	rrecs = xfs_btree_get_numrecs(right);

	/*
	 * We add one entry to the left side and remove one for the right side.
	 * Account for it here, the changes will be updated on disk and logged
	 * later.
	 */
	lrecs++;
	rrecs--;

	XFS_BTREE_STATS_INC(cur, lshift);
	XFS_BTREE_STATS_ADD(cur, moves, 1);

	/*
	 * If non-leaf, copy a key and a ptr to the left block.
	 * Log the changes to the left block.
	 */
	if (level > 0) {
		/* It's a non-leaf.  Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */

		/* lrecs already counts the incoming entry, so it is the
		 * destination slot at the end of the left block. */
		lkp = xfs_btree_key_addr(cur, lrecs, left);
		rkp = xfs_btree_key_addr(cur, 1, right);

		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
		if (error)
			goto error0;

		xfs_btree_copy_keys(cur, lkp, rkp, 1);
		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);

		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->keys_inorder(cur,
			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
	} else {
		/* It's a leaf.  Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, 1);
		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->recs_inorder(cur,
			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
	}

	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Slide the contents of right down one entry.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
	if (level > 0) {
		/* It's a nonleaf. operate on keys and ptrs */
		for (i = 0; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
			if (error)
				goto error0;
		}

		xfs_btree_shift_keys(cur,
				xfs_btree_key_addr(cur, 2, right),
				-1, rrecs);
		xfs_btree_shift_ptrs(cur,
				xfs_btree_ptr_addr(cur, 2, right),
				-1, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
	} else {
		/* It's a leaf. operate on records */
		xfs_btree_shift_recs(cur,
			xfs_btree_rec_addr(cur, 2, right),
			-1, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);
	}

	/*
	 * Using a temporary cursor, update the parent key values of the
	 * block on the left.
	 */
	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			/*
			 * NOTE(review): tcur has been allocated by this point
			 * but goto error0 skips the del_cursor in error1, so
			 * this corruption path looks like a cursor leak --
			 * confirm against upstream before changing.
			 */
			goto error0;
		}

		error = xfs_btree_decrement(tcur, level, &i);
		if (error)
			goto error1;

		/* Update the parent high keys of the left block, if needed. */
		error = xfs_btree_update_keys(tcur, level);
		if (error)
			goto error1;

		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	}

	/* Update the parent keys of the right block. */
	error = xfs_btree_update_keys(cur, level);
	if (error)
		goto error0;

	/* Slide the cursor value left one. */
	cur->bc_levels[level].ptr--;

	*stat = 1;
	return 0;

out0:
	/* Shift not possible; not an error. */
	*stat = 0;
	return 0;

error0:
	return error;

error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Move 1 record right from cur/level if possible.
 * Update cur to reflect the new path.
 */
STATIC int					/* error */
xfs_btree_rshift(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
	union xfs_btree_ptr	rptr;		/* right block pointer */
	union xfs_btree_key	*rkp;		/* right btree key */
	int			rrecs;		/* right record count */
	int			lrecs;		/* left record count */
	int			error;		/* error return value */
	int			i;		/* loop counter */

	/* An inode-rooted tree has no sibling at the root level. */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level == cur->bc_nlevels - 1))
		goto out0;

	/* Set up variables for this block as "left". */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	/* If we've got no right sibling then we can't shift an entry right. */
	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		goto out0;

	/*
	 * If the cursor entry is the one that would be moved, don't
	 * do it... it's too complicated.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	if (cur->bc_levels[level].ptr >= lrecs)
		goto out0;

	/* Set up the right neighbor as "right". */
	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
	if (error)
		goto error0;

	/* If it's full, it can't take another entry. */
	rrecs = xfs_btree_get_numrecs(right);
	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, rshift);
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/*
	 * Make a hole at the start of the right neighbor block, then
	 * copy the last left block entry to the hole.
	 */
	if (level > 0) {
		/* It's a nonleaf. make a hole in the keys and ptrs */
		union xfs_btree_key	*lkp;
		union xfs_btree_ptr	*lpp;
		union xfs_btree_ptr	*rpp;

		lkp = xfs_btree_key_addr(cur, lrecs, left);
		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		/* Validate every right-block ptr before shifting it. */
		for (i = rrecs - 1; i >= 0; i--) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
			if (error)
				goto error0;
		}

		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);

		error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
		if (error)
			goto error0;

		/* Now put the new data in, and log it. */
		xfs_btree_copy_keys(cur, rkp, lkp, 1);
		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);

		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);

		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
			xfs_btree_key_addr(cur, 2, right)));
	} else {
		/* It's a leaf. make a hole in the records */
		union xfs_btree_rec	*lrp;
		union xfs_btree_rec	*rrp;

		lrp = xfs_btree_rec_addr(cur, lrecs, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_shift_recs(cur, rrp, 1, rrecs);

		/* Now put the new data in, and log it. */
		xfs_btree_copy_recs(cur, rrp, lrp, 1);
		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
	}

	/*
	 * Decrement and log left's numrecs, bump and log right's numrecs.
	 */
	xfs_btree_set_numrecs(left, --lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, ++rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Using a temporary cursor, update the parent key values of the
	 * block on the right.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;
	i = xfs_btree_lastrec(tcur, level);
	if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
		error = -EFSCORRUPTED;
		/*
		 * NOTE(review): as in xfs_btree_lshift, goto error0 here
		 * skips del_cursor(tcur) even though tcur was just created
		 * -- looks like a cursor leak on this corruption path;
		 * confirm against upstream before changing.
		 */
		goto error0;
	}

	error = xfs_btree_increment(tcur, level, &i);
	if (error)
		goto error1;

	/* Update the parent high keys of the left block, if needed. */
	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error1;
	}

	/* Update the parent keys of the right block. */
	error = xfs_btree_update_keys(tcur, level);
	if (error)
		goto error1;

	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

	*stat = 1;
	return 0;

out0:
	/* Shift not possible; not an error. */
	*stat = 0;
	return 0;

error0:
	return error;

error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Split cur/level block in half.
 * Return new block number and the key to its first
 * record (to be inserted into parent).
 */
STATIC int					/* error */
__xfs_btree_split(
	struct xfs_btree_cur	*cur,
	int			level,
	union xfs_btree_ptr	*ptrp,
	union xfs_btree_key	*key,
	struct xfs_btree_cur	**curp,
	int			*stat)		/* success/failure */
{
	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
	struct xfs_btree_block	*rrblock;	/* right-right btree block */
	int			lrecs;
	int			rrecs;
	int			src_index;
	int			error;		/* error return value */
	int			i;

	XFS_BTREE_STATS_INC(cur, split);

	/* Set up left block (current one). */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	xfs_btree_buf_to_ptr(cur, lbp, &lptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block as "right". */
	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
	if (error)
		goto error0;

	/* Fill in the btree header for the new right block. */
	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);

	/*
	 * Split the entries between the old and the new block evenly.
	 * Make sure that if there's an odd number of entries now, that
	 * each new block will have the same number of entries.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	rrecs = lrecs / 2;
	/* Bias the extra (odd) entry toward whichever half the cursor is
	 * NOT about to insert into, so both halves end up the same size. */
	if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
		rrecs++;
	src_index = (lrecs - rrecs + 1);

	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/* Adjust numrecs for the later get_*_keys() calls. */
	lrecs -= rrecs;
	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);

	/*
	 * Copy btree block entries from the left block over to the
	 * new block, the right. Update the right block and log the
	 * changes.
	 */
	if (level > 0) {
		/* It's a non-leaf.  Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, src_index, left);
		lpp = xfs_btree_ptr_addr(cur, src_index, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		/*
		 * NOTE(review): lpp already points at entry src_index, yet
		 * the loop index also starts at src_index, so this checks
		 * lpp[src_index..rrecs-1] rather than lpp[0..rrecs-1].  The
		 * range looks off by src_index; confirm intent against the
		 * upstream history before changing (DEBUG-only check).
		 */
		for (i = src_index; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}

		/* Copy the keys & pointers to the new block. */
		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_node_keys(cur, right, key);
	} else {
		/* It's a leaf.  Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, src_index, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		/* Copy records to the new block. */
		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_leaf_keys(cur, right, key);
	}

	/*
	 * Find the left block number by looking in the buffer.
	 * Adjust sibling pointers.
	 */
	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);

	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/*
	 * If there's a block to the new block's right, make that block
	 * point back to right instead of to left.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
		error = xfs_btree_read_buf_block(cur, &rrptr,
							0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Update the parent high keys of the left block, if needed. */
	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the cursor is really in the right block, move it there.
	 * If it's just pointing past the last entry in left, then we'll
	 * insert there, so don't change anything in that case.
	 */
	if (cur->bc_levels[level].ptr > lrecs + 1) {
		xfs_btree_setbuf(cur, level, rbp);
		cur->bc_levels[level].ptr -= lrecs;
	}
	/*
	 * If there are more levels, we'll need another cursor which refers
	 * the right block, no matter where this cursor was.
	 */
	if (level + 1 < cur->bc_nlevels) {
		error = xfs_btree_dup_cursor(cur, curp);
		if (error)
			goto error0;
		(*curp)->bc_levels[level + 1].ptr++;
	}
	*ptrp = rptr;
	*stat = 1;
	return 0;
out0:
	/* Block allocation failed benignly; caller sees *stat == 0. */
	*stat = 0;
	return 0;

error0:
	return error;
}

#ifdef __KERNEL__
/* Argument marshalling for running __xfs_btree_split on a workqueue. */
struct xfs_btree_split_args {
	struct xfs_btree_cur	*cur;
	int			level;
	union xfs_btree_ptr	*ptrp;
	union xfs_btree_key	*key;
	struct xfs_btree_cur	**curp;
	int			*stat;		/* success/failure */
	int			result;
	bool			kswapd;	/* allocation in kswapd context */
	struct completion	*done;
	struct work_struct	work;
};

/*
 * Stack switching interfaces for allocation
 */
static void
xfs_btree_split_worker(
	struct work_struct	*work)
{
	struct xfs_btree_split_args	*args = container_of(work,
						struct xfs_btree_split_args, work);
	unsigned long		pflags;
	unsigned long		new_pflags = 0;

	/*
	 * we are in a transaction context here, but may also be doing work
	 * in kswapd context, and hence we may need to inherit that state
	 * temporarily to ensure that we don't block waiting for memory reclaim
	 * in any way.
	 */
	if (args->kswapd)
		new_pflags |= PF_MEMALLOC | PF_KSWAPD;

	current_set_flags_nested(&pflags, new_pflags);
	xfs_trans_set_context(args->cur->bc_tp);

	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
					 args->key, args->curp, args->stat);

	xfs_trans_clear_context(args->cur->bc_tp);
	current_restore_flags_nested(&pflags, new_pflags);

	/*
	 * Do not access args after complete() has run here. We don't own args
	 * and the owner may run and free args before we return here.
	 */
	complete(args->done);

}

/*
 * BMBT split requests often come in with little stack to work on. Push
 * them off to a worker thread so there is lots of stack to use. For the other
 * btree types, just call directly to avoid the context switch overhead here.
 */
STATIC int					/* error */
xfs_btree_split(
	struct xfs_btree_cur	*cur,
	int			level,
	union xfs_btree_ptr	*ptrp,
	union xfs_btree_key	*key,
	struct xfs_btree_cur	**curp,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_split_args	args;
	DECLARE_COMPLETION_ONSTACK(done);

	/* Only the BMAP btree takes the workqueue detour. */
	if (cur->bc_btnum != XFS_BTNUM_BMAP)
		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);

	args.cur = cur;
	args.level = level;
	args.ptrp = ptrp;
	args.key = key;
	args.curp = curp;
	args.stat = stat;
	args.done = &done;
	args.kswapd = current_is_kswapd();
	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
	queue_work(xfs_alloc_wq, &args.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&args.work);
	return args.result;
}
#else
#define xfs_btree_split	__xfs_btree_split
#endif /* __KERNEL__ */


/*
 * Copy the old inode root contents into a real block and make the
 * broot point to it.
 */
int						/* error */
xfs_btree_new_iroot(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	struct xfs_buf		*cbp;		/* buffer for cblock */
	struct xfs_btree_block	*block;		/* btree block */
	struct xfs_btree_block	*cblock;	/* child btree block */
	union xfs_btree_key	*ckp;		/* child key pointer */
	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
	union xfs_btree_key	*kp;		/* pointer to btree key */
	union xfs_btree_ptr	*pp;		/* pointer to block addr */
	union xfs_btree_ptr	nptr;		/* new block addr */
	int			level;		/* btree level */
	int			error;		/* error return code */
	int			i;		/* loop counter */

	XFS_BTREE_STATS_INC(cur, newroot);

	/* Only valid for btrees whose root lives in the inode fork. */
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);

	level = cur->bc_nlevels - 1;

	block = xfs_btree_get_iroot(cur);
	pp = xfs_btree_ptr_addr(cur, 1, block);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		return 0;

	XFS_BTREE_STATS_INC(cur, alloc);

	/* Copy the root into a real block. */
	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
	if (error)
		goto error0;

	/*
	 * we can't just memcpy() the root in for CRC enabled btree blocks.
	 * In that case have to also ensure the blkno remains correct
	 */
	memcpy(cblock, block, xfs_btree_block_len(cur));
	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
			cblock->bb_u.l.bb_blkno = bno;
		else
			cblock->bb_u.s.bb_blkno = bno;
	}

	/* The old root becomes a one-entry parent of the copied child. */
	be16_add_cpu(&block->bb_level, 1);
	xfs_btree_set_numrecs(block, 1);
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	cur->bc_levels[level + 1].ptr = 1;

	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	/* cblock->bb_numrecs still holds the old root's record count,
	 * since the header was memcpy()d above. */
	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));

	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
		if (error)
			goto error0;
	}

	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));

	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
	if (error)
		goto error0;

	/* Make the inode root point at the newly copied child block. */
	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);

	/* Shrink the inode-fork root down to its single remaining record. */
	xfs_iroot_realloc(cur->bc_ino.ip,
			  1 - xfs_btree_get_numrecs(cblock),
			  cur->bc_ino.whichfork);

	xfs_btree_setbuf(cur, level, cbp);

	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));

	/* Caller must log the inode core and broot with these flags. */
	*logflags |=
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
	*stat = 1;
	return 0;
error0:
	return error;
}

/*
 * Allocate a new root block, fill it in.
 */
STATIC int				/* error */
xfs_btree_new_root(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* one half of the old root block */
	struct xfs_buf		*bp;	/* buffer containing block */
	int			error;	/* error return value */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	struct xfs_buf		*nbp;	/* new (root) buffer */
	struct xfs_btree_block	*new;	/* new (root) btree block */
	int			nptr;	/* new value for key index, 1 or 2 */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rptr;
	union xfs_btree_ptr	lptr;

	XFS_BTREE_STATS_INC(cur, newroot);

	/* initialise our start point from the cursor */
	cur->bc_ops->init_ptr_from_cur(cur, &rptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block. */
	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
	if (error)
		goto error0;

	/* Set the root in the holding structure  increasing the level by 1. */
	cur->bc_ops->set_root(cur, &lptr, 1);

	/*
	 * At the previous root level there are now two blocks: the old root,
	 * and the new block generated when it was split. We don't know which
	 * one the cursor is pointing at, so we set up variables "left" and
	 * "right" for each case.
	 */
	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
	if (error)
		goto error0;
#endif

	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/* Our block is left, pick up the right block. */
		lbp = bp;
		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
		left = block;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;
		bp = rbp;
		/* nptr records which entry of the new root the cursor is at. */
		nptr = 1;
	} else {
		/* Our block is right, pick up the left block. */
		rbp = bp;
		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
		right = block;
		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;
		bp = lbp;
		nptr = 2;
	}

	/* Fill in the new block's btree header and log it. */
	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
			!xfs_btree_ptr_is_null(cur, &rptr));

	/* Fill in the key data in the new root. */
	if (xfs_btree_get_level(left) > 0) {
		/*
		 * Get the keys for the left block's keys and put them directly
		 * in the parent block.  Do the same for the right block.
		 */
		xfs_btree_get_node_keys(cur, left,
				xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_node_keys(cur, right,
				xfs_btree_key_addr(cur, 2, new));
	} else {
		/*
		 * Get the keys for the left block's records and put them
		 * directly in the parent block.  Do the same for the right
		 * block.
		 */
		xfs_btree_get_leaf_keys(cur, left,
			xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_leaf_keys(cur, right,
			xfs_btree_key_addr(cur, 2, new));
	}
	xfs_btree_log_keys(cur, nbp, 1, 2);

	/* Fill in the pointer data in the new root. */
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
	xfs_btree_log_ptrs(cur, nbp, 1, 2);

	/* Fix up the cursor. */
	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
	cur->bc_levels[cur->bc_nlevels].ptr = nptr;
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	*stat = 1;
	return 0;
error0:
	return error;
out0:
	/* Allocation failed benignly; caller sees *stat == 0. */
	*stat = 0;
	return 0;
}

/*
 * Make room in the block at cur/level for another entry, trying in order:
 * growing/replacing an inode root, shifting right, shifting left, and
 * finally splitting the block.  *ncur/*nptr/*key describe any new block
 * produced by a split; *oindex/*index track the cursor entry.
 */
STATIC int
xfs_btree_make_block_unfull(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* btree level */
	int			numrecs,/* # of recs in block */
	int			*oindex,/* old tree index */
	int			*index,	/* new tree index */
	union xfs_btree_ptr	*nptr,	/* new btree ptr */
	struct xfs_btree_cur	**ncur,	/* new btree cursor */
	union xfs_btree_key	*key,	/* key of new block */
	int			*stat)
{
	int			error = 0;

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		struct xfs_inode *ip = cur->bc_ino.ip;

		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
			/* A root block that can be made bigger. */
			xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
			*stat = 1;
		} else {
			/* A root block that needs replacing */
			int	logflags = 0;

			error = xfs_btree_new_iroot(cur, &logflags, stat);
			if (error || *stat == 0)
				return error;

			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
		}

		return 0;
	}

	/* First, try shifting an entry to the right neighbor. */
	error = xfs_btree_rshift(cur, level, stat);
	if (error || *stat)
		return error;

	/* Next, try shifting an entry to the left neighbor. */
	error = xfs_btree_lshift(cur, level, stat);
	if (error)
		return error;

	if (*stat) {
		/* lshift moved the cursor entry left by one; re-read it. */
		*oindex = *index = cur->bc_levels[level].ptr;
		return 0;
	}

	/*
	 * Next, try splitting the current block in half.
	 *
	 * If this works we have to re-set our variables because we
	 * could be in a different block now.
	 */
	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
	if (error || *stat == 0)
		return error;


	*index = cur->bc_levels[level].ptr;
	return 0;
}

/*
 * Insert one record/level.  Return information to the caller
 * allowing the next level up to proceed if necessary.
 */
STATIC int
xfs_btree_insrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level to insert record at */
	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
	union xfs_btree_rec	*rec,	/* record to insert */
	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer for block */
	union xfs_btree_ptr	nptr;	/* new block ptr */
	struct xfs_btree_cur	*ncur;	/* new btree cursor */
	union xfs_btree_key	nkey;	/* new block key */
	union xfs_btree_key	*lkey;
	int			optr;	/* old key/record index */
	int			ptr;	/* key/record index */
	int			numrecs;/* number of records */
	int			error;	/* error return value */
	int			i;
	xfs_daddr_t		old_bn;

	ncur = NULL;
	lkey = &nkey;

	/*
	 * If we have an external root pointer, and we've made it to the
	 * root level, allocate a new root block and we're done.
	 */
	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level >= cur->bc_nlevels)) {
		error = xfs_btree_new_root(cur, stat);
		xfs_btree_set_ptr_null(cur, ptrp);

		return error;
	}

	/* If we're off the left edge, return failure. */
	ptr = cur->bc_levels[level].ptr;
	if (ptr == 0) {
		*stat = 0;
		return 0;
	}

	optr = ptr;

	XFS_BTREE_STATS_INC(cur, insrec);

	/* Get pointers to the btree buffer and block. */
	block = xfs_btree_get_block(cur, level, &bp);
	/* Remember the block we started in, so we can tell below whether
	 * make_block_unfull() moved the cursor into a new block. */
	old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;

	/* Check that the new entry is being inserted in the right place. */
	if (ptr <= numrecs) {
		if (level == 0) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
				xfs_btree_rec_addr(cur, ptr, block)));
		} else {
			ASSERT(cur->bc_ops->keys_inorder(cur, key,
				xfs_btree_key_addr(cur, ptr, block)));
		}
	}
#endif

	/*
	 * If the block is full, we can't insert the new entry until we
	 * make the block un-full.
	 */
	xfs_btree_set_ptr_null(cur, &nptr);
	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
		error = xfs_btree_make_block_unfull(cur, level, numrecs,
					&optr, &ptr, &nptr, &ncur, lkey, stat);
		if (error || *stat == 0)
			goto error0;
	}

	/*
	 * The current block may have changed if the block was
	 * previously full and we have just made space in it.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		return error;
#endif

	/*
	 * At this point we know there's room for our new entry in the block
	 * we're pointing at.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);

	if (level > 0) {
		/* It's a nonleaf. make a hole in the keys and ptrs */
		union xfs_btree_key	*kp;
		union xfs_btree_ptr	*pp;

		kp = xfs_btree_key_addr(cur, ptr, block);
		pp = xfs_btree_ptr_addr(cur, ptr, block);

		for (i = numrecs - ptr; i >= 0; i--) {
			error = xfs_btree_debug_check_ptr(cur, pp, i, level);
			if (error)
				return error;
		}

		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);

		error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
		if (error)
			goto error0;

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_keys(cur, kp, key, 1);
		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
		numrecs++;
		xfs_btree_set_numrecs(block, numrecs);
		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
		xfs_btree_log_keys(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
				xfs_btree_key_addr(cur, ptr + 1, block)));
		}
#endif
	} else {
		/* It's a leaf. make a hole in the records */
		union xfs_btree_rec             *rp;

		rp = xfs_btree_rec_addr(cur, ptr, block);

		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_recs(cur, rp, rec, 1);
		xfs_btree_set_numrecs(block, ++numrecs);
		xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
				xfs_btree_rec_addr(cur, ptr + 1, block)));
		}
#endif
	}

	/* Log the new number of records in the btree header. */
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/*
	 * If we just inserted into a new tree block, we have to
	 * recalculate nkey here because nkey is out of date.
	 *
	 * Otherwise we're just updating an existing block (having shoved
	 * some records into the new tree block), so use the regular key
	 * update mechanism.
	 */
	if (bp && xfs_buf_daddr(bp) != old_bn) {
		xfs_btree_get_keys(cur, block, lkey);
	} else if (xfs_btree_needs_key_update(cur, optr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If we are tracking the last record in the tree and
	 * we are at the far right edge of the tree, update it.
	 */
	if (xfs_btree_is_lastrec(cur, block, level)) {
		cur->bc_ops->update_lastrec(cur, block, rec,
					    ptr, LASTREC_INSREC);
	}

	/*
	 * Return the new block number, if any.
	 * If there is one, give back a record value and a cursor too.
	 */
	*ptrp = nptr;
	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
		xfs_btree_copy_keys(cur, key, lkey, 1);
		*curp = ncur;
	}

	*stat = 1;
	return 0;

error0:
	return error;
}

/*
 * Insert the record at the point referenced by cur.
 *
 * A multi-level split of the tree on insert will invalidate the original
 * cursor.  All callers of this function should assume that the cursor is
 * no longer valid and revalidate it.
 */
int
xfs_btree_insert(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* output: success/failure */
{
	int			error;	/* error return value */
	int			i;	/* result value, 0 for failure */
	int			level;	/* current level number in btree */
	union xfs_btree_ptr	nptr;	/* new block number (split result) */
	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
	union xfs_btree_key	bkey;	/* key of block to insert */
	union xfs_btree_key	*key;
	union xfs_btree_rec	rec;	/* record to insert */

	level = 0;
	ncur = NULL;
	pcur = cur;
	key = &bkey;

	xfs_btree_set_ptr_null(cur, &nptr);

	/* Make a key out of the record data to be inserted, and save it. */
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);

	/*
	 * Loop going up the tree, starting at the leaf level.
	 * Stop when we don't get a split block, that must mean that
	 * the insert is finished with this level.
	 */
	do {
		/*
		 * Insert nrec/nptr into this level of the tree.
		 * Note if we fail, nptr will be null.
		 */
		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
				&ncur, &i);
		if (error) {
			/* Don't leak a split-generated cursor on error. */
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			goto error0;
		}

		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		level++;

		/*
		 * See if the cursor we just used is trash.
		 * Can't trash the caller's cursor, but otherwise we should
		 * if ncur is a new cursor or we're about to be done.
		 */
		if (pcur != cur &&
		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
			/* Save the state from the cursor before we trash it */
			if (cur->bc_ops->update_cursor)
				cur->bc_ops->update_cursor(pcur, cur);
			cur->bc_nlevels = pcur->bc_nlevels;
			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
		}
		/* If we got a new cursor, switch to it. */
		if (ncur) {
			pcur = ncur;
			ncur = NULL;
		}
	} while (!xfs_btree_ptr_is_null(cur, &nptr));

	*stat = i;
	return 0;
error0:
	return error;
}

/*
 * Try to merge a non-leaf block back into the inode root.
 *
 * Note: the killroot name comes from the fact that we're effectively
 * killing the old root block.  But because we can't just delete the
 * inode we have to copy the single block it was pointing to into the
 * inode.
 */
STATIC int
xfs_btree_kill_iroot(
	struct xfs_btree_cur	*cur)
{
	int			whichfork = cur->bc_ino.whichfork;
	struct xfs_inode	*ip = cur->bc_ino.ip;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*cblock;
	union xfs_btree_key	*kp;
	union xfs_btree_key	*ckp;
	union xfs_btree_ptr	*pp;
	union xfs_btree_ptr	*cpp;
	struct xfs_buf		*cbp;
	int			level;
	int			index;
	int			numrecs;
	int			error;
#ifdef DEBUG
	union xfs_btree_ptr	ptr;
#endif
	int			i;

	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_nlevels > 1);

	/*
	 * Don't deal with the root block needs to be a leaf case.
	 * We're just going to turn the thing back into extents anyway.
	 */
	level = cur->bc_nlevels - 1;
	if (level == 1)
		goto out0;

	/*
	 * Give up if the root has multiple children.
	 */
	block = xfs_btree_get_iroot(cur);
	if (xfs_btree_get_numrecs(block) != 1)
		goto out0;

	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
	numrecs = xfs_btree_get_numrecs(cblock);

	/*
	 * Only do this if the next level will fit.
	 * Then the data must be copied up to the inode,
	 * instead of freeing the root you free the next level.
	 */
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	/*
	 * Resize the incore root so it can hold the child's entries; the
	 * delta may be zero, in which case no realloc is needed.
	 */
	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
	if (index) {
		xfs_iroot_realloc(cur->bc_ino.ip, index,
				  cur->bc_ino.whichfork);
		block = ifp->if_broot;
	}

	be16_add_cpu(&block->bb_numrecs, index);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);

	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, kp, ckp, numrecs);

	pp = xfs_btree_ptr_addr(cur, 1, block);
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);

	for (i = 0; i < numrecs; i++) {
		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
		if (error)
			return error;
	}

	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);

	error = xfs_btree_free_block(cur, cbp);
	if (error)
		return error;

	cur->bc_levels[level - 1].bp = NULL;
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
	cur->bc_nlevels--;
out0:
	return 0;
}

/*
 * Kill the current root node, and replace it with its only child node.
 */
STATIC int
xfs_btree_kill_root(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,	/* buffer holding the old root block */
	int			level,	/* level of the old root */
	union xfs_btree_ptr	*newroot) /* the old root's only child */
{
	int			error;

	XFS_BTREE_STATS_INC(cur, killroot);

	/*
	 * Update the root pointer, decreasing the level by 1 and then
	 * free the old root.
	 */
	cur->bc_ops->set_root(cur, newroot, -1);

	error = xfs_btree_free_block(cur, bp);
	if (error)
		return error;

	/* The cursor no longer references the freed block at this level. */
	cur->bc_levels[level].bp = NULL;
	cur->bc_levels[level].ra = 0;
	cur->bc_nlevels--;

	return 0;
}

/*
 * After a deletion at @level, step the cursor back one entry at that level
 * (levels above the leaf only) and report success via @stat.  Errors from
 * the decrement are propagated; otherwise *stat is always set to 1.
 */
STATIC int
xfs_btree_dec_cursor(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)
{
	int			error;
	int			i;

	if (level > 0) {
		error = xfs_btree_decrement(cur, level, &i);
		if (error)
			return error;
	}

	*stat = 1;
	return 0;
}

/*
 * Single level of the btree record deletion routine.
 * Delete record pointed to by cur/level.
 * Remove the record from its block then rebalance the tree.
 * Return 0 for error, 1 for done, 2 to go on to the next level.
 * (The 0/1/2 result is passed back through *stat; the function's return
 * value is a plain errno.)
 */
STATIC int					/* error */
xfs_btree_delrec(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			level,		/* level removing record from */
	int			*stat)		/* fail/done/go-on */
{
	struct xfs_btree_block	*block;		/* btree block */
	union xfs_btree_ptr	cptr;		/* current block ptr */
	struct xfs_buf		*bp;		/* buffer for block */
	int			error;		/* error return value */
	int			i;		/* loop counter */
	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	int			lrecs = 0;	/* left record count */
	int			ptr;		/* key/record index */
	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	struct xfs_btree_block	*rrblock;	/* right-right btree block */
	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
	int			rrecs = 0;	/* right record count */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
	int			numrecs;	/* temporary numrec count */

	tcur = NULL;

	/* Get the index of the entry being deleted, check for nothing there. */
	ptr = cur->bc_levels[level].ptr;
	if (ptr == 0) {
		*stat = 0;
		return 0;
	}

	/* Get the buffer & block containing the record or key/ptr. */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we're off the end of the block. */
	if (ptr > numrecs) {
		*stat = 0;
		return 0;
	}

	XFS_BTREE_STATS_INC(cur, delrec);
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);

	/* Excise the entries being deleted. */
	if (level > 0) {
		/* It's a nonleaf. operate on keys and ptrs */
		union xfs_btree_key	*lkp;
		union xfs_btree_ptr	*lpp;

		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);

		for (i = 0; i < numrecs - ptr; i++) {
			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}

		if (ptr < numrecs) {
			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
		}
	} else {
		/* It's a leaf. operate on records */
		if (ptr < numrecs) {
			xfs_btree_shift_recs(cur,
				xfs_btree_rec_addr(cur, ptr + 1, block),
				-1, numrecs - ptr);
			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
		}
	}

	/*
	 * Decrement and log the number of entries in the block.
	 */
	xfs_btree_set_numrecs(block, --numrecs);
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/*
	 * If we are tracking the last record in the tree and
	 * we are at the far right edge of the tree, update it.
	 */
	if (xfs_btree_is_lastrec(cur, block, level)) {
		cur->bc_ops->update_lastrec(cur, block, NULL,
					    ptr, LASTREC_DELREC);
	}

	/*
	 * We're at the root level.  First, shrink the root block in-memory.
	 * Try to get rid of the next level down.  If we can't then there's
	 * nothing left to do.
	 */
	if (level == cur->bc_nlevels - 1) {
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
			xfs_iroot_realloc(cur->bc_ino.ip, -1,
					  cur->bc_ino.whichfork);

			error = xfs_btree_kill_iroot(cur);
			if (error)
				goto error0;

			error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
			*stat = 1;
			return 0;
		}

		/*
		 * If this is the root level, and there's only one entry left,
		 * and it's NOT the leaf level, then we can get rid of this
		 * level.
		 */
		if (numrecs == 1 && level > 0) {
			union xfs_btree_ptr	*pp;
			/*
			 * pp is still set to the first pointer in the block.
			 * Make it the new root of the btree.
			 */
			pp = xfs_btree_ptr_addr(cur, 1, block);
			error = xfs_btree_kill_root(cur, bp, level, pp);
			if (error)
				goto error0;
		} else if (level > 0) {
			error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
		}
		*stat = 1;
		return 0;
	}

	/*
	 * If we deleted the leftmost entry in the block, update the
	 * key values above us in the tree.
	 */
	if (xfs_btree_needs_key_update(cur, ptr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the number of records remaining in the block is at least
	 * the minimum, we're done.
	 */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	/*
	 * Otherwise, we have to move some records around to keep the
	 * tree balanced.  Look at the left and right sibling blocks to
	 * see if we can re-balance by moving only one record.
	 */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);

	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
		/*
		 * One child of root, need to get a chance to copy its contents
		 * into the root and delete it. Can't go up to next level,
		 * there's nothing to delete there.
		 */
		if (xfs_btree_ptr_is_null(cur, &rptr) &&
		    xfs_btree_ptr_is_null(cur, &lptr) &&
		    level == cur->bc_nlevels - 2) {
			error = xfs_btree_kill_iroot(cur);
			if (!error)
				error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
			return 0;
		}
	}

	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
	       !xfs_btree_ptr_is_null(cur, &lptr));

	/*
	 * Duplicate the cursor so our btree manipulations here won't
	 * disrupt the next level up.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;

	/*
	 * If there's a right sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/*
		 * Move the temp cursor to the last entry in the next block.
		 * Actually any entry but the first would suffice.
		 */
		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_increment(tcur, level, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}

		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		right = xfs_btree_get_block(tcur, level, &rbp);
#ifdef DEBUG
		error = xfs_btree_check_block(tcur, right, level, rbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);

		/*
		 * If right block is full enough so that removing one entry
		 * won't make it too empty, and left-shifting an entry out
		 * of right to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(right) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_lshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));

				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;

				error = xfs_btree_dec_cursor(cur, level, stat);
				if (error)
					goto error0;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in right for
		 * future reference, and fix up the temp cursor to point
		 * to our block again (last record).
		 */
		rrecs = xfs_btree_get_numrecs(right);
		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
			i = xfs_btree_firstrec(tcur, level);
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}

			error = xfs_btree_decrement(tcur, level, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}

	/*
	 * If there's a left sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
		/*
		 * Move the temp cursor to the first entry in the
		 * previous block.
		 */
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_decrement(tcur, level, &i);
		if (error)
			goto error0;
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		left = xfs_btree_get_block(tcur, level, &lbp);
#ifdef DEBUG
		error = xfs_btree_check_block(cur, left, level, lbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);

		/*
		 * If left block is full enough so that removing one entry
		 * won't make it too empty, and right-shifting an entry out
		 * of left to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(left) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_rshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;
				if (level == 0)
					cur->bc_levels[0].ptr++;

				*stat = 1;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in right for
		 * future reference.
		 */
		lrecs = xfs_btree_get_numrecs(left);
	}

	/* Delete the temp cursor, we're done with it. */
	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	tcur = NULL;

	/* If here, we need to do a join to keep the tree balanced. */
	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));

	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
	    lrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "right" to be the starting block,
		 * "left" to be the left neighbor.
		 */
		rptr = cptr;
		right = block;
		rbp = bp;
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;

	/*
	 * If that won't work, see if we can join with the right neighbor block.
	 */
	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
		   rrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "left" to be the starting block,
		 * "right" to be the right neighbor.
		 */
		lptr = cptr;
		left = block;
		lbp = bp;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;

	/*
	 * Otherwise, we can't fix the imbalance.
	 * Just return.  This is probably a logic error, but it's not fatal.
	 */
	} else {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	rrecs = xfs_btree_get_numrecs(right);
	lrecs = xfs_btree_get_numrecs(left);

	/*
	 * We're now going to join "left" and "right" by moving all the stuff
	 * in "right" to "left" and deleting "right".
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
	if (level > 0) {
		/* It's a non-leaf.  Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		for (i = 1; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
			if (error)
				goto error0;
		}

		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);

		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
	} else {
		/* It's a leaf.  Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
	}

	XFS_BTREE_STATS_INC(cur, join);

	/*
	 * Fix up the number of records and right block pointer in the
	 * surviving block, and log it.
	 */
	xfs_btree_set_numrecs(left, lrecs + rrecs);
	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/* If there is a right sibling, point it to the remaining block. */
	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Free the deleted block. */
	error = xfs_btree_free_block(cur, rbp);
	if (error)
		goto error0;

	/*
	 * If we joined with the left neighbor, set the buffer in the
	 * cursor to the left block, and fix up the index.
	 */
	if (bp != lbp) {
		cur->bc_levels[level].bp = lbp;
		cur->bc_levels[level].ptr += lrecs;
		cur->bc_levels[level].ra = 0;
	}
	/*
	 * If we joined with the right neighbor and there's a level above
	 * us, increment the cursor at that level.
	 */
	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
		 (level + 1 < cur->bc_nlevels)) {
		error = xfs_btree_increment(cur, level + 1, &i);
		if (error)
			goto error0;
	}

	/*
	 * Readjust the ptr at this level if it's not a leaf, since it's
	 * still pointing at the deletion point, which makes the cursor
	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
	 * We can't use decrement because it would change the next level up.
	 */
	if (level > 0)
		cur->bc_levels[level].ptr--;

	/*
	 * We combined blocks, so we have to update the parent keys if the
	 * btree supports overlapped intervals.  However,
	 * bc_levels[level + 1].ptr points to the old block so that the caller
	 * knows which record to delete.  Therefore, the caller must be savvy
	 * enough to call updkeys for us if we return stat == 2.  The other
	 * exit points from this function don't require deletions further up
	 * the tree, so they can call updkeys directly.
	 */

	/* Return value means the next level up has something to do. */
	*stat = 2;
	return 0;

error0:
	if (tcur)
		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Delete the record pointed to by cur.
 * The cursor refers to the place where the record was (could be inserted)
 * when the operation returns.
 */
int					/* error */
xfs_btree_delete(
	struct xfs_btree_cur	*cur,
	int			*stat)	/* success/failure */
{
	int			error;	/* error return value */
	int			level;
	int			i;
	bool			joined = false;

	/*
	 * Go up the tree, starting at leaf level.
	 *
	 * If 2 is returned then a join was done; go to the next level.
	 * Otherwise we are done.
	 */
	for (level = 0, i = 2; i == 2; level++) {
		error = xfs_btree_delrec(cur, level, &i);
		if (error)
			goto error0;
		if (i == 2)
			joined = true;
	}

	/*
	 * If we combined blocks as part of deleting the record, delrec won't
	 * have updated the parent high keys so we have to do that here.
	 */
	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
		error = xfs_btree_updkeys_force(cur, 0);
		if (error)
			goto error0;
	}

	/*
	 * If the deletion failed (i == 0), walk up the tree and back the
	 * cursor up at the first level whose ptr underflowed to zero.
	 */
	if (i == 0) {
		for (level = 1; level < cur->bc_nlevels; level++) {
			if (cur->bc_levels[level].ptr == 0) {
				error = xfs_btree_decrement(cur, level, &i);
				if (error)
					goto error0;
				break;
			}
		}
	}

	*stat = i;
	return 0;
error0:
	return error;
}

/*
 * Get the data from the pointed-to record.
4261 */ 4262 int /* error */ 4263 xfs_btree_get_rec( 4264 struct xfs_btree_cur *cur, /* btree cursor */ 4265 union xfs_btree_rec **recp, /* output: btree record */ 4266 int *stat) /* output: success/failure */ 4267 { 4268 struct xfs_btree_block *block; /* btree block */ 4269 struct xfs_buf *bp; /* buffer pointer */ 4270 int ptr; /* record number */ 4271 #ifdef DEBUG 4272 int error; /* error return value */ 4273 #endif 4274 4275 ptr = cur->bc_levels[0].ptr; 4276 block = xfs_btree_get_block(cur, 0, &bp); 4277 4278 #ifdef DEBUG 4279 error = xfs_btree_check_block(cur, block, 0, bp); 4280 if (error) 4281 return error; 4282 #endif 4283 4284 /* 4285 * Off the right end or left end, return failure. 4286 */ 4287 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4288 *stat = 0; 4289 return 0; 4290 } 4291 4292 /* 4293 * Point to the record and extract its data. 4294 */ 4295 *recp = xfs_btree_rec_addr(cur, ptr, block); 4296 *stat = 1; 4297 return 0; 4298 } 4299 4300 /* Visit a block in a btree. 
 */
/*
 * Visit a single block at @level: run @fn on it, then advance the walk to
 * the right sibling.  Returns -ENOENT at the right edge of the level and
 * -EFSCORRUPTED if the right sibling points back at this block (cycle).
 */
STATIC int
xfs_btree_visit_block(
	struct xfs_btree_cur		*cur,
	int				level,
	xfs_btree_visit_blocks_fn	fn,
	void				*data)
{
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	union xfs_btree_ptr		rptr;
	int				error;

	/* do right sibling readahead */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
	block = xfs_btree_get_block(cur, level, &bp);

	/* process the block */
	error = fn(cur, level, data);
	if (error)
		return error;

	/* now read rh sibling block for next iteration */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		return -ENOENT;

	/*
	 * We only visit blocks once in this walk, so we have to avoid the
	 * internal xfs_btree_lookup_get_block() optimisation where it will
	 * return the same block without checking if the right sibling points
	 * back to us and creates a cyclic reference in the btree.
	 */
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
							xfs_buf_daddr(bp)))
			return -EFSCORRUPTED;
	} else {
		if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
							xfs_buf_daddr(bp)))
			return -EFSCORRUPTED;
	}
	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}


/* Visit every block in a btree.
 */
int
xfs_btree_visit_blocks(
	struct xfs_btree_cur		*cur,
	xfs_btree_visit_blocks_fn	fn,
	unsigned int			flags,	/* XFS_BTREE_VISIT_* selection */
	void				*data)	/* opaque cookie passed to @fn */
{
	union xfs_btree_ptr		lptr;
	int				level;
	struct xfs_btree_block		*block = NULL;
	int				error = 0;

	cur->bc_ops->init_ptr_from_cur(cur, &lptr);

	/* for each level, walking from the root down to the leaves */
	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
		/* grab the left hand block */
		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
		if (error)
			return error;

		/* readahead the left most block for the next level down */
		if (level > 0) {
			union xfs_btree_ptr	*ptr;

			ptr = xfs_btree_ptr_addr(cur, 1, block);
			xfs_btree_readahead_ptr(cur, ptr, 1);

			/* save for the next iteration of the loop */
			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);

			if (!(flags & XFS_BTREE_VISIT_LEAVES))
				continue;
		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
			continue;
		}

		/* for each buffer in the level; -ENOENT marks the right edge */
		do {
			error = xfs_btree_visit_block(cur, level, fn, data);
		} while (!error);

		if (error != -ENOENT)
			return error;
	}

	return 0;
}

/*
 * Change the owner of a btree.
 *
 * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in a
 * full size btree as that may be more space than we can fit in the log....
 *
 * We do the btree walk in the most optimal manner possible - we have sibling
 * pointers so we can just walk all the blocks on each level from left to right
 * in a single pass, and then move to the next level and do the same.
 * We can
 * also do readahead on the sibling pointers to get IO moving more quickly,
 * though for slow disks this is unlikely to make much difference to performance
 * as the amount of CPU work we have to do before moving to the next block is
 * relatively small.
 *
 * For each btree block that we load, modify the owner appropriately, set the
 * buffer as an ordered buffer and log it appropriately. We need to ensure that
 * we mark the region we change dirty so that if the buffer is relogged in
 * a subsequent transaction the changes we make here as an ordered buffer are
 * correctly relogged in that transaction. If we are in recovery context, then
 * just queue the modified buffer as delayed write buffer so the transaction
 * recovery completion writes the changes to disk.
 */
struct xfs_btree_block_change_owner_info {
	uint64_t		new_owner;	/* owner value stamped into each block */
	struct list_head	*buffer_list;	/* delwri list used when no transaction */
};

static int
xfs_btree_block_change_owner(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)	/* struct xfs_btree_block_change_owner_info */
{
	struct xfs_btree_block_change_owner_info	*bbcoi = data;
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;

	/* modify the owner; nothing to do if it already matches */
	block = xfs_btree_get_block(cur, level, &bp);
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
			return 0;
		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
	} else {
		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
			return 0;
		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
	}

	/*
	 * If the block is a root block hosted in an inode, we might not have a
	 * buffer pointer here and we shouldn't attempt to log the change as the
	 * information is already held in the inode and discarded when the root
	 * block is formatted into the on-disk inode fork. We still change it,
	 * though, so everything is consistent in memory.
	 */
	if (!bp) {
		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
		ASSERT(level == cur->bc_nlevels - 1);
		return 0;
	}

	if (cur->bc_tp) {
		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
			return -EAGAIN;
		}
	} else {
		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
	}

	return 0;
}

/*
 * Stamp @new_owner into every block of the btree by walking all blocks with
 * xfs_btree_visit_blocks().  Outside a transaction, modified buffers are
 * queued onto @buffer_list for delayed write by the caller.
 */
int
xfs_btree_change_owner(
	struct xfs_btree_cur	*cur,
	uint64_t		new_owner,
	struct list_head	*buffer_list)
{
	struct xfs_btree_block_change_owner_info	bbcoi;

	bbcoi.new_owner = new_owner;
	bbcoi.buffer_list = buffer_list;

	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
			XFS_BTREE_VISIT_ALL, &bbcoi);
}

/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
xfs_btree_lblock_v5hdr_verify(
	struct xfs_buf		*bp,
	uint64_t		owner)	/* expected owner, or XFS_RMAP_OWN_UNKNOWN */
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);

	if (!xfs_has_crc(mp))
		return __this_address;
	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	if (owner != XFS_RMAP_OWN_UNKNOWN &&
	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
		return __this_address;
	return NULL;
}

/* Verify a long-format btree block.
*/ 4509 xfs_failaddr_t 4510 xfs_btree_lblock_verify( 4511 struct xfs_buf *bp, 4512 unsigned int max_recs) 4513 { 4514 struct xfs_mount *mp = bp->b_mount; 4515 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4516 xfs_fsblock_t fsb; 4517 xfs_failaddr_t fa; 4518 4519 /* numrecs verification */ 4520 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4521 return __this_address; 4522 4523 /* sibling pointer verification */ 4524 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4525 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4526 be64_to_cpu(block->bb_u.l.bb_leftsib)); 4527 if (!fa) 4528 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4529 be64_to_cpu(block->bb_u.l.bb_rightsib)); 4530 return fa; 4531 } 4532 4533 /** 4534 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format 4535 * btree block 4536 * 4537 * @bp: buffer containing the btree block 4538 */ 4539 xfs_failaddr_t 4540 xfs_btree_sblock_v5hdr_verify( 4541 struct xfs_buf *bp) 4542 { 4543 struct xfs_mount *mp = bp->b_mount; 4544 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4545 struct xfs_perag *pag = bp->b_pag; 4546 4547 if (!xfs_has_crc(mp)) 4548 return __this_address; 4549 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4550 return __this_address; 4551 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4552 return __this_address; 4553 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) 4554 return __this_address; 4555 return NULL; 4556 } 4557 4558 /** 4559 * xfs_btree_sblock_verify() -- verify a short-format btree block 4560 * 4561 * @bp: buffer containing the btree block 4562 * @max_recs: maximum records allowed in this btree node 4563 */ 4564 xfs_failaddr_t 4565 xfs_btree_sblock_verify( 4566 struct xfs_buf *bp, 4567 unsigned int max_recs) 4568 { 4569 struct xfs_mount *mp = bp->b_mount; 4570 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4571 xfs_agnumber_t agno; 4572 xfs_agblock_t agbno; 4573 xfs_failaddr_t fa; 
4574 4575 /* numrecs verification */ 4576 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4577 return __this_address; 4578 4579 /* sibling pointer verification */ 4580 agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); 4581 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4582 fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, 4583 be32_to_cpu(block->bb_u.s.bb_leftsib)); 4584 if (!fa) 4585 fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, 4586 be32_to_cpu(block->bb_u.s.bb_rightsib)); 4587 return fa; 4588 } 4589 4590 /* 4591 * For the given limits on leaf and keyptr records per block, calculate the 4592 * height of the tree needed to index the number of leaf records. 4593 */ 4594 unsigned int 4595 xfs_btree_compute_maxlevels( 4596 const unsigned int *limits, 4597 unsigned long long records) 4598 { 4599 unsigned long long level_blocks = howmany_64(records, limits[0]); 4600 unsigned int height = 1; 4601 4602 while (level_blocks > 1) { 4603 level_blocks = howmany_64(level_blocks, limits[1]); 4604 height++; 4605 } 4606 4607 return height; 4608 } 4609 4610 /* 4611 * For the given limits on leaf and keyptr records per block, calculate the 4612 * number of blocks needed to index the given number of leaf records. 4613 */ 4614 unsigned long long 4615 xfs_btree_calc_size( 4616 const unsigned int *limits, 4617 unsigned long long records) 4618 { 4619 unsigned long long level_blocks = howmany_64(records, limits[0]); 4620 unsigned long long blocks = level_blocks; 4621 4622 while (level_blocks > 1) { 4623 level_blocks = howmany_64(level_blocks, limits[1]); 4624 blocks += level_blocks; 4625 } 4626 4627 return blocks; 4628 } 4629 4630 /* 4631 * Given a number of available blocks for the btree to consume with records and 4632 * pointers, calculate the height of the tree needed to index all the records 4633 * that space can hold based on the number of pointers each interior node 4634 * holds. 
 *
 * We start by assuming a single level tree consumes a single block, then track
 * the number of blocks each node level consumes until we no longer have space
 * to store the next node level. At this point, we are indexing all the leaf
 * blocks in the space, and there's no more free space to split the tree any
 * further. That's our maximum btree height.
 */
unsigned int
xfs_btree_space_to_height(
	const unsigned int	*limits,
	unsigned long long	leaf_blocks)
{
	unsigned long long	node_blocks = limits[1];
	/*
	 * The root block is always consumed, so one block is subtracted up
	 * front.  This is computed before the guard below but only read
	 * after it, so the wrapped value for leaf_blocks == 0 is never used.
	 */
	unsigned long long	blocks_left = leaf_blocks - 1;
	unsigned int		height = 1;

	if (leaf_blocks < 1)
		return 0;

	/* Keep adding node levels while there is space left to hold one. */
	while (node_blocks < blocks_left) {
		blocks_left -= node_blocks;
		node_blocks *= limits[1];
		height++;
	}

	return height;
}

/*
 * Query a regular btree for all records overlapping a given interval.
 * Start with a LE lookup of the key of low_rec and return all records
 * until we find a record with a key greater than the key of high_rec.
 */
STATIC int
xfs_btree_simple_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_rec		*recp;
	union xfs_btree_key		rec_key;
	int64_t				diff;
	int				stat;
	bool				firstrec = true;
	int				error;

	/* Both key operations below must be provided by this btree type. */
	ASSERT(cur->bc_ops->init_high_key_from_rec);
	ASSERT(cur->bc_ops->diff_two_keys);

	/*
	 * Find the leftmost record. The btree cursor must be set
	 * to the low record used to generate low_key.
	 */
	stat = 0;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		goto out;

	/* Nothing? See if there's anything to the right. */
	if (!stat) {
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			goto out;
	}

	while (stat) {
		/* Find the record. */
		error = xfs_btree_get_rec(cur, &recp, &stat);
		if (error || !stat)
			break;

		/*
		 * Skip if high_key(rec) < low_key.  This filter is only
		 * applied to the first record found by the LE lookup, which
		 * may end before the query interval begins.
		 */
		if (firstrec) {
			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
			firstrec = false;
			diff = cur->bc_ops->diff_two_keys(cur, low_key,
					&rec_key);
			if (diff > 0)
				goto advloop;
		}

		/* Stop if high_key < low_key(rec). */
		cur->bc_ops->init_key_from_rec(&rec_key, recp);
		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
		if (diff > 0)
			break;

		/* Callback; a nonzero return aborts the scan. */
		error = fn(cur, recp, priv);
		if (error)
			break;

advloop:
		/* Move on to the next record. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}

out:
	return error;
}

/*
 * Query an overlapped interval btree for all records overlapping a given
 * interval. This function roughly follows the algorithm given in
 * "Interval Trees" of _Introduction to Algorithms_, which is section
 * 14.3 in the 2nd and 3rd editions.
 *
 * First, generate keys for the low and high records passed in.
 *
 * For any leaf node, generate the high and low keys for the record.
 * If the record keys overlap with the query low/high keys, pass the
 * record to the function iterator.
 *
 * For any internal node, compare the low and high keys of each
 * pointer against the query low/high keys. If there's an overlap,
 * follow the pointer.
 *
 * As an optimization, we stop scanning a block when we find a low key
 * that is greater than the query's high key.
 */
STATIC int
xfs_btree_overlapped_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_ptr		ptr;
	union xfs_btree_ptr		*pp;
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_hkey;
	union xfs_btree_key		*lkp;
	union xfs_btree_key		*hkp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	int64_t				ldiff;
	int64_t				hdiff;
	int				level;
	struct xfs_buf			*bp;
	int				i;
	int				error;

	/* Load the root of the btree. */
	level = cur->bc_nlevels - 1;
	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
	if (error)
		return error;
	xfs_btree_get_block(cur, level, &bp);
	trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto out;
#endif
	cur->bc_levels[level].ptr = 1;

	/*
	 * Iterative depth-first traversal.  cur->bc_levels[level].ptr is the
	 * 1-based index of the next key/pointer (or record) to examine in
	 * the block held at that level.
	 */
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
pop_up:
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		if (level == 0) {
			/* Handle a leaf node. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);

			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
					low_key);

			cur->bc_ops->init_key_from_rec(&rec_key, recp);
			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
					&rec_key);

			/*
			 * If (record's high key >= query's low key) and
			 *    (query's high key >= record's low key), then
			 * this record overlaps the query range; callback.
			 */
			if (ldiff >= 0 && hdiff >= 0) {
				error = fn(cur, recp, priv);
				if (error)
					break;
			} else if (hdiff < 0) {
				/* Record is larger than high key; pop. */
				goto pop_up;
			}
			cur->bc_levels[level].ptr++;
			continue;
		}

		/* Handle an internal node. */
		lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
		hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
				block);
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);

		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);

		/*
		 * If (pointer's high key >= query's low key) and
		 *    (query's high key >= pointer's low key), then
		 * this record overlaps the query range; follow pointer.
		 */
		if (ldiff >= 0 && hdiff >= 0) {
			/* Descend one level into the child block. */
			level--;
			error = xfs_btree_lookup_get_block(cur, level, pp,
					&block);
			if (error)
				goto out;
			xfs_btree_get_block(cur, level, &bp);
			trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
			error = xfs_btree_check_block(cur, block, level, bp);
			if (error)
				goto out;
#endif
			cur->bc_levels[level].ptr = 1;
			continue;
		} else if (hdiff < 0) {
			/* The low key is larger than the upper range; pop. */
			goto pop_up;
		}
		cur->bc_levels[level].ptr++;
	}

out:
	/*
	 * If we don't end this function with the cursor pointing at a record
	 * block, a subsequent non-error cursor deletion will not release
	 * node-level buffers, causing a buffer leak. This is quite possible
	 * with a zero-results range query, so release the buffers if we
	 * failed to return any results.
	 */
	if (cur->bc_levels[0].bp == NULL) {
		for (i = 0; i < cur->bc_nlevels; i++) {
			if (cur->bc_levels[i].bp) {
				xfs_trans_brelse(cur->bc_tp,
						cur->bc_levels[i].bp);
				cur->bc_levels[i].bp = NULL;
				cur->bc_levels[i].ptr = 0;
				cur->bc_levels[i].ra = 0;
			}
		}
	}

	return error;
}

/*
 * Query a btree for all records overlapping a given interval of keys. The
 * supplied function will be called with each record found; return one of the
 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
 * code. This function returns -ECANCELED, zero, or a negative error code.
 */
int
xfs_btree_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low_rec,
	const union xfs_btree_irec	*high_rec,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_rec		rec;
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/* Find the keys of both ends of the interval. */
	cur->bc_rec = *high_rec;
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(&high_key, &rec);

	cur->bc_rec = *low_rec;
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(&low_key, &rec);

	/* Enforce low key <= high key. */
	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
		return -EINVAL;

	/* Overlapping btrees need the interval-tree style traversal. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return xfs_btree_simple_query_range(cur, &low_key,
				&high_key, fn, priv);
	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
			fn, priv);
}

/* Query a btree for all records. */
int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/* Bracket every possible key: all-zeroes low, all-ones high. */
	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	memset(&low_key, 0, sizeof(low_key));
	memset(&high_key, 0xFF, sizeof(high_key));

	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}

/* xfs_btree_visit_blocks() callback: count one more btree block. */
static int
xfs_btree_count_blocks_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	xfs_extlen_t		*blocks = data;

	(*blocks)++;

	return 0;
}

/* Count the blocks in a btree and return the result in *blocks. */
int
xfs_btree_count_blocks(
	struct xfs_btree_cur	*cur,
	xfs_extlen_t		*blocks)
{
	*blocks = 0;
	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
			XFS_BTREE_VISIT_ALL, blocks);
}

/* Compare two btree pointers. */
int64_t
xfs_btree_diff_two_ptrs(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*a,
	const union xfs_btree_ptr	*b)
{
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}

/* If there's an extent, we're done. */
STATIC int
xfs_btree_has_record_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	/* Abort the range query as soon as the first record is seen. */
	return -ECANCELED;
}

/* Is there a record covering a given range of keys?
*/ 5000 int 5001 xfs_btree_has_record( 5002 struct xfs_btree_cur *cur, 5003 const union xfs_btree_irec *low, 5004 const union xfs_btree_irec *high, 5005 bool *exists) 5006 { 5007 int error; 5008 5009 error = xfs_btree_query_range(cur, low, high, 5010 &xfs_btree_has_record_helper, NULL); 5011 if (error == -ECANCELED) { 5012 *exists = true; 5013 return 0; 5014 } 5015 *exists = false; 5016 return error; 5017 } 5018 5019 /* Are there more records in this btree? */ 5020 bool 5021 xfs_btree_has_more_records( 5022 struct xfs_btree_cur *cur) 5023 { 5024 struct xfs_btree_block *block; 5025 struct xfs_buf *bp; 5026 5027 block = xfs_btree_get_block(cur, 0, &bp); 5028 5029 /* There are still records in this block. */ 5030 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5031 return true; 5032 5033 /* There are more record blocks. */ 5034 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 5035 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5036 else 5037 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5038 } 5039 5040 /* Set up all the btree cursor caches. */ 5041 int __init 5042 xfs_btree_init_cur_caches(void) 5043 { 5044 int error; 5045 5046 error = xfs_allocbt_init_cur_cache(); 5047 if (error) 5048 return error; 5049 error = xfs_inobt_init_cur_cache(); 5050 if (error) 5051 goto err; 5052 error = xfs_bmbt_init_cur_cache(); 5053 if (error) 5054 goto err; 5055 error = xfs_rmapbt_init_cur_cache(); 5056 if (error) 5057 goto err; 5058 error = xfs_refcountbt_init_cur_cache(); 5059 if (error) 5060 goto err; 5061 5062 return 0; 5063 err: 5064 xfs_btree_destroy_cur_caches(); 5065 return error; 5066 } 5067 5068 /* Destroy all the btree cursor caches, if they've been allocated. */ 5069 void 5070 xfs_btree_destroy_cur_caches(void) 5071 { 5072 xfs_allocbt_destroy_cur_cache(); 5073 xfs_inobt_destroy_cur_cache(); 5074 xfs_bmbt_destroy_cur_cache(); 5075 xfs_rmapbt_destroy_cur_cache(); 5076 xfs_refcountbt_destroy_cur_cache(); 5077 } 5078