1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_inode.h" 15 #include "xfs_trans.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_btree.h" 18 #include "xfs_errortag.h" 19 #include "xfs_error.h" 20 #include "xfs_trace.h" 21 #include "xfs_alloc.h" 22 #include "xfs_log.h" 23 #include "xfs_btree_staging.h" 24 #include "xfs_ag.h" 25 #include "xfs_alloc_btree.h" 26 #include "xfs_ialloc_btree.h" 27 #include "xfs_bmap_btree.h" 28 #include "xfs_rmap_btree.h" 29 #include "xfs_refcount_btree.h" 30 31 /* 32 * Btree magic numbers. 33 */ 34 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { 35 { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, 36 XFS_FIBT_MAGIC, 0 }, 37 { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, 38 XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, 39 XFS_REFC_CRC_MAGIC } 40 }; 41 42 uint32_t 43 xfs_btree_magic( 44 int crc, 45 xfs_btnum_t btnum) 46 { 47 uint32_t magic = xfs_magics[crc][btnum]; 48 49 /* Ensure we asked for crc for crc-only magics. */ 50 ASSERT(magic != 0); 51 return magic; 52 } 53 54 /* 55 * These sibling pointer checks are optimised for null sibling pointers. This 56 * happens a lot, and we don't need to byte swap at runtime if the sibling 57 * pointer is NULL. 58 * 59 * These are explicitly marked at inline because the cost of calling them as 60 * functions instead of inlining them is about 36 bytes extra code per call site 61 * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these 62 * two sibling check functions reduces the compiled code size by over 300 63 * bytes. 64 */ 65 static inline xfs_failaddr_t 66 xfs_btree_check_lblock_siblings( 67 struct xfs_mount *mp, 68 struct xfs_btree_cur *cur, 69 int level, 70 xfs_fsblock_t fsb, 71 __be64 dsibling) 72 { 73 xfs_fsblock_t sibling; 74 75 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 76 return NULL; 77 78 sibling = be64_to_cpu(dsibling); 79 if (sibling == fsb) 80 return __this_address; 81 if (level >= 0) { 82 if (!xfs_btree_check_lptr(cur, sibling, level + 1)) 83 return __this_address; 84 } else { 85 if (!xfs_verify_fsbno(mp, sibling)) 86 return __this_address; 87 } 88 89 return NULL; 90 } 91 92 static inline xfs_failaddr_t 93 xfs_btree_check_sblock_siblings( 94 struct xfs_perag *pag, 95 struct xfs_btree_cur *cur, 96 int level, 97 xfs_agblock_t agbno, 98 __be32 dsibling) 99 { 100 xfs_agblock_t sibling; 101 102 if (dsibling == cpu_to_be32(NULLAGBLOCK)) 103 return NULL; 104 105 sibling = be32_to_cpu(dsibling); 106 if (sibling == agbno) 107 return __this_address; 108 if (level >= 0) { 109 if (!xfs_btree_check_sptr(cur, sibling, level + 1)) 110 return __this_address; 111 } else { 112 if (!xfs_verify_agbno(pag, sibling)) 113 return __this_address; 114 } 115 return NULL; 116 } 117 118 /* 119 * Check a long btree block header. Return the address of the failing check, 120 * or NULL if everything is ok. 121 */ 122 xfs_failaddr_t 123 __xfs_btree_check_lblock( 124 struct xfs_btree_cur *cur, 125 struct xfs_btree_block *block, 126 int level, 127 struct xfs_buf *bp) 128 { 129 struct xfs_mount *mp = cur->bc_mp; 130 xfs_btnum_t btnum = cur->bc_btnum; 131 int crc = xfs_has_crc(mp); 132 xfs_failaddr_t fa; 133 xfs_fsblock_t fsb = NULLFSBLOCK; 134 135 if (crc) { 136 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 137 return __this_address; 138 if (block->bb_u.l.bb_blkno != 139 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 140 return __this_address; 141 if (block->bb_u.l.bb_pad != cpu_to_be32(0)) 142 return __this_address; 143 } 144 145 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) 146 return __this_address; 147 if (be16_to_cpu(block->bb_level) != level) 148 return __this_address; 149 if (be16_to_cpu(block->bb_numrecs) > 150 cur->bc_ops->get_maxrecs(cur, level)) 151 return __this_address; 152 153 if (bp) 154 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 155 156 fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, 157 block->bb_u.l.bb_leftsib); 158 if (!fa) 159 fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, 160 block->bb_u.l.bb_rightsib); 161 return fa; 162 } 163 164 /* Check a long btree block header. */ 165 static int 166 xfs_btree_check_lblock( 167 struct xfs_btree_cur *cur, 168 struct xfs_btree_block *block, 169 int level, 170 struct xfs_buf *bp) 171 { 172 struct xfs_mount *mp = cur->bc_mp; 173 xfs_failaddr_t fa; 174 175 fa = __xfs_btree_check_lblock(cur, block, level, bp); 176 if (XFS_IS_CORRUPT(mp, fa != NULL) || 177 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { 178 if (bp) 179 trace_xfs_btree_corrupt(bp, _RET_IP_); 180 return -EFSCORRUPTED; 181 } 182 return 0; 183 } 184 185 /* 186 * Check a short btree block header. Return the address of the failing check, 187 * or NULL if everything is ok. 188 */ 189 xfs_failaddr_t 190 __xfs_btree_check_sblock( 191 struct xfs_btree_cur *cur, 192 struct xfs_btree_block *block, 193 int level, 194 struct xfs_buf *bp) 195 { 196 struct xfs_mount *mp = cur->bc_mp; 197 struct xfs_perag *pag = cur->bc_ag.pag; 198 xfs_btnum_t btnum = cur->bc_btnum; 199 int crc = xfs_has_crc(mp); 200 xfs_failaddr_t fa; 201 xfs_agblock_t agbno = NULLAGBLOCK; 202 203 if (crc) { 204 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 205 return __this_address; 206 if (block->bb_u.s.bb_blkno != 207 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 208 return __this_address; 209 } 210 211 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) 212 return __this_address; 213 if (be16_to_cpu(block->bb_level) != level) 214 return __this_address; 215 if (be16_to_cpu(block->bb_numrecs) > 216 cur->bc_ops->get_maxrecs(cur, level)) 217 return __this_address; 218 219 if (bp) 220 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 221 222 fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, 223 block->bb_u.s.bb_leftsib); 224 if (!fa) 225 fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, 226 block->bb_u.s.bb_rightsib); 227 return fa; 228 } 229 230 /* Check a short btree block header. */ 231 STATIC int 232 xfs_btree_check_sblock( 233 struct xfs_btree_cur *cur, 234 struct xfs_btree_block *block, 235 int level, 236 struct xfs_buf *bp) 237 { 238 struct xfs_mount *mp = cur->bc_mp; 239 xfs_failaddr_t fa; 240 241 fa = __xfs_btree_check_sblock(cur, block, level, bp); 242 if (XFS_IS_CORRUPT(mp, fa != NULL) || 243 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { 244 if (bp) 245 trace_xfs_btree_corrupt(bp, _RET_IP_); 246 return -EFSCORRUPTED; 247 } 248 return 0; 249 } 250 251 /* 252 * Debug routine: check that block header is ok. 253 */ 254 int 255 xfs_btree_check_block( 256 struct xfs_btree_cur *cur, /* btree cursor */ 257 struct xfs_btree_block *block, /* generic btree block pointer */ 258 int level, /* level of the btree block */ 259 struct xfs_buf *bp) /* buffer containing block, if any */ 260 { 261 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 262 return xfs_btree_check_lblock(cur, block, level, bp); 263 else 264 return xfs_btree_check_sblock(cur, block, level, bp); 265 } 266 267 /* Check that this long pointer is valid and points within the fs. */ 268 bool 269 xfs_btree_check_lptr( 270 struct xfs_btree_cur *cur, 271 xfs_fsblock_t fsbno, 272 int level) 273 { 274 if (level <= 0) 275 return false; 276 return xfs_verify_fsbno(cur->bc_mp, fsbno); 277 } 278 279 /* Check that this short pointer is valid and points within the AG. */ 280 bool 281 xfs_btree_check_sptr( 282 struct xfs_btree_cur *cur, 283 xfs_agblock_t agbno, 284 int level) 285 { 286 if (level <= 0) 287 return false; 288 return xfs_verify_agbno(cur->bc_ag.pag, agbno); 289 } 290 291 /* 292 * Check that a given (indexed) btree pointer at a certain level of a 293 * btree is valid and doesn't point past where it should. 294 */ 295 static int 296 xfs_btree_check_ptr( 297 struct xfs_btree_cur *cur, 298 const union xfs_btree_ptr *ptr, 299 int index, 300 int level) 301 { 302 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 303 if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), 304 level)) 305 return 0; 306 xfs_err(cur->bc_mp, 307 "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", 308 cur->bc_ino.ip->i_ino, 309 cur->bc_ino.whichfork, cur->bc_btnum, 310 level, index); 311 } else { 312 if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), 313 level)) 314 return 0; 315 xfs_err(cur->bc_mp, 316 "AG %u: Corrupt btree %d pointer at level %d index %d.", 317 cur->bc_ag.pag->pag_agno, cur->bc_btnum, 318 level, index); 319 } 320 321 return -EFSCORRUPTED; 322 } 323 324 #ifdef DEBUG 325 # define xfs_btree_debug_check_ptr xfs_btree_check_ptr 326 #else 327 # define xfs_btree_debug_check_ptr(...) (0) 328 #endif 329 330 /* 331 * Calculate CRC on the whole btree block and stuff it into the 332 * long-form btree header. 333 * 334 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 335 * it into the buffer so recovery knows what the last modification was that made 336 * it to disk. 337 */ 338 void 339 xfs_btree_lblock_calc_crc( 340 struct xfs_buf *bp) 341 { 342 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 343 struct xfs_buf_log_item *bip = bp->b_log_item; 344 345 if (!xfs_has_crc(bp->b_mount)) 346 return; 347 if (bip) 348 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 349 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 350 } 351 352 bool 353 xfs_btree_lblock_verify_crc( 354 struct xfs_buf *bp) 355 { 356 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 357 struct xfs_mount *mp = bp->b_mount; 358 359 if (xfs_has_crc(mp)) { 360 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) 361 return false; 362 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 363 } 364 365 return true; 366 } 367 368 /* 369 * Calculate CRC on the whole btree block and stuff it into the 370 * short-form btree header. 371 * 372 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 373 * it into the buffer so recovery knows what the last modification was that made 374 * it to disk. 375 */ 376 void 377 xfs_btree_sblock_calc_crc( 378 struct xfs_buf *bp) 379 { 380 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 381 struct xfs_buf_log_item *bip = bp->b_log_item; 382 383 if (!xfs_has_crc(bp->b_mount)) 384 return; 385 if (bip) 386 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 387 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 388 } 389 390 bool 391 xfs_btree_sblock_verify_crc( 392 struct xfs_buf *bp) 393 { 394 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 395 struct xfs_mount *mp = bp->b_mount; 396 397 if (xfs_has_crc(mp)) { 398 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) 399 return false; 400 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 401 } 402 403 return true; 404 } 405 406 static int 407 xfs_btree_free_block( 408 struct xfs_btree_cur *cur, 409 struct xfs_buf *bp) 410 { 411 int error; 412 413 error = cur->bc_ops->free_block(cur, bp); 414 if (!error) { 415 xfs_trans_binval(cur->bc_tp, bp); 416 XFS_BTREE_STATS_INC(cur, free); 417 } 418 return error; 419 } 420 421 /* 422 * Delete the btree cursor. 423 */ 424 void 425 xfs_btree_del_cursor( 426 struct xfs_btree_cur *cur, /* btree cursor */ 427 int error) /* del because of error */ 428 { 429 int i; /* btree level */ 430 431 /* 432 * Clear the buffer pointers and release the buffers. If we're doing 433 * this because of an error, inspect all of the entries in the bc_bufs 434 * array for buffers to be unlocked. This is because some of the btree 435 * code works from level n down to 0, and if we get an error along the 436 * way we won't have initialized all the entries down to 0. 437 */ 438 for (i = 0; i < cur->bc_nlevels; i++) { 439 if (cur->bc_levels[i].bp) 440 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); 441 else if (!error) 442 break; 443 } 444 445 /* 446 * If we are doing a BMBT update, the number of unaccounted blocks 447 * allocated during this cursor life time should be zero. If it's not 448 * zero, then we should be shut down or on our way to shutdown due to 449 * cancelling a dirty transaction on error. 450 */ 451 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || 452 xfs_is_shutdown(cur->bc_mp) || error != 0); 453 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) 454 kmem_free(cur->bc_ops); 455 if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) 456 xfs_perag_put(cur->bc_ag.pag); 457 kmem_cache_free(cur->bc_cache, cur); 458 } 459 460 /* 461 * Duplicate the btree cursor. 462 * Allocate a new one, copy the record, re-get the buffers. 463 */ 464 int /* error */ 465 xfs_btree_dup_cursor( 466 struct xfs_btree_cur *cur, /* input cursor */ 467 struct xfs_btree_cur **ncur) /* output cursor */ 468 { 469 struct xfs_buf *bp; /* btree block's buffer pointer */ 470 int error; /* error return value */ 471 int i; /* level number of btree block */ 472 xfs_mount_t *mp; /* mount structure for filesystem */ 473 struct xfs_btree_cur *new; /* new cursor value */ 474 xfs_trans_t *tp; /* transaction pointer, can be NULL */ 475 476 tp = cur->bc_tp; 477 mp = cur->bc_mp; 478 479 /* 480 * Allocate a new cursor like the old one. 481 */ 482 new = cur->bc_ops->dup_cursor(cur); 483 484 /* 485 * Copy the record currently in the cursor. 486 */ 487 new->bc_rec = cur->bc_rec; 488 489 /* 490 * For each level current, re-get the buffer and copy the ptr value. 491 */ 492 for (i = 0; i < new->bc_nlevels; i++) { 493 new->bc_levels[i].ptr = cur->bc_levels[i].ptr; 494 new->bc_levels[i].ra = cur->bc_levels[i].ra; 495 bp = cur->bc_levels[i].bp; 496 if (bp) { 497 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 498 xfs_buf_daddr(bp), mp->m_bsize, 499 0, &bp, 500 cur->bc_ops->buf_ops); 501 if (error) { 502 xfs_btree_del_cursor(new, error); 503 *ncur = NULL; 504 return error; 505 } 506 } 507 new->bc_levels[i].bp = bp; 508 } 509 *ncur = new; 510 return 0; 511 } 512 513 /* 514 * XFS btree block layout and addressing: 515 * 516 * There are two types of blocks in the btree: leaf and non-leaf blocks. 517 * 518 * The leaf record start with a header then followed by records containing 519 * the values. A non-leaf block also starts with the same header, and 520 * then first contains lookup keys followed by an equal number of pointers 521 * to the btree blocks at the previous level. 522 * 523 * +--------+-------+-------+-------+-------+-------+-------+ 524 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | 525 * +--------+-------+-------+-------+-------+-------+-------+ 526 * 527 * +--------+-------+-------+-------+-------+-------+-------+ 528 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | 529 * +--------+-------+-------+-------+-------+-------+-------+ 530 * 531 * The header is called struct xfs_btree_block for reasons better left unknown 532 * and comes in different versions for short (32bit) and long (64bit) block 533 * pointers. The record and key structures are defined by the btree instances 534 * and opaque to the btree core. The block pointers are simple disk endian 535 * integers, available in a short (32bit) and long (64bit) variant. 536 * 537 * The helpers below calculate the offset of a given record, key or pointer 538 * into a btree block (xfs_btree_*_offset) or return a pointer to the given 539 * record, key or pointer (xfs_btree_*_addr). Note that all addressing 540 * inside the btree block is done using indices starting at one, not zero! 541 * 542 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing 543 * overlapping intervals. In such a tree, records are still sorted lowest to 544 * highest and indexed by the smallest key value that refers to the record. 545 * However, nodes are different: each pointer has two associated keys -- one 546 * indexing the lowest key available in the block(s) below (the same behavior 547 * as the key in a regular btree) and another indexing the highest key 548 * available in the block(s) below. Because records are /not/ sorted by the 549 * highest key, all leaf block updates require us to compute the highest key 550 * that matches any record in the leaf and to recursively update the high keys 551 * in the nodes going further up in the tree, if necessary. Nodes look like 552 * this: 553 * 554 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 555 * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | 556 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 557 * 558 * To perform an interval query on an overlapped tree, perform the usual 559 * depth-first search and use the low and high keys to decide if we can skip 560 * that particular node. If a leaf node is reached, return the records that 561 * intersect the interval. Note that an interval query may return numerous 562 * entries. For a non-overlapped tree, simply search for the record associated 563 * with the lowest key and iterate forward until a non-matching record is 564 * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by 565 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in 566 * more detail. 567 * 568 * Why do we care about overlapping intervals? Let's say you have a bunch of 569 * reverse mapping records on a reflink filesystem: 570 * 571 * 1: +- file A startblock B offset C length D -----------+ 572 * 2: +- file E startblock F offset G length H --------------+ 573 * 3: +- file I startblock F offset J length K --+ 574 * 4: +- file L... --+ 575 * 576 * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, 577 * we'd simply increment the length of record 1. But how do we find the record 578 * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return 579 * record 3 because the keys are ordered first by startblock. An interval 580 * query would return records 1 and 2 because they both overlap (B+D-1), and 581 * from that we can pick out record 1 as the appropriate left neighbor. 582 * 583 * In the non-overlapped case you can do a LE lookup and decrement the cursor 584 * because a record's interval must end before the next record. 585 */ 586 587 /* 588 * Return size of the btree block header for this btree instance. 589 */ 590 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) 591 { 592 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 593 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 594 return XFS_BTREE_LBLOCK_CRC_LEN; 595 return XFS_BTREE_LBLOCK_LEN; 596 } 597 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 598 return XFS_BTREE_SBLOCK_CRC_LEN; 599 return XFS_BTREE_SBLOCK_LEN; 600 } 601 602 /* 603 * Return size of btree block pointers for this btree instance. 604 */ 605 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) 606 { 607 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 608 sizeof(__be64) : sizeof(__be32); 609 } 610 611 /* 612 * Calculate offset of the n-th record in a btree block. 613 */ 614 STATIC size_t 615 xfs_btree_rec_offset( 616 struct xfs_btree_cur *cur, 617 int n) 618 { 619 return xfs_btree_block_len(cur) + 620 (n - 1) * cur->bc_ops->rec_len; 621 } 622 623 /* 624 * Calculate offset of the n-th key in a btree block. 625 */ 626 STATIC size_t 627 xfs_btree_key_offset( 628 struct xfs_btree_cur *cur, 629 int n) 630 { 631 return xfs_btree_block_len(cur) + 632 (n - 1) * cur->bc_ops->key_len; 633 } 634 635 /* 636 * Calculate offset of the n-th high key in a btree block. 637 */ 638 STATIC size_t 639 xfs_btree_high_key_offset( 640 struct xfs_btree_cur *cur, 641 int n) 642 { 643 return xfs_btree_block_len(cur) + 644 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 645 } 646 647 /* 648 * Calculate offset of the n-th block pointer in a btree block. 649 */ 650 STATIC size_t 651 xfs_btree_ptr_offset( 652 struct xfs_btree_cur *cur, 653 int n, 654 int level) 655 { 656 return xfs_btree_block_len(cur) + 657 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 658 (n - 1) * xfs_btree_ptr_len(cur); 659 } 660 661 /* 662 * Return a pointer to the n-th record in the btree block. 663 */ 664 union xfs_btree_rec * 665 xfs_btree_rec_addr( 666 struct xfs_btree_cur *cur, 667 int n, 668 struct xfs_btree_block *block) 669 { 670 return (union xfs_btree_rec *) 671 ((char *)block + xfs_btree_rec_offset(cur, n)); 672 } 673 674 /* 675 * Return a pointer to the n-th key in the btree block. 676 */ 677 union xfs_btree_key * 678 xfs_btree_key_addr( 679 struct xfs_btree_cur *cur, 680 int n, 681 struct xfs_btree_block *block) 682 { 683 return (union xfs_btree_key *) 684 ((char *)block + xfs_btree_key_offset(cur, n)); 685 } 686 687 /* 688 * Return a pointer to the n-th high key in the btree block. 689 */ 690 union xfs_btree_key * 691 xfs_btree_high_key_addr( 692 struct xfs_btree_cur *cur, 693 int n, 694 struct xfs_btree_block *block) 695 { 696 return (union xfs_btree_key *) 697 ((char *)block + xfs_btree_high_key_offset(cur, n)); 698 } 699 700 /* 701 * Return a pointer to the n-th block pointer in the btree block. 702 */ 703 union xfs_btree_ptr * 704 xfs_btree_ptr_addr( 705 struct xfs_btree_cur *cur, 706 int n, 707 struct xfs_btree_block *block) 708 { 709 int level = xfs_btree_get_level(block); 710 711 ASSERT(block->bb_level != 0); 712 713 return (union xfs_btree_ptr *) 714 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 715 } 716 717 struct xfs_ifork * 718 xfs_btree_ifork_ptr( 719 struct xfs_btree_cur *cur) 720 { 721 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 722 723 if (cur->bc_flags & XFS_BTREE_STAGING) 724 return cur->bc_ino.ifake->if_fork; 725 return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); 726 } 727 728 /* 729 * Get the root block which is stored in the inode. 730 * 731 * For now this btree implementation assumes the btree root is always 732 * stored in the if_broot field of an inode fork. 733 */ 734 STATIC struct xfs_btree_block * 735 xfs_btree_get_iroot( 736 struct xfs_btree_cur *cur) 737 { 738 struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 739 740 return (struct xfs_btree_block *)ifp->if_broot; 741 } 742 743 /* 744 * Retrieve the block pointer from the cursor at the given level. 745 * This may be an inode btree root or from a buffer. 746 */ 747 struct xfs_btree_block * /* generic btree block pointer */ 748 xfs_btree_get_block( 749 struct xfs_btree_cur *cur, /* btree cursor */ 750 int level, /* level in btree */ 751 struct xfs_buf **bpp) /* buffer containing the block */ 752 { 753 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 754 (level == cur->bc_nlevels - 1)) { 755 *bpp = NULL; 756 return xfs_btree_get_iroot(cur); 757 } 758 759 *bpp = cur->bc_levels[level].bp; 760 return XFS_BUF_TO_BLOCK(*bpp); 761 } 762 763 /* 764 * Change the cursor to point to the first record at the given level. 765 * Other levels are unaffected. 766 */ 767 STATIC int /* success=1, failure=0 */ 768 xfs_btree_firstrec( 769 struct xfs_btree_cur *cur, /* btree cursor */ 770 int level) /* level to change */ 771 { 772 struct xfs_btree_block *block; /* generic btree block pointer */ 773 struct xfs_buf *bp; /* buffer containing block */ 774 775 /* 776 * Get the block pointer for this level. 777 */ 778 block = xfs_btree_get_block(cur, level, &bp); 779 if (xfs_btree_check_block(cur, block, level, bp)) 780 return 0; 781 /* 782 * It's empty, there is no such record. 783 */ 784 if (!block->bb_numrecs) 785 return 0; 786 /* 787 * Set the ptr value to 1, that's the first record/key. 788 */ 789 cur->bc_levels[level].ptr = 1; 790 return 1; 791 } 792 793 /* 794 * Change the cursor to point to the last record in the current block 795 * at the given level. Other levels are unaffected. 796 */ 797 STATIC int /* success=1, failure=0 */ 798 xfs_btree_lastrec( 799 struct xfs_btree_cur *cur, /* btree cursor */ 800 int level) /* level to change */ 801 { 802 struct xfs_btree_block *block; /* generic btree block pointer */ 803 struct xfs_buf *bp; /* buffer containing block */ 804 805 /* 806 * Get the block pointer for this level. 807 */ 808 block = xfs_btree_get_block(cur, level, &bp); 809 if (xfs_btree_check_block(cur, block, level, bp)) 810 return 0; 811 /* 812 * It's empty, there is no such record. 813 */ 814 if (!block->bb_numrecs) 815 return 0; 816 /* 817 * Set the ptr value to numrecs, that's the last record/key. 818 */ 819 cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); 820 return 1; 821 } 822 823 /* 824 * Compute first and last byte offsets for the fields given. 825 * Interprets the offsets table, which contains struct field offsets. 826 */ 827 void 828 xfs_btree_offsets( 829 uint32_t fields, /* bitmask of fields */ 830 const short *offsets, /* table of field offsets */ 831 int nbits, /* number of bits to inspect */ 832 int *first, /* output: first byte offset */ 833 int *last) /* output: last byte offset */ 834 { 835 int i; /* current bit number */ 836 uint32_t imask; /* mask for current bit number */ 837 838 ASSERT(fields != 0); 839 /* 840 * Find the lowest bit, so the first byte offset. 841 */ 842 for (i = 0, imask = 1u; ; i++, imask <<= 1) { 843 if (imask & fields) { 844 *first = offsets[i]; 845 break; 846 } 847 } 848 /* 849 * Find the highest bit, so the last byte offset. 850 */ 851 for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { 852 if (imask & fields) { 853 *last = offsets[i + 1] - 1; 854 break; 855 } 856 } 857 } 858 859 /* 860 * Get a buffer for the block, return it read in. 861 * Long-form addressing. 862 */ 863 int 864 xfs_btree_read_bufl( 865 struct xfs_mount *mp, /* file system mount point */ 866 struct xfs_trans *tp, /* transaction pointer */ 867 xfs_fsblock_t fsbno, /* file system block number */ 868 struct xfs_buf **bpp, /* buffer for fsbno */ 869 int refval, /* ref count value for buffer */ 870 const struct xfs_buf_ops *ops) 871 { 872 struct xfs_buf *bp; /* return value */ 873 xfs_daddr_t d; /* real disk block address */ 874 int error; 875 876 if (!xfs_verify_fsbno(mp, fsbno)) 877 return -EFSCORRUPTED; 878 d = XFS_FSB_TO_DADDR(mp, fsbno); 879 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 880 mp->m_bsize, 0, &bp, ops); 881 if (error) 882 return error; 883 if (bp) 884 xfs_buf_set_ref(bp, refval); 885 *bpp = bp; 886 return 0; 887 } 888 889 /* 890 * Read-ahead the block, don't wait for it, don't return a buffer. 891 * Long-form addressing. 892 */ 893 /* ARGSUSED */ 894 void 895 xfs_btree_reada_bufl( 896 struct xfs_mount *mp, /* file system mount point */ 897 xfs_fsblock_t fsbno, /* file system block number */ 898 xfs_extlen_t count, /* count of filesystem blocks */ 899 const struct xfs_buf_ops *ops) 900 { 901 xfs_daddr_t d; 902 903 ASSERT(fsbno != NULLFSBLOCK); 904 d = XFS_FSB_TO_DADDR(mp, fsbno); 905 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 906 } 907 908 /* 909 * Read-ahead the block, don't wait for it, don't return a buffer. 910 * Short-form addressing. 911 */ 912 /* ARGSUSED */ 913 void 914 xfs_btree_reada_bufs( 915 struct xfs_mount *mp, /* file system mount point */ 916 xfs_agnumber_t agno, /* allocation group number */ 917 xfs_agblock_t agbno, /* allocation group block number */ 918 xfs_extlen_t count, /* count of filesystem blocks */ 919 const struct xfs_buf_ops *ops) 920 { 921 xfs_daddr_t d; 922 923 ASSERT(agno != NULLAGNUMBER); 924 ASSERT(agbno != NULLAGBLOCK); 925 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 926 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 927 } 928 929 STATIC int 930 xfs_btree_readahead_lblock( 931 struct xfs_btree_cur *cur, 932 int lr, 933 struct xfs_btree_block *block) 934 { 935 int rval = 0; 936 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 937 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 938 939 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 940 xfs_btree_reada_bufl(cur->bc_mp, left, 1, 941 cur->bc_ops->buf_ops); 942 rval++; 943 } 944 945 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 946 xfs_btree_reada_bufl(cur->bc_mp, right, 1, 947 cur->bc_ops->buf_ops); 948 rval++; 949 } 950 951 return rval; 952 } 953 954 STATIC int 955 xfs_btree_readahead_sblock( 956 struct xfs_btree_cur *cur, 957 int lr, 958 struct xfs_btree_block *block) 959 { 960 int rval = 0; 961 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); 962 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); 963 964 965 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 966 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, 967 left, 1, cur->bc_ops->buf_ops); 968 rval++; 969 } 970 971 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 972 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, 973 right, 1, cur->bc_ops->buf_ops); 974 rval++; 975 } 976 977 return rval; 978 } 979 980 /* 981 * Read-ahead btree blocks, at the given level. 982 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 983 */ 984 STATIC int 985 xfs_btree_readahead( 986 struct xfs_btree_cur *cur, /* btree cursor */ 987 int lev, /* level in btree */ 988 int lr) /* left/right bits */ 989 { 990 struct xfs_btree_block *block; 991 992 /* 993 * No readahead needed if we are at the root level and the 994 * btree root is stored in the inode. 995 */ 996 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 997 (lev == cur->bc_nlevels - 1)) 998 return 0; 999 1000 if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) 1001 return 0; 1002 1003 cur->bc_levels[lev].ra |= lr; 1004 block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); 1005 1006 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1007 return xfs_btree_readahead_lblock(cur, lr, block); 1008 return xfs_btree_readahead_sblock(cur, lr, block); 1009 } 1010 1011 STATIC int 1012 xfs_btree_ptr_to_daddr( 1013 struct xfs_btree_cur *cur, 1014 const union xfs_btree_ptr *ptr, 1015 xfs_daddr_t *daddr) 1016 { 1017 xfs_fsblock_t fsbno; 1018 xfs_agblock_t agbno; 1019 int error; 1020 1021 error = xfs_btree_check_ptr(cur, ptr, 0, 1); 1022 if (error) 1023 return error; 1024 1025 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1026 fsbno = be64_to_cpu(ptr->l); 1027 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); 1028 } else { 1029 agbno = be32_to_cpu(ptr->s); 1030 *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, 1031 agbno); 1032 } 1033 1034 return 0; 1035 } 1036 1037 /* 1038 * Readahead @count btree blocks at the given @ptr location. 1039 * 1040 * We don't need to care about long or short form btrees here as we have a 1041 * method of converting the ptr directly to a daddr available to us. 1042 */ 1043 STATIC void 1044 xfs_btree_readahead_ptr( 1045 struct xfs_btree_cur *cur, 1046 union xfs_btree_ptr *ptr, 1047 xfs_extlen_t count) 1048 { 1049 xfs_daddr_t daddr; 1050 1051 if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) 1052 return; 1053 xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, 1054 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); 1055 } 1056 1057 /* 1058 * Set the buffer for level "lev" in the cursor to bp, releasing 1059 * any previous buffer. 1060 */ 1061 STATIC void 1062 xfs_btree_setbuf( 1063 struct xfs_btree_cur *cur, /* btree cursor */ 1064 int lev, /* level in btree */ 1065 struct xfs_buf *bp) /* new buffer to set */ 1066 { 1067 struct xfs_btree_block *b; /* btree block */ 1068 1069 if (cur->bc_levels[lev].bp) 1070 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); 1071 cur->bc_levels[lev].bp = bp; 1072 cur->bc_levels[lev].ra = 0; 1073 1074 b = XFS_BUF_TO_BLOCK(bp); 1075 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1076 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) 1077 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1078 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) 1079 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1080 } else { 1081 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 1082 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1083 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) 1084 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1085 } 1086 } 1087 1088 bool 1089 xfs_btree_ptr_is_null( 1090 struct xfs_btree_cur *cur, 1091 const union xfs_btree_ptr *ptr) 1092 { 1093 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1094 return ptr->l == cpu_to_be64(NULLFSBLOCK); 1095 else 1096 return ptr->s == cpu_to_be32(NULLAGBLOCK); 1097 } 1098 1099 void 1100 xfs_btree_set_ptr_null( 1101 struct xfs_btree_cur *cur, 1102 union xfs_btree_ptr *ptr) 1103 { 1104 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1105 ptr->l = cpu_to_be64(NULLFSBLOCK); 1106 else 1107 ptr->s = cpu_to_be32(NULLAGBLOCK); 1108 } 1109 1110 /* 1111 * Get/set/init sibling pointers 1112 */ 1113 void 1114 xfs_btree_get_sibling( 1115 struct xfs_btree_cur *cur, 1116 struct xfs_btree_block *block, 1117 union xfs_btree_ptr *ptr, 1118 int lr) 1119 { 1120 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1121 1122 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1123 if (lr == XFS_BB_RIGHTSIB) 1124 ptr->l = block->bb_u.l.bb_rightsib; 1125 else 1126 ptr->l = block->bb_u.l.bb_leftsib; 1127 } else { 1128 if (lr == XFS_BB_RIGHTSIB) 1129 ptr->s = block->bb_u.s.bb_rightsib; 1130 else 1131 ptr->s = block->bb_u.s.bb_leftsib; 1132 } 1133 } 1134 1135 void 1136 xfs_btree_set_sibling( 1137 struct xfs_btree_cur *cur, 1138 struct xfs_btree_block *block, 1139 const union xfs_btree_ptr *ptr, 1140 int lr) 1141 { 1142 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1143 1144 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1145 if (lr == XFS_BB_RIGHTSIB) 1146 block->bb_u.l.bb_rightsib = ptr->l; 1147 else 1148 block->bb_u.l.bb_leftsib = ptr->l; 1149 } else { 1150 if (lr == XFS_BB_RIGHTSIB) 1151 block->bb_u.s.bb_rightsib = ptr->s; 1152 else 1153 block->bb_u.s.bb_leftsib = ptr->s; 1154 } 1155 } 1156 1157 void 1158 xfs_btree_init_block_int( 1159 struct xfs_mount *mp, 1160 struct xfs_btree_block *buf, 1161 xfs_daddr_t blkno, 1162 xfs_btnum_t btnum, 1163 __u16 level, 1164 __u16 numrecs, 1165 __u64 owner, 1166 unsigned int flags) 1167 { 1168 int crc = xfs_has_crc(mp); 1169 __u32 magic = xfs_btree_magic(crc, btnum); 1170 1171 buf->bb_magic = cpu_to_be32(magic); 1172 buf->bb_level = cpu_to_be16(level); 1173 buf->bb_numrecs = cpu_to_be16(numrecs); 1174 1175 if (flags & XFS_BTREE_LONG_PTRS) { 1176 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 1177 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 1178 if (crc) { 1179 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1180 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1181 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); 1182 buf->bb_u.l.bb_pad = 0; 1183 buf->bb_u.l.bb_lsn = 0; 1184 } 1185 } else { 1186 /* owner is a 32 bit value on short blocks */ 1187 __u32 __owner = (__u32)owner; 1188 1189 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 1190 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 1191 if (crc) { 1192 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1193 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 1194 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1195 buf->bb_u.s.bb_lsn = 0; 1196 } 1197 } 1198 } 1199 1200 void 1201 xfs_btree_init_block( 1202 struct xfs_mount *mp, 1203 struct xfs_buf *bp, 1204 xfs_btnum_t btnum, 1205 __u16 level, 1206 __u16 numrecs, 1207 __u64 owner) 1208 { 1209 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), 1210 btnum, level, numrecs, owner, 0); 1211 } 1212 1213 void 1214 xfs_btree_init_block_cur( 1215 struct xfs_btree_cur *cur, 1216 struct xfs_buf *bp, 1217 int level, 1218 int numrecs) 1219 { 1220 __u64 owner; 1221 1222 /* 1223 * we can pull the owner from the cursor right now as the different 1224 * owners align directly with the pointer size of the btree. This may 1225 * change in future, but is safe for current users of the generic btree 1226 * code. 1227 */ 1228 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1229 owner = cur->bc_ino.ip->i_ino; 1230 else 1231 owner = cur->bc_ag.pag->pag_agno; 1232 1233 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), 1234 xfs_buf_daddr(bp), cur->bc_btnum, level, 1235 numrecs, owner, cur->bc_flags); 1236 } 1237 1238 /* 1239 * Return true if ptr is the last record in the btree and 1240 * we need to track updates to this record. The decision 1241 * will be further refined in the update_lastrec method. 1242 */ 1243 STATIC int 1244 xfs_btree_is_lastrec( 1245 struct xfs_btree_cur *cur, 1246 struct xfs_btree_block *block, 1247 int level) 1248 { 1249 union xfs_btree_ptr ptr; 1250 1251 if (level > 0) 1252 return 0; 1253 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) 1254 return 0; 1255 1256 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1257 if (!xfs_btree_ptr_is_null(cur, &ptr)) 1258 return 0; 1259 return 1; 1260 } 1261 1262 STATIC void 1263 xfs_btree_buf_to_ptr( 1264 struct xfs_btree_cur *cur, 1265 struct xfs_buf *bp, 1266 union xfs_btree_ptr *ptr) 1267 { 1268 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1269 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1270 xfs_buf_daddr(bp))); 1271 else { 1272 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1273 xfs_buf_daddr(bp))); 1274 } 1275 } 1276 1277 STATIC void 1278 xfs_btree_set_refs( 1279 struct xfs_btree_cur *cur, 1280 struct xfs_buf *bp) 1281 { 1282 switch (cur->bc_btnum) { 1283 case XFS_BTNUM_BNO: 1284 case XFS_BTNUM_CNT: 1285 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); 1286 break; 1287 case XFS_BTNUM_INO: 1288 case XFS_BTNUM_FINO: 1289 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); 1290 break; 1291 case XFS_BTNUM_BMAP: 1292 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); 1293 break; 1294 case XFS_BTNUM_RMAP: 1295 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); 1296 break; 1297 case XFS_BTNUM_REFC: 1298 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); 1299 break; 1300 default: 1301 ASSERT(0); 1302 } 1303 } 1304 1305 int 1306 xfs_btree_get_buf_block( 1307 struct xfs_btree_cur *cur, 1308 const union xfs_btree_ptr *ptr, 1309 struct xfs_btree_block **block, 1310 struct xfs_buf **bpp) 1311 { 1312 struct xfs_mount *mp = cur->bc_mp; 1313 xfs_daddr_t d; 1314 int error; 1315 1316 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1317 if (error) 1318 return error; 1319 error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, 1320 0, bpp); 1321 if (error) 1322 return error; 1323 1324 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1325 *block = XFS_BUF_TO_BLOCK(*bpp); 1326 return 0; 1327 } 1328 1329 /* 1330 * Read in the buffer at the given ptr and return the buffer and 1331 * the block pointer within the buffer. 1332 */ 1333 STATIC int 1334 xfs_btree_read_buf_block( 1335 struct xfs_btree_cur *cur, 1336 const union xfs_btree_ptr *ptr, 1337 int flags, 1338 struct xfs_btree_block **block, 1339 struct xfs_buf **bpp) 1340 { 1341 struct xfs_mount *mp = cur->bc_mp; 1342 xfs_daddr_t d; 1343 int error; 1344 1345 /* need to sort out how callers deal with failures first */ 1346 ASSERT(!(flags & XBF_TRYLOCK)); 1347 1348 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1349 if (error) 1350 return error; 1351 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1352 mp->m_bsize, flags, bpp, 1353 cur->bc_ops->buf_ops); 1354 if (error) 1355 return error; 1356 1357 xfs_btree_set_refs(cur, *bpp); 1358 *block = XFS_BUF_TO_BLOCK(*bpp); 1359 return 0; 1360 } 1361 1362 /* 1363 * Copy keys from one btree block to another. 1364 */ 1365 void 1366 xfs_btree_copy_keys( 1367 struct xfs_btree_cur *cur, 1368 union xfs_btree_key *dst_key, 1369 const union xfs_btree_key *src_key, 1370 int numkeys) 1371 { 1372 ASSERT(numkeys >= 0); 1373 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1374 } 1375 1376 /* 1377 * Copy records from one btree block to another. 1378 */ 1379 STATIC void 1380 xfs_btree_copy_recs( 1381 struct xfs_btree_cur *cur, 1382 union xfs_btree_rec *dst_rec, 1383 union xfs_btree_rec *src_rec, 1384 int numrecs) 1385 { 1386 ASSERT(numrecs >= 0); 1387 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1388 } 1389 1390 /* 1391 * Copy block pointers from one btree block to another. 1392 */ 1393 void 1394 xfs_btree_copy_ptrs( 1395 struct xfs_btree_cur *cur, 1396 union xfs_btree_ptr *dst_ptr, 1397 const union xfs_btree_ptr *src_ptr, 1398 int numptrs) 1399 { 1400 ASSERT(numptrs >= 0); 1401 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); 1402 } 1403 1404 /* 1405 * Shift keys one index left/right inside a single btree block. 1406 */ 1407 STATIC void 1408 xfs_btree_shift_keys( 1409 struct xfs_btree_cur *cur, 1410 union xfs_btree_key *key, 1411 int dir, 1412 int numkeys) 1413 { 1414 char *dst_key; 1415 1416 ASSERT(numkeys >= 0); 1417 ASSERT(dir == 1 || dir == -1); 1418 1419 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1420 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1421 } 1422 1423 /* 1424 * Shift records one index left/right inside a single btree block. 1425 */ 1426 STATIC void 1427 xfs_btree_shift_recs( 1428 struct xfs_btree_cur *cur, 1429 union xfs_btree_rec *rec, 1430 int dir, 1431 int numrecs) 1432 { 1433 char *dst_rec; 1434 1435 ASSERT(numrecs >= 0); 1436 ASSERT(dir == 1 || dir == -1); 1437 1438 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1439 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1440 } 1441 1442 /* 1443 * Shift block pointers one index left/right inside a single btree block. 1444 */ 1445 STATIC void 1446 xfs_btree_shift_ptrs( 1447 struct xfs_btree_cur *cur, 1448 union xfs_btree_ptr *ptr, 1449 int dir, 1450 int numptrs) 1451 { 1452 char *dst_ptr; 1453 1454 ASSERT(numptrs >= 0); 1455 ASSERT(dir == 1 || dir == -1); 1456 1457 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); 1458 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); 1459 } 1460 1461 /* 1462 * Log key values from the btree block. 1463 */ 1464 STATIC void 1465 xfs_btree_log_keys( 1466 struct xfs_btree_cur *cur, 1467 struct xfs_buf *bp, 1468 int first, 1469 int last) 1470 { 1471 1472 if (bp) { 1473 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1474 xfs_trans_log_buf(cur->bc_tp, bp, 1475 xfs_btree_key_offset(cur, first), 1476 xfs_btree_key_offset(cur, last + 1) - 1); 1477 } else { 1478 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1479 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1480 } 1481 } 1482 1483 /* 1484 * Log record values from the btree block. 1485 */ 1486 void 1487 xfs_btree_log_recs( 1488 struct xfs_btree_cur *cur, 1489 struct xfs_buf *bp, 1490 int first, 1491 int last) 1492 { 1493 1494 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1495 xfs_trans_log_buf(cur->bc_tp, bp, 1496 xfs_btree_rec_offset(cur, first), 1497 xfs_btree_rec_offset(cur, last + 1) - 1); 1498 1499 } 1500 1501 /* 1502 * Log block pointer fields from a btree block (nonleaf). 1503 */ 1504 STATIC void 1505 xfs_btree_log_ptrs( 1506 struct xfs_btree_cur *cur, /* btree cursor */ 1507 struct xfs_buf *bp, /* buffer containing btree block */ 1508 int first, /* index of first pointer to log */ 1509 int last) /* index of last pointer to log */ 1510 { 1511 1512 if (bp) { 1513 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1514 int level = xfs_btree_get_level(block); 1515 1516 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1517 xfs_trans_log_buf(cur->bc_tp, bp, 1518 xfs_btree_ptr_offset(cur, first, level), 1519 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1520 } else { 1521 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1522 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1523 } 1524 1525 } 1526 1527 /* 1528 * Log fields from a btree block header. 1529 */ 1530 void 1531 xfs_btree_log_block( 1532 struct xfs_btree_cur *cur, /* btree cursor */ 1533 struct xfs_buf *bp, /* buffer containing btree block */ 1534 uint32_t fields) /* mask of fields: XFS_BB_... */ 1535 { 1536 int first; /* first byte offset logged */ 1537 int last; /* last byte offset logged */ 1538 static const short soffsets[] = { /* table of offsets (short) */ 1539 offsetof(struct xfs_btree_block, bb_magic), 1540 offsetof(struct xfs_btree_block, bb_level), 1541 offsetof(struct xfs_btree_block, bb_numrecs), 1542 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1543 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1544 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1545 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1546 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1547 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1548 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1549 XFS_BTREE_SBLOCK_CRC_LEN 1550 }; 1551 static const short loffsets[] = { /* table of offsets (long) */ 1552 offsetof(struct xfs_btree_block, bb_magic), 1553 offsetof(struct xfs_btree_block, bb_level), 1554 offsetof(struct xfs_btree_block, bb_numrecs), 1555 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1556 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1557 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1558 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1559 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1560 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1561 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1562 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1563 XFS_BTREE_LBLOCK_CRC_LEN 1564 }; 1565 1566 if (bp) { 1567 int nbits; 1568 1569 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 1570 /* 1571 * We don't log the CRC when updating a btree 1572 * block but instead recreate it during log 1573 * recovery. As the log buffers have checksums 1574 * of their own this is safe and avoids logging a crc 1575 * update in a lot of places. 1576 */ 1577 if (fields == XFS_BB_ALL_BITS) 1578 fields = XFS_BB_ALL_BITS_CRC; 1579 nbits = XFS_BB_NUM_BITS_CRC; 1580 } else { 1581 nbits = XFS_BB_NUM_BITS; 1582 } 1583 xfs_btree_offsets(fields, 1584 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 1585 loffsets : soffsets, 1586 nbits, &first, &last); 1587 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1588 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1589 } else { 1590 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1591 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1592 } 1593 } 1594 1595 /* 1596 * Increment cursor by one record at the level. 1597 * For nonzero levels the leaf-ward information is untouched. 1598 */ 1599 int /* error */ 1600 xfs_btree_increment( 1601 struct xfs_btree_cur *cur, 1602 int level, 1603 int *stat) /* success/failure */ 1604 { 1605 struct xfs_btree_block *block; 1606 union xfs_btree_ptr ptr; 1607 struct xfs_buf *bp; 1608 int error; /* error return value */ 1609 int lev; 1610 1611 ASSERT(level < cur->bc_nlevels); 1612 1613 /* Read-ahead to the right at this level. */ 1614 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1615 1616 /* Get a pointer to the btree block. */ 1617 block = xfs_btree_get_block(cur, level, &bp); 1618 1619 #ifdef DEBUG 1620 error = xfs_btree_check_block(cur, block, level, bp); 1621 if (error) 1622 goto error0; 1623 #endif 1624 1625 /* We're done if we remain in the block after the increment. */ 1626 if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) 1627 goto out1; 1628 1629 /* Fail if we just went off the right edge of the tree. */ 1630 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1631 if (xfs_btree_ptr_is_null(cur, &ptr)) 1632 goto out0; 1633 1634 XFS_BTREE_STATS_INC(cur, increment); 1635 1636 /* 1637 * March up the tree incrementing pointers. 1638 * Stop when we don't go off the right edge of a block. 1639 */ 1640 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1641 block = xfs_btree_get_block(cur, lev, &bp); 1642 1643 #ifdef DEBUG 1644 error = xfs_btree_check_block(cur, block, lev, bp); 1645 if (error) 1646 goto error0; 1647 #endif 1648 1649 if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) 1650 break; 1651 1652 /* Read-ahead the right block for the next loop. */ 1653 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1654 } 1655 1656 /* 1657 * If we went off the root then we are either seriously 1658 * confused or have the tree root in an inode. 1659 */ 1660 if (lev == cur->bc_nlevels) { 1661 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1662 goto out0; 1663 ASSERT(0); 1664 error = -EFSCORRUPTED; 1665 goto error0; 1666 } 1667 ASSERT(lev < cur->bc_nlevels); 1668 1669 /* 1670 * Now walk back down the tree, fixing up the cursor's buffer 1671 * pointers and key numbers. 1672 */ 1673 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1674 union xfs_btree_ptr *ptrp; 1675 1676 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1677 --lev; 1678 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1679 if (error) 1680 goto error0; 1681 1682 xfs_btree_setbuf(cur, lev, bp); 1683 cur->bc_levels[lev].ptr = 1; 1684 } 1685 out1: 1686 *stat = 1; 1687 return 0; 1688 1689 out0: 1690 *stat = 0; 1691 return 0; 1692 1693 error0: 1694 return error; 1695 } 1696 1697 /* 1698 * Decrement cursor by one record at the level. 1699 * For nonzero levels the leaf-ward information is untouched. 1700 */ 1701 int /* error */ 1702 xfs_btree_decrement( 1703 struct xfs_btree_cur *cur, 1704 int level, 1705 int *stat) /* success/failure */ 1706 { 1707 struct xfs_btree_block *block; 1708 struct xfs_buf *bp; 1709 int error; /* error return value */ 1710 int lev; 1711 union xfs_btree_ptr ptr; 1712 1713 ASSERT(level < cur->bc_nlevels); 1714 1715 /* Read-ahead to the left at this level. */ 1716 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1717 1718 /* We're done if we remain in the block after the decrement. */ 1719 if (--cur->bc_levels[level].ptr > 0) 1720 goto out1; 1721 1722 /* Get a pointer to the btree block. */ 1723 block = xfs_btree_get_block(cur, level, &bp); 1724 1725 #ifdef DEBUG 1726 error = xfs_btree_check_block(cur, block, level, bp); 1727 if (error) 1728 goto error0; 1729 #endif 1730 1731 /* Fail if we just went off the left edge of the tree. */ 1732 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1733 if (xfs_btree_ptr_is_null(cur, &ptr)) 1734 goto out0; 1735 1736 XFS_BTREE_STATS_INC(cur, decrement); 1737 1738 /* 1739 * March up the tree decrementing pointers. 1740 * Stop when we don't go off the left edge of a block. 1741 */ 1742 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1743 if (--cur->bc_levels[lev].ptr > 0) 1744 break; 1745 /* Read-ahead the left block for the next loop. */ 1746 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1747 } 1748 1749 /* 1750 * If we went off the root then we are seriously confused. 1751 * or the root of the tree is in an inode. 1752 */ 1753 if (lev == cur->bc_nlevels) { 1754 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1755 goto out0; 1756 ASSERT(0); 1757 error = -EFSCORRUPTED; 1758 goto error0; 1759 } 1760 ASSERT(lev < cur->bc_nlevels); 1761 1762 /* 1763 * Now walk back down the tree, fixing up the cursor's buffer 1764 * pointers and key numbers. 1765 */ 1766 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1767 union xfs_btree_ptr *ptrp; 1768 1769 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1770 --lev; 1771 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1772 if (error) 1773 goto error0; 1774 xfs_btree_setbuf(cur, lev, bp); 1775 cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); 1776 } 1777 out1: 1778 *stat = 1; 1779 return 0; 1780 1781 out0: 1782 *stat = 0; 1783 return 0; 1784 1785 error0: 1786 return error; 1787 } 1788 1789 int 1790 xfs_btree_lookup_get_block( 1791 struct xfs_btree_cur *cur, /* btree cursor */ 1792 int level, /* level in the btree */ 1793 const union xfs_btree_ptr *pp, /* ptr to btree block */ 1794 struct xfs_btree_block **blkp) /* return btree block */ 1795 { 1796 struct xfs_buf *bp; /* buffer pointer for btree block */ 1797 xfs_daddr_t daddr; 1798 int error = 0; 1799 1800 /* special case the root block if in an inode */ 1801 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 1802 (level == cur->bc_nlevels - 1)) { 1803 *blkp = xfs_btree_get_iroot(cur); 1804 return 0; 1805 } 1806 1807 /* 1808 * If the old buffer at this level for the disk address we are 1809 * looking for re-use it. 1810 * 1811 * Otherwise throw it away and get a new one. 1812 */ 1813 bp = cur->bc_levels[level].bp; 1814 error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); 1815 if (error) 1816 return error; 1817 if (bp && xfs_buf_daddr(bp) == daddr) { 1818 *blkp = XFS_BUF_TO_BLOCK(bp); 1819 return 0; 1820 } 1821 1822 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1823 if (error) 1824 return error; 1825 1826 /* Check the inode owner since the verifiers don't. */ 1827 if (xfs_has_crc(cur->bc_mp) && 1828 !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && 1829 (cur->bc_flags & XFS_BTREE_LONG_PTRS) && 1830 be64_to_cpu((*blkp)->bb_u.l.bb_owner) != 1831 cur->bc_ino.ip->i_ino) 1832 goto out_bad; 1833 1834 /* Did we get the level we were looking for? */ 1835 if (be16_to_cpu((*blkp)->bb_level) != level) 1836 goto out_bad; 1837 1838 /* Check that internal nodes have at least one record. */ 1839 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1840 goto out_bad; 1841 1842 xfs_btree_setbuf(cur, level, bp); 1843 return 0; 1844 1845 out_bad: 1846 *blkp = NULL; 1847 xfs_buf_mark_corrupt(bp); 1848 xfs_trans_brelse(cur->bc_tp, bp); 1849 return -EFSCORRUPTED; 1850 } 1851 1852 /* 1853 * Get current search key. For level 0 we don't actually have a key 1854 * structure so we make one up from the record. For all other levels 1855 * we just return the right key. 1856 */ 1857 STATIC union xfs_btree_key * 1858 xfs_lookup_get_search_key( 1859 struct xfs_btree_cur *cur, 1860 int level, 1861 int keyno, 1862 struct xfs_btree_block *block, 1863 union xfs_btree_key *kp) 1864 { 1865 if (level == 0) { 1866 cur->bc_ops->init_key_from_rec(kp, 1867 xfs_btree_rec_addr(cur, keyno, block)); 1868 return kp; 1869 } 1870 1871 return xfs_btree_key_addr(cur, keyno, block); 1872 } 1873 1874 /* 1875 * Lookup the record. The cursor is made to point to it, based on dir. 1876 * stat is set to 0 if can't find any such record, 1 for success. 1877 */ 1878 int /* error */ 1879 xfs_btree_lookup( 1880 struct xfs_btree_cur *cur, /* btree cursor */ 1881 xfs_lookup_t dir, /* <=, ==, or >= */ 1882 int *stat) /* success/failure */ 1883 { 1884 struct xfs_btree_block *block; /* current btree block */ 1885 int64_t diff; /* difference for the current key */ 1886 int error; /* error return value */ 1887 int keyno; /* current key number */ 1888 int level; /* level in the btree */ 1889 union xfs_btree_ptr *pp; /* ptr to btree block */ 1890 union xfs_btree_ptr ptr; /* ptr to btree block */ 1891 1892 XFS_BTREE_STATS_INC(cur, lookup); 1893 1894 /* No such thing as a zero-level tree. */ 1895 if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) 1896 return -EFSCORRUPTED; 1897 1898 block = NULL; 1899 keyno = 0; 1900 1901 /* initialise start pointer from cursor */ 1902 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 1903 pp = &ptr; 1904 1905 /* 1906 * Iterate over each level in the btree, starting at the root. 1907 * For each level above the leaves, find the key we need, based 1908 * on the lookup record, then follow the corresponding block 1909 * pointer down to the next level. 1910 */ 1911 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { 1912 /* Get the block we need to do the lookup on. */ 1913 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 1914 if (error) 1915 goto error0; 1916 1917 if (diff == 0) { 1918 /* 1919 * If we already had a key match at a higher level, we 1920 * know we need to use the first entry in this block. 1921 */ 1922 keyno = 1; 1923 } else { 1924 /* Otherwise search this block. Do a binary search. */ 1925 1926 int high; /* high entry number */ 1927 int low; /* low entry number */ 1928 1929 /* Set low and high entry numbers, 1-based. */ 1930 low = 1; 1931 high = xfs_btree_get_numrecs(block); 1932 if (!high) { 1933 /* Block is empty, must be an empty leaf. */ 1934 if (level != 0 || cur->bc_nlevels != 1) { 1935 XFS_CORRUPTION_ERROR(__func__, 1936 XFS_ERRLEVEL_LOW, 1937 cur->bc_mp, block, 1938 sizeof(*block)); 1939 return -EFSCORRUPTED; 1940 } 1941 1942 cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; 1943 *stat = 0; 1944 return 0; 1945 } 1946 1947 /* Binary search the block. */ 1948 while (low <= high) { 1949 union xfs_btree_key key; 1950 union xfs_btree_key *kp; 1951 1952 XFS_BTREE_STATS_INC(cur, compare); 1953 1954 /* keyno is average of low and high. */ 1955 keyno = (low + high) >> 1; 1956 1957 /* Get current search key */ 1958 kp = xfs_lookup_get_search_key(cur, level, 1959 keyno, block, &key); 1960 1961 /* 1962 * Compute difference to get next direction: 1963 * - less than, move right 1964 * - greater than, move left 1965 * - equal, we're done 1966 */ 1967 diff = cur->bc_ops->key_diff(cur, kp); 1968 if (diff < 0) 1969 low = keyno + 1; 1970 else if (diff > 0) 1971 high = keyno - 1; 1972 else 1973 break; 1974 } 1975 } 1976 1977 /* 1978 * If there are more levels, set up for the next level 1979 * by getting the block number and filling in the cursor. 1980 */ 1981 if (level > 0) { 1982 /* 1983 * If we moved left, need the previous key number, 1984 * unless there isn't one. 1985 */ 1986 if (diff > 0 && --keyno < 1) 1987 keyno = 1; 1988 pp = xfs_btree_ptr_addr(cur, keyno, block); 1989 1990 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 1991 if (error) 1992 goto error0; 1993 1994 cur->bc_levels[level].ptr = keyno; 1995 } 1996 } 1997 1998 /* Done with the search. See if we need to adjust the results. */ 1999 if (dir != XFS_LOOKUP_LE && diff < 0) { 2000 keyno++; 2001 /* 2002 * If ge search and we went off the end of the block, but it's 2003 * not the last block, we're in the wrong block. 2004 */ 2005 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2006 if (dir == XFS_LOOKUP_GE && 2007 keyno > xfs_btree_get_numrecs(block) && 2008 !xfs_btree_ptr_is_null(cur, &ptr)) { 2009 int i; 2010 2011 cur->bc_levels[0].ptr = keyno; 2012 error = xfs_btree_increment(cur, 0, &i); 2013 if (error) 2014 goto error0; 2015 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) 2016 return -EFSCORRUPTED; 2017 *stat = 1; 2018 return 0; 2019 } 2020 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2021 keyno--; 2022 cur->bc_levels[0].ptr = keyno; 2023 2024 /* Return if we succeeded or not. */ 2025 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2026 *stat = 0; 2027 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2028 *stat = 1; 2029 else 2030 *stat = 0; 2031 return 0; 2032 2033 error0: 2034 return error; 2035 } 2036 2037 /* Find the high key storage area from a regular key. */ 2038 union xfs_btree_key * 2039 xfs_btree_high_key_from_key( 2040 struct xfs_btree_cur *cur, 2041 union xfs_btree_key *key) 2042 { 2043 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2044 return (union xfs_btree_key *)((char *)key + 2045 (cur->bc_ops->key_len / 2)); 2046 } 2047 2048 /* Determine the low (and high if overlapped) keys of a leaf block */ 2049 STATIC void 2050 xfs_btree_get_leaf_keys( 2051 struct xfs_btree_cur *cur, 2052 struct xfs_btree_block *block, 2053 union xfs_btree_key *key) 2054 { 2055 union xfs_btree_key max_hkey; 2056 union xfs_btree_key hkey; 2057 union xfs_btree_rec *rec; 2058 union xfs_btree_key *high; 2059 int n; 2060 2061 rec = xfs_btree_rec_addr(cur, 1, block); 2062 cur->bc_ops->init_key_from_rec(key, rec); 2063 2064 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2065 2066 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2067 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2068 rec = xfs_btree_rec_addr(cur, n, block); 2069 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2070 if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) 2071 > 0) 2072 max_hkey = hkey; 2073 } 2074 2075 high = xfs_btree_high_key_from_key(cur, key); 2076 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2077 } 2078 } 2079 2080 /* Determine the low (and high if overlapped) keys of a node block */ 2081 STATIC void 2082 xfs_btree_get_node_keys( 2083 struct xfs_btree_cur *cur, 2084 struct xfs_btree_block *block, 2085 union xfs_btree_key *key) 2086 { 2087 union xfs_btree_key *hkey; 2088 union xfs_btree_key *max_hkey; 2089 union xfs_btree_key *high; 2090 int n; 2091 2092 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2093 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2094 cur->bc_ops->key_len / 2); 2095 2096 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2097 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2098 hkey = xfs_btree_high_key_addr(cur, n, block); 2099 if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) 2100 max_hkey = hkey; 2101 } 2102 2103 high = xfs_btree_high_key_from_key(cur, key); 2104 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2105 } else { 2106 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2107 cur->bc_ops->key_len); 2108 } 2109 } 2110 2111 /* Derive the keys for any btree block. */ 2112 void 2113 xfs_btree_get_keys( 2114 struct xfs_btree_cur *cur, 2115 struct xfs_btree_block *block, 2116 union xfs_btree_key *key) 2117 { 2118 if (be16_to_cpu(block->bb_level) == 0) 2119 xfs_btree_get_leaf_keys(cur, block, key); 2120 else 2121 xfs_btree_get_node_keys(cur, block, key); 2122 } 2123 2124 /* 2125 * Decide if we need to update the parent keys of a btree block. For 2126 * a standard btree this is only necessary if we're updating the first 2127 * record/key. For an overlapping btree, we must always update the 2128 * keys because the highest key can be in any of the records or keys 2129 * in the block. 2130 */ 2131 static inline bool 2132 xfs_btree_needs_key_update( 2133 struct xfs_btree_cur *cur, 2134 int ptr) 2135 { 2136 return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; 2137 } 2138 2139 /* 2140 * Update the low and high parent keys of the given level, progressing 2141 * towards the root. If force_all is false, stop if the keys for a given 2142 * level do not need updating. 2143 */ 2144 STATIC int 2145 __xfs_btree_updkeys( 2146 struct xfs_btree_cur *cur, 2147 int level, 2148 struct xfs_btree_block *block, 2149 struct xfs_buf *bp0, 2150 bool force_all) 2151 { 2152 union xfs_btree_key key; /* keys from current level */ 2153 union xfs_btree_key *lkey; /* keys from the next level up */ 2154 union xfs_btree_key *hkey; 2155 union xfs_btree_key *nlkey; /* keys from the next level up */ 2156 union xfs_btree_key *nhkey; 2157 struct xfs_buf *bp; 2158 int ptr; 2159 2160 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2161 2162 /* Exit if there aren't any parent levels to update. */ 2163 if (level + 1 >= cur->bc_nlevels) 2164 return 0; 2165 2166 trace_xfs_btree_updkeys(cur, level, bp0); 2167 2168 lkey = &key; 2169 hkey = xfs_btree_high_key_from_key(cur, lkey); 2170 xfs_btree_get_keys(cur, block, lkey); 2171 for (level++; level < cur->bc_nlevels; level++) { 2172 #ifdef DEBUG 2173 int error; 2174 #endif 2175 block = xfs_btree_get_block(cur, level, &bp); 2176 trace_xfs_btree_updkeys(cur, level, bp); 2177 #ifdef DEBUG 2178 error = xfs_btree_check_block(cur, block, level, bp); 2179 if (error) 2180 return error; 2181 #endif 2182 ptr = cur->bc_levels[level].ptr; 2183 nlkey = xfs_btree_key_addr(cur, ptr, block); 2184 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2185 if (!force_all && 2186 !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || 2187 cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) 2188 break; 2189 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2190 xfs_btree_log_keys(cur, bp, ptr, ptr); 2191 if (level + 1 >= cur->bc_nlevels) 2192 break; 2193 xfs_btree_get_node_keys(cur, block, lkey); 2194 } 2195 2196 return 0; 2197 } 2198 2199 /* Update all the keys from some level in cursor back to the root. */ 2200 STATIC int 2201 xfs_btree_updkeys_force( 2202 struct xfs_btree_cur *cur, 2203 int level) 2204 { 2205 struct xfs_buf *bp; 2206 struct xfs_btree_block *block; 2207 2208 block = xfs_btree_get_block(cur, level, &bp); 2209 return __xfs_btree_updkeys(cur, level, block, bp, true); 2210 } 2211 2212 /* 2213 * Update the parent keys of the given level, progressing towards the root. 2214 */ 2215 STATIC int 2216 xfs_btree_update_keys( 2217 struct xfs_btree_cur *cur, 2218 int level) 2219 { 2220 struct xfs_btree_block *block; 2221 struct xfs_buf *bp; 2222 union xfs_btree_key *kp; 2223 union xfs_btree_key key; 2224 int ptr; 2225 2226 ASSERT(level >= 0); 2227 2228 block = xfs_btree_get_block(cur, level, &bp); 2229 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) 2230 return __xfs_btree_updkeys(cur, level, block, bp, false); 2231 2232 /* 2233 * Go up the tree from this level toward the root. 2234 * At each level, update the key value to the value input. 2235 * Stop when we reach a level where the cursor isn't pointing 2236 * at the first entry in the block. 2237 */ 2238 xfs_btree_get_keys(cur, block, &key); 2239 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2240 #ifdef DEBUG 2241 int error; 2242 #endif 2243 block = xfs_btree_get_block(cur, level, &bp); 2244 #ifdef DEBUG 2245 error = xfs_btree_check_block(cur, block, level, bp); 2246 if (error) 2247 return error; 2248 #endif 2249 ptr = cur->bc_levels[level].ptr; 2250 kp = xfs_btree_key_addr(cur, ptr, block); 2251 xfs_btree_copy_keys(cur, kp, &key, 1); 2252 xfs_btree_log_keys(cur, bp, ptr, ptr); 2253 } 2254 2255 return 0; 2256 } 2257 2258 /* 2259 * Update the record referred to by cur to the value in the 2260 * given record. This either works (return 0) or gets an 2261 * EFSCORRUPTED error. 2262 */ 2263 int 2264 xfs_btree_update( 2265 struct xfs_btree_cur *cur, 2266 union xfs_btree_rec *rec) 2267 { 2268 struct xfs_btree_block *block; 2269 struct xfs_buf *bp; 2270 int error; 2271 int ptr; 2272 union xfs_btree_rec *rp; 2273 2274 /* Pick up the current block. */ 2275 block = xfs_btree_get_block(cur, 0, &bp); 2276 2277 #ifdef DEBUG 2278 error = xfs_btree_check_block(cur, block, 0, bp); 2279 if (error) 2280 goto error0; 2281 #endif 2282 /* Get the address of the rec to be updated. */ 2283 ptr = cur->bc_levels[0].ptr; 2284 rp = xfs_btree_rec_addr(cur, ptr, block); 2285 2286 /* Fill in the new contents and log them. */ 2287 xfs_btree_copy_recs(cur, rp, rec, 1); 2288 xfs_btree_log_recs(cur, bp, ptr, ptr); 2289 2290 /* 2291 * If we are tracking the last record in the tree and 2292 * we are at the far right edge of the tree, update it. 2293 */ 2294 if (xfs_btree_is_lastrec(cur, block, 0)) { 2295 cur->bc_ops->update_lastrec(cur, block, rec, 2296 ptr, LASTREC_UPDATE); 2297 } 2298 2299 /* Pass new key value up to our parent. */ 2300 if (xfs_btree_needs_key_update(cur, ptr)) { 2301 error = xfs_btree_update_keys(cur, 0); 2302 if (error) 2303 goto error0; 2304 } 2305 2306 return 0; 2307 2308 error0: 2309 return error; 2310 } 2311 2312 /* 2313 * Move 1 record left from cur/level if possible. 2314 * Update cur to reflect the new path. 2315 */ 2316 STATIC int /* error */ 2317 xfs_btree_lshift( 2318 struct xfs_btree_cur *cur, 2319 int level, 2320 int *stat) /* success/failure */ 2321 { 2322 struct xfs_buf *lbp; /* left buffer pointer */ 2323 struct xfs_btree_block *left; /* left btree block */ 2324 int lrecs; /* left record count */ 2325 struct xfs_buf *rbp; /* right buffer pointer */ 2326 struct xfs_btree_block *right; /* right btree block */ 2327 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2328 int rrecs; /* right record count */ 2329 union xfs_btree_ptr lptr; /* left btree pointer */ 2330 union xfs_btree_key *rkp = NULL; /* right btree key */ 2331 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2332 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2333 int error; /* error return value */ 2334 int i; 2335 2336 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2337 level == cur->bc_nlevels - 1) 2338 goto out0; 2339 2340 /* Set up variables for this block as "right". */ 2341 right = xfs_btree_get_block(cur, level, &rbp); 2342 2343 #ifdef DEBUG 2344 error = xfs_btree_check_block(cur, right, level, rbp); 2345 if (error) 2346 goto error0; 2347 #endif 2348 2349 /* If we've got no left sibling then we can't shift an entry left. */ 2350 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2351 if (xfs_btree_ptr_is_null(cur, &lptr)) 2352 goto out0; 2353 2354 /* 2355 * If the cursor entry is the one that would be moved, don't 2356 * do it... it's too complicated. 2357 */ 2358 if (cur->bc_levels[level].ptr <= 1) 2359 goto out0; 2360 2361 /* Set up the left neighbor as "left". */ 2362 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2363 if (error) 2364 goto error0; 2365 2366 /* If it's full, it can't take another entry. */ 2367 lrecs = xfs_btree_get_numrecs(left); 2368 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2369 goto out0; 2370 2371 rrecs = xfs_btree_get_numrecs(right); 2372 2373 /* 2374 * We add one entry to the left side and remove one for the right side. 2375 * Account for it here, the changes will be updated on disk and logged 2376 * later. 2377 */ 2378 lrecs++; 2379 rrecs--; 2380 2381 XFS_BTREE_STATS_INC(cur, lshift); 2382 XFS_BTREE_STATS_ADD(cur, moves, 1); 2383 2384 /* 2385 * If non-leaf, copy a key and a ptr to the left block. 2386 * Log the changes to the left block. 2387 */ 2388 if (level > 0) { 2389 /* It's a non-leaf. Move keys and pointers. */ 2390 union xfs_btree_key *lkp; /* left btree key */ 2391 union xfs_btree_ptr *lpp; /* left address pointer */ 2392 2393 lkp = xfs_btree_key_addr(cur, lrecs, left); 2394 rkp = xfs_btree_key_addr(cur, 1, right); 2395 2396 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2397 rpp = xfs_btree_ptr_addr(cur, 1, right); 2398 2399 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2400 if (error) 2401 goto error0; 2402 2403 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2404 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2405 2406 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2407 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2408 2409 ASSERT(cur->bc_ops->keys_inorder(cur, 2410 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2411 } else { 2412 /* It's a leaf. Move records. */ 2413 union xfs_btree_rec *lrp; /* left record pointer */ 2414 2415 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2416 rrp = xfs_btree_rec_addr(cur, 1, right); 2417 2418 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2419 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2420 2421 ASSERT(cur->bc_ops->recs_inorder(cur, 2422 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2423 } 2424 2425 xfs_btree_set_numrecs(left, lrecs); 2426 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2427 2428 xfs_btree_set_numrecs(right, rrecs); 2429 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2430 2431 /* 2432 * Slide the contents of right down one entry. 2433 */ 2434 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2435 if (level > 0) { 2436 /* It's a nonleaf. operate on keys and ptrs */ 2437 for (i = 0; i < rrecs; i++) { 2438 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2439 if (error) 2440 goto error0; 2441 } 2442 2443 xfs_btree_shift_keys(cur, 2444 xfs_btree_key_addr(cur, 2, right), 2445 -1, rrecs); 2446 xfs_btree_shift_ptrs(cur, 2447 xfs_btree_ptr_addr(cur, 2, right), 2448 -1, rrecs); 2449 2450 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2451 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2452 } else { 2453 /* It's a leaf. operate on records */ 2454 xfs_btree_shift_recs(cur, 2455 xfs_btree_rec_addr(cur, 2, right), 2456 -1, rrecs); 2457 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2458 } 2459 2460 /* 2461 * Using a temporary cursor, update the parent key values of the 2462 * block on the left. 2463 */ 2464 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2465 error = xfs_btree_dup_cursor(cur, &tcur); 2466 if (error) 2467 goto error0; 2468 i = xfs_btree_firstrec(tcur, level); 2469 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2470 error = -EFSCORRUPTED; 2471 goto error0; 2472 } 2473 2474 error = xfs_btree_decrement(tcur, level, &i); 2475 if (error) 2476 goto error1; 2477 2478 /* Update the parent high keys of the left block, if needed. */ 2479 error = xfs_btree_update_keys(tcur, level); 2480 if (error) 2481 goto error1; 2482 2483 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2484 } 2485 2486 /* Update the parent keys of the right block. */ 2487 error = xfs_btree_update_keys(cur, level); 2488 if (error) 2489 goto error0; 2490 2491 /* Slide the cursor value left one. */ 2492 cur->bc_levels[level].ptr--; 2493 2494 *stat = 1; 2495 return 0; 2496 2497 out0: 2498 *stat = 0; 2499 return 0; 2500 2501 error0: 2502 return error; 2503 2504 error1: 2505 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2506 return error; 2507 } 2508 2509 /* 2510 * Move 1 record right from cur/level if possible. 2511 * Update cur to reflect the new path. 2512 */ 2513 STATIC int /* error */ 2514 xfs_btree_rshift( 2515 struct xfs_btree_cur *cur, 2516 int level, 2517 int *stat) /* success/failure */ 2518 { 2519 struct xfs_buf *lbp; /* left buffer pointer */ 2520 struct xfs_btree_block *left; /* left btree block */ 2521 struct xfs_buf *rbp; /* right buffer pointer */ 2522 struct xfs_btree_block *right; /* right btree block */ 2523 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2524 union xfs_btree_ptr rptr; /* right block pointer */ 2525 union xfs_btree_key *rkp; /* right btree key */ 2526 int rrecs; /* right record count */ 2527 int lrecs; /* left record count */ 2528 int error; /* error return value */ 2529 int i; /* loop counter */ 2530 2531 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2532 (level == cur->bc_nlevels - 1)) 2533 goto out0; 2534 2535 /* Set up variables for this block as "left". */ 2536 left = xfs_btree_get_block(cur, level, &lbp); 2537 2538 #ifdef DEBUG 2539 error = xfs_btree_check_block(cur, left, level, lbp); 2540 if (error) 2541 goto error0; 2542 #endif 2543 2544 /* If we've got no right sibling then we can't shift an entry right. */ 2545 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2546 if (xfs_btree_ptr_is_null(cur, &rptr)) 2547 goto out0; 2548 2549 /* 2550 * If the cursor entry is the one that would be moved, don't 2551 * do it... it's too complicated. 2552 */ 2553 lrecs = xfs_btree_get_numrecs(left); 2554 if (cur->bc_levels[level].ptr >= lrecs) 2555 goto out0; 2556 2557 /* Set up the right neighbor as "right". */ 2558 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2559 if (error) 2560 goto error0; 2561 2562 /* If it's full, it can't take another entry. */ 2563 rrecs = xfs_btree_get_numrecs(right); 2564 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2565 goto out0; 2566 2567 XFS_BTREE_STATS_INC(cur, rshift); 2568 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2569 2570 /* 2571 * Make a hole at the start of the right neighbor block, then 2572 * copy the last left block entry to the hole. 2573 */ 2574 if (level > 0) { 2575 /* It's a nonleaf. make a hole in the keys and ptrs */ 2576 union xfs_btree_key *lkp; 2577 union xfs_btree_ptr *lpp; 2578 union xfs_btree_ptr *rpp; 2579 2580 lkp = xfs_btree_key_addr(cur, lrecs, left); 2581 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2582 rkp = xfs_btree_key_addr(cur, 1, right); 2583 rpp = xfs_btree_ptr_addr(cur, 1, right); 2584 2585 for (i = rrecs - 1; i >= 0; i--) { 2586 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2587 if (error) 2588 goto error0; 2589 } 2590 2591 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2592 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2593 2594 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2595 if (error) 2596 goto error0; 2597 2598 /* Now put the new data in, and log it. */ 2599 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2600 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2601 2602 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2603 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2604 2605 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2606 xfs_btree_key_addr(cur, 2, right))); 2607 } else { 2608 /* It's a leaf. make a hole in the records */ 2609 union xfs_btree_rec *lrp; 2610 union xfs_btree_rec *rrp; 2611 2612 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2613 rrp = xfs_btree_rec_addr(cur, 1, right); 2614 2615 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2616 2617 /* Now put the new data in, and log it. */ 2618 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2619 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2620 } 2621 2622 /* 2623 * Decrement and log left's numrecs, bump and log right's numrecs. 2624 */ 2625 xfs_btree_set_numrecs(left, --lrecs); 2626 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2627 2628 xfs_btree_set_numrecs(right, ++rrecs); 2629 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2630 2631 /* 2632 * Using a temporary cursor, update the parent key values of the 2633 * block on the right. 2634 */ 2635 error = xfs_btree_dup_cursor(cur, &tcur); 2636 if (error) 2637 goto error0; 2638 i = xfs_btree_lastrec(tcur, level); 2639 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2640 error = -EFSCORRUPTED; 2641 goto error0; 2642 } 2643 2644 error = xfs_btree_increment(tcur, level, &i); 2645 if (error) 2646 goto error1; 2647 2648 /* Update the parent high keys of the left block, if needed. */ 2649 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2650 error = xfs_btree_update_keys(cur, level); 2651 if (error) 2652 goto error1; 2653 } 2654 2655 /* Update the parent keys of the right block. */ 2656 error = xfs_btree_update_keys(tcur, level); 2657 if (error) 2658 goto error1; 2659 2660 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2661 2662 *stat = 1; 2663 return 0; 2664 2665 out0: 2666 *stat = 0; 2667 return 0; 2668 2669 error0: 2670 return error; 2671 2672 error1: 2673 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2674 return error; 2675 } 2676 2677 /* 2678 * Split cur/level block in half. 2679 * Return new block number and the key to its first 2680 * record (to be inserted into parent). 2681 */ 2682 STATIC int /* error */ 2683 __xfs_btree_split( 2684 struct xfs_btree_cur *cur, 2685 int level, 2686 union xfs_btree_ptr *ptrp, 2687 union xfs_btree_key *key, 2688 struct xfs_btree_cur **curp, 2689 int *stat) /* success/failure */ 2690 { 2691 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2692 struct xfs_buf *lbp; /* left buffer pointer */ 2693 struct xfs_btree_block *left; /* left btree block */ 2694 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2695 struct xfs_buf *rbp; /* right buffer pointer */ 2696 struct xfs_btree_block *right; /* right btree block */ 2697 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2698 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2699 struct xfs_btree_block *rrblock; /* right-right btree block */ 2700 int lrecs; 2701 int rrecs; 2702 int src_index; 2703 int error; /* error return value */ 2704 int i; 2705 2706 XFS_BTREE_STATS_INC(cur, split); 2707 2708 /* Set up left block (current one). */ 2709 left = xfs_btree_get_block(cur, level, &lbp); 2710 2711 #ifdef DEBUG 2712 error = xfs_btree_check_block(cur, left, level, lbp); 2713 if (error) 2714 goto error0; 2715 #endif 2716 2717 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2718 2719 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2720 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); 2721 if (error) 2722 goto error0; 2723 if (*stat == 0) 2724 goto out0; 2725 XFS_BTREE_STATS_INC(cur, alloc); 2726 2727 /* Set up the new block as "right". */ 2728 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2729 if (error) 2730 goto error0; 2731 2732 /* Fill in the btree header for the new right block. */ 2733 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2734 2735 /* 2736 * Split the entries between the old and the new block evenly. 2737 * Make sure that if there's an odd number of entries now, that 2738 * each new block will have the same number of entries. 2739 */ 2740 lrecs = xfs_btree_get_numrecs(left); 2741 rrecs = lrecs / 2; 2742 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2743 rrecs++; 2744 src_index = (lrecs - rrecs + 1); 2745 2746 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2747 2748 /* Adjust numrecs for the later get_*_keys() calls. */ 2749 lrecs -= rrecs; 2750 xfs_btree_set_numrecs(left, lrecs); 2751 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2752 2753 /* 2754 * Copy btree block entries from the left block over to the 2755 * new block, the right. Update the right block and log the 2756 * changes. 2757 */ 2758 if (level > 0) { 2759 /* It's a non-leaf. Move keys and pointers. */ 2760 union xfs_btree_key *lkp; /* left btree key */ 2761 union xfs_btree_ptr *lpp; /* left address pointer */ 2762 union xfs_btree_key *rkp; /* right btree key */ 2763 union xfs_btree_ptr *rpp; /* right address pointer */ 2764 2765 lkp = xfs_btree_key_addr(cur, src_index, left); 2766 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2767 rkp = xfs_btree_key_addr(cur, 1, right); 2768 rpp = xfs_btree_ptr_addr(cur, 1, right); 2769 2770 for (i = src_index; i < rrecs; i++) { 2771 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2772 if (error) 2773 goto error0; 2774 } 2775 2776 /* Copy the keys & pointers to the new block. */ 2777 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2778 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2779 2780 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2781 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2782 2783 /* Stash the keys of the new block for later insertion. */ 2784 xfs_btree_get_node_keys(cur, right, key); 2785 } else { 2786 /* It's a leaf. Move records. */ 2787 union xfs_btree_rec *lrp; /* left record pointer */ 2788 union xfs_btree_rec *rrp; /* right record pointer */ 2789 2790 lrp = xfs_btree_rec_addr(cur, src_index, left); 2791 rrp = xfs_btree_rec_addr(cur, 1, right); 2792 2793 /* Copy records to the new block. */ 2794 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2795 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2796 2797 /* Stash the keys of the new block for later insertion. */ 2798 xfs_btree_get_leaf_keys(cur, right, key); 2799 } 2800 2801 /* 2802 * Find the left block number by looking in the buffer. 2803 * Adjust sibling pointers. 2804 */ 2805 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2806 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2807 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2808 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2809 2810 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2811 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2812 2813 /* 2814 * If there's a block to the new block's right, make that block 2815 * point back to right instead of to left. 2816 */ 2817 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2818 error = xfs_btree_read_buf_block(cur, &rrptr, 2819 0, &rrblock, &rrbp); 2820 if (error) 2821 goto error0; 2822 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2823 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2824 } 2825 2826 /* Update the parent high keys of the left block, if needed. */ 2827 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2828 error = xfs_btree_update_keys(cur, level); 2829 if (error) 2830 goto error0; 2831 } 2832 2833 /* 2834 * If the cursor is really in the right block, move it there. 2835 * If it's just pointing past the last entry in left, then we'll 2836 * insert there, so don't change anything in that case. 2837 */ 2838 if (cur->bc_levels[level].ptr > lrecs + 1) { 2839 xfs_btree_setbuf(cur, level, rbp); 2840 cur->bc_levels[level].ptr -= lrecs; 2841 } 2842 /* 2843 * If there are more levels, we'll need another cursor which refers 2844 * the right block, no matter where this cursor was. 2845 */ 2846 if (level + 1 < cur->bc_nlevels) { 2847 error = xfs_btree_dup_cursor(cur, curp); 2848 if (error) 2849 goto error0; 2850 (*curp)->bc_levels[level + 1].ptr++; 2851 } 2852 *ptrp = rptr; 2853 *stat = 1; 2854 return 0; 2855 out0: 2856 *stat = 0; 2857 return 0; 2858 2859 error0: 2860 return error; 2861 } 2862 2863 #ifdef __KERNEL__ 2864 struct xfs_btree_split_args { 2865 struct xfs_btree_cur *cur; 2866 int level; 2867 union xfs_btree_ptr *ptrp; 2868 union xfs_btree_key *key; 2869 struct xfs_btree_cur **curp; 2870 int *stat; /* success/failure */ 2871 int result; 2872 bool kswapd; /* allocation in kswapd context */ 2873 struct completion *done; 2874 struct work_struct work; 2875 }; 2876 2877 /* 2878 * Stack switching interfaces for allocation 2879 */ 2880 static void 2881 xfs_btree_split_worker( 2882 struct work_struct *work) 2883 { 2884 struct xfs_btree_split_args *args = container_of(work, 2885 struct xfs_btree_split_args, work); 2886 unsigned long pflags; 2887 unsigned long new_pflags = 0; 2888 2889 /* 2890 * we are in a transaction context here, but may also be doing work 2891 * in kswapd context, and hence we may need to inherit that state 2892 * temporarily to ensure that we don't block waiting for memory reclaim 2893 * in any way. 2894 */ 2895 if (args->kswapd) 2896 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 2897 2898 current_set_flags_nested(&pflags, new_pflags); 2899 xfs_trans_set_context(args->cur->bc_tp); 2900 2901 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 2902 args->key, args->curp, args->stat); 2903 2904 xfs_trans_clear_context(args->cur->bc_tp); 2905 current_restore_flags_nested(&pflags, new_pflags); 2906 2907 /* 2908 * Do not access args after complete() has run here. We don't own args 2909 * and the owner may run and free args before we return here. 2910 */ 2911 complete(args->done); 2912 2913 } 2914 2915 /* 2916 * BMBT split requests often come in with little stack to work on so we push 2917 * them off to a worker thread so there is lots of stack to use. For the other 2918 * btree types, just call directly to avoid the context switch overhead here. 2919 * 2920 * Care must be taken here - the work queue rescuer thread introduces potential 2921 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 2922 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 2923 * lock an AGF that is already locked by a task queued to run by the rescuer, 2924 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 2925 * release it until the current thread it is running gains the lock. 2926 * 2927 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 2928 * already locked to allocate from. The only place that doesn't hold an AGF 2929 * locked is unwritten extent conversion at IO completion, but that has already 2930 * been offloaded to a worker thread and hence has no stack consumption issues 2931 * we have to worry about. 2932 */ 2933 STATIC int /* error */ 2934 xfs_btree_split( 2935 struct xfs_btree_cur *cur, 2936 int level, 2937 union xfs_btree_ptr *ptrp, 2938 union xfs_btree_key *key, 2939 struct xfs_btree_cur **curp, 2940 int *stat) /* success/failure */ 2941 { 2942 struct xfs_btree_split_args args; 2943 DECLARE_COMPLETION_ONSTACK(done); 2944 2945 if (cur->bc_btnum != XFS_BTNUM_BMAP || 2946 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 2947 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 2948 2949 args.cur = cur; 2950 args.level = level; 2951 args.ptrp = ptrp; 2952 args.key = key; 2953 args.curp = curp; 2954 args.stat = stat; 2955 args.done = &done; 2956 args.kswapd = current_is_kswapd(); 2957 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 2958 queue_work(xfs_alloc_wq, &args.work); 2959 wait_for_completion(&done); 2960 destroy_work_on_stack(&args.work); 2961 return args.result; 2962 } 2963 #else 2964 #define xfs_btree_split __xfs_btree_split 2965 #endif /* __KERNEL__ */ 2966 2967 2968 /* 2969 * Copy the old inode root contents into a real block and make the 2970 * broot point to it. 2971 */ 2972 int /* error */ 2973 xfs_btree_new_iroot( 2974 struct xfs_btree_cur *cur, /* btree cursor */ 2975 int *logflags, /* logging flags for inode */ 2976 int *stat) /* return status - 0 fail */ 2977 { 2978 struct xfs_buf *cbp; /* buffer for cblock */ 2979 struct xfs_btree_block *block; /* btree block */ 2980 struct xfs_btree_block *cblock; /* child btree block */ 2981 union xfs_btree_key *ckp; /* child key pointer */ 2982 union xfs_btree_ptr *cpp; /* child ptr pointer */ 2983 union xfs_btree_key *kp; /* pointer to btree key */ 2984 union xfs_btree_ptr *pp; /* pointer to block addr */ 2985 union xfs_btree_ptr nptr; /* new block addr */ 2986 int level; /* btree level */ 2987 int error; /* error return code */ 2988 int i; /* loop counter */ 2989 2990 XFS_BTREE_STATS_INC(cur, newroot); 2991 2992 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 2993 2994 level = cur->bc_nlevels - 1; 2995 2996 block = xfs_btree_get_iroot(cur); 2997 pp = xfs_btree_ptr_addr(cur, 1, block); 2998 2999 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3000 error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); 3001 if (error) 3002 goto error0; 3003 if (*stat == 0) 3004 return 0; 3005 3006 XFS_BTREE_STATS_INC(cur, alloc); 3007 3008 /* Copy the root into a real block. */ 3009 error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); 3010 if (error) 3011 goto error0; 3012 3013 /* 3014 * we can't just memcpy() the root in for CRC enabled btree blocks. 3015 * In that case have to also ensure the blkno remains correct 3016 */ 3017 memcpy(cblock, block, xfs_btree_block_len(cur)); 3018 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 3019 __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); 3020 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 3021 cblock->bb_u.l.bb_blkno = bno; 3022 else 3023 cblock->bb_u.s.bb_blkno = bno; 3024 } 3025 3026 be16_add_cpu(&block->bb_level, 1); 3027 xfs_btree_set_numrecs(block, 1); 3028 cur->bc_nlevels++; 3029 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3030 cur->bc_levels[level + 1].ptr = 1; 3031 3032 kp = xfs_btree_key_addr(cur, 1, block); 3033 ckp = xfs_btree_key_addr(cur, 1, cblock); 3034 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); 3035 3036 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3037 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 3038 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3039 if (error) 3040 goto error0; 3041 } 3042 3043 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); 3044 3045 error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); 3046 if (error) 3047 goto error0; 3048 3049 xfs_btree_copy_ptrs(cur, pp, &nptr, 1); 3050 3051 xfs_iroot_realloc(cur->bc_ino.ip, 3052 1 - xfs_btree_get_numrecs(cblock), 3053 cur->bc_ino.whichfork); 3054 3055 xfs_btree_setbuf(cur, level, cbp); 3056 3057 /* 3058 * Do all this logging at the end so that 3059 * the root is at the right level. 3060 */ 3061 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3062 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3063 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3064 3065 *logflags |= 3066 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3067 *stat = 1; 3068 return 0; 3069 error0: 3070 return error; 3071 } 3072 3073 /* 3074 * Allocate a new root block, fill it in. 3075 */ 3076 STATIC int /* error */ 3077 xfs_btree_new_root( 3078 struct xfs_btree_cur *cur, /* btree cursor */ 3079 int *stat) /* success/failure */ 3080 { 3081 struct xfs_btree_block *block; /* one half of the old root block */ 3082 struct xfs_buf *bp; /* buffer containing block */ 3083 int error; /* error return value */ 3084 struct xfs_buf *lbp; /* left buffer pointer */ 3085 struct xfs_btree_block *left; /* left btree block */ 3086 struct xfs_buf *nbp; /* new (root) buffer */ 3087 struct xfs_btree_block *new; /* new (root) btree block */ 3088 int nptr; /* new value for key index, 1 or 2 */ 3089 struct xfs_buf *rbp; /* right buffer pointer */ 3090 struct xfs_btree_block *right; /* right btree block */ 3091 union xfs_btree_ptr rptr; 3092 union xfs_btree_ptr lptr; 3093 3094 XFS_BTREE_STATS_INC(cur, newroot); 3095 3096 /* initialise our start point from the cursor */ 3097 cur->bc_ops->init_ptr_from_cur(cur, &rptr); 3098 3099 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3100 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); 3101 if (error) 3102 goto error0; 3103 if (*stat == 0) 3104 goto out0; 3105 XFS_BTREE_STATS_INC(cur, alloc); 3106 3107 /* Set up the new block. */ 3108 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3109 if (error) 3110 goto error0; 3111 3112 /* Set the root in the holding structure increasing the level by 1. */ 3113 cur->bc_ops->set_root(cur, &lptr, 1); 3114 3115 /* 3116 * At the previous root level there are now two blocks: the old root, 3117 * and the new block generated when it was split. We don't know which 3118 * one the cursor is pointing at, so we set up variables "left" and 3119 * "right" for each case. 3120 */ 3121 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3122 3123 #ifdef DEBUG 3124 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3125 if (error) 3126 goto error0; 3127 #endif 3128 3129 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3130 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3131 /* Our block is left, pick up the right block. */ 3132 lbp = bp; 3133 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3134 left = block; 3135 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3136 if (error) 3137 goto error0; 3138 bp = rbp; 3139 nptr = 1; 3140 } else { 3141 /* Our block is right, pick up the left block. */ 3142 rbp = bp; 3143 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3144 right = block; 3145 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3146 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3147 if (error) 3148 goto error0; 3149 bp = lbp; 3150 nptr = 2; 3151 } 3152 3153 /* Fill in the new block's btree header and log it. */ 3154 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3155 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3156 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3157 !xfs_btree_ptr_is_null(cur, &rptr)); 3158 3159 /* Fill in the key data in the new root. */ 3160 if (xfs_btree_get_level(left) > 0) { 3161 /* 3162 * Get the keys for the left block's keys and put them directly 3163 * in the parent block. Do the same for the right block. 3164 */ 3165 xfs_btree_get_node_keys(cur, left, 3166 xfs_btree_key_addr(cur, 1, new)); 3167 xfs_btree_get_node_keys(cur, right, 3168 xfs_btree_key_addr(cur, 2, new)); 3169 } else { 3170 /* 3171 * Get the keys for the left block's records and put them 3172 * directly in the parent block. Do the same for the right 3173 * block. 3174 */ 3175 xfs_btree_get_leaf_keys(cur, left, 3176 xfs_btree_key_addr(cur, 1, new)); 3177 xfs_btree_get_leaf_keys(cur, right, 3178 xfs_btree_key_addr(cur, 2, new)); 3179 } 3180 xfs_btree_log_keys(cur, nbp, 1, 2); 3181 3182 /* Fill in the pointer data in the new root. */ 3183 xfs_btree_copy_ptrs(cur, 3184 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3185 xfs_btree_copy_ptrs(cur, 3186 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3187 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3188 3189 /* Fix up the cursor. */ 3190 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3191 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3192 cur->bc_nlevels++; 3193 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3194 *stat = 1; 3195 return 0; 3196 error0: 3197 return error; 3198 out0: 3199 *stat = 0; 3200 return 0; 3201 } 3202 3203 STATIC int 3204 xfs_btree_make_block_unfull( 3205 struct xfs_btree_cur *cur, /* btree cursor */ 3206 int level, /* btree level */ 3207 int numrecs,/* # of recs in block */ 3208 int *oindex,/* old tree index */ 3209 int *index, /* new tree index */ 3210 union xfs_btree_ptr *nptr, /* new btree ptr */ 3211 struct xfs_btree_cur **ncur, /* new btree cursor */ 3212 union xfs_btree_key *key, /* key of new block */ 3213 int *stat) 3214 { 3215 int error = 0; 3216 3217 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3218 level == cur->bc_nlevels - 1) { 3219 struct xfs_inode *ip = cur->bc_ino.ip; 3220 3221 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3222 /* A root block that can be made bigger. */ 3223 xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3224 *stat = 1; 3225 } else { 3226 /* A root block that needs replacing */ 3227 int logflags = 0; 3228 3229 error = xfs_btree_new_iroot(cur, &logflags, stat); 3230 if (error || *stat == 0) 3231 return error; 3232 3233 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3234 } 3235 3236 return 0; 3237 } 3238 3239 /* First, try shifting an entry to the right neighbor. */ 3240 error = xfs_btree_rshift(cur, level, stat); 3241 if (error || *stat) 3242 return error; 3243 3244 /* Next, try shifting an entry to the left neighbor. */ 3245 error = xfs_btree_lshift(cur, level, stat); 3246 if (error) 3247 return error; 3248 3249 if (*stat) { 3250 *oindex = *index = cur->bc_levels[level].ptr; 3251 return 0; 3252 } 3253 3254 /* 3255 * Next, try splitting the current block in half. 3256 * 3257 * If this works we have to re-set our variables because we 3258 * could be in a different block now. 3259 */ 3260 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3261 if (error || *stat == 0) 3262 return error; 3263 3264 3265 *index = cur->bc_levels[level].ptr; 3266 return 0; 3267 } 3268 3269 /* 3270 * Insert one record/level. Return information to the caller 3271 * allowing the next level up to proceed if necessary. 3272 */ 3273 STATIC int 3274 xfs_btree_insrec( 3275 struct xfs_btree_cur *cur, /* btree cursor */ 3276 int level, /* level to insert record at */ 3277 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3278 union xfs_btree_rec *rec, /* record to insert */ 3279 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3280 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3281 int *stat) /* success/failure */ 3282 { 3283 struct xfs_btree_block *block; /* btree block */ 3284 struct xfs_buf *bp; /* buffer for block */ 3285 union xfs_btree_ptr nptr; /* new block ptr */ 3286 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3287 union xfs_btree_key nkey; /* new block key */ 3288 union xfs_btree_key *lkey; 3289 int optr; /* old key/record index */ 3290 int ptr; /* key/record index */ 3291 int numrecs;/* number of records */ 3292 int error; /* error return value */ 3293 int i; 3294 xfs_daddr_t old_bn; 3295 3296 ncur = NULL; 3297 lkey = &nkey; 3298 3299 /* 3300 * If we have an external root pointer, and we've made it to the 3301 * root level, allocate a new root block and we're done. 3302 */ 3303 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3304 (level >= cur->bc_nlevels)) { 3305 error = xfs_btree_new_root(cur, stat); 3306 xfs_btree_set_ptr_null(cur, ptrp); 3307 3308 return error; 3309 } 3310 3311 /* If we're off the left edge, return failure. */ 3312 ptr = cur->bc_levels[level].ptr; 3313 if (ptr == 0) { 3314 *stat = 0; 3315 return 0; 3316 } 3317 3318 optr = ptr; 3319 3320 XFS_BTREE_STATS_INC(cur, insrec); 3321 3322 /* Get pointers to the btree buffer and block. */ 3323 block = xfs_btree_get_block(cur, level, &bp); 3324 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3325 numrecs = xfs_btree_get_numrecs(block); 3326 3327 #ifdef DEBUG 3328 error = xfs_btree_check_block(cur, block, level, bp); 3329 if (error) 3330 goto error0; 3331 3332 /* Check that the new entry is being inserted in the right place. */ 3333 if (ptr <= numrecs) { 3334 if (level == 0) { 3335 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3336 xfs_btree_rec_addr(cur, ptr, block))); 3337 } else { 3338 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3339 xfs_btree_key_addr(cur, ptr, block))); 3340 } 3341 } 3342 #endif 3343 3344 /* 3345 * If the block is full, we can't insert the new entry until we 3346 * make the block un-full. 3347 */ 3348 xfs_btree_set_ptr_null(cur, &nptr); 3349 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3350 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3351 &optr, &ptr, &nptr, &ncur, lkey, stat); 3352 if (error || *stat == 0) 3353 goto error0; 3354 } 3355 3356 /* 3357 * The current block may have changed if the block was 3358 * previously full and we have just made space in it. 3359 */ 3360 block = xfs_btree_get_block(cur, level, &bp); 3361 numrecs = xfs_btree_get_numrecs(block); 3362 3363 #ifdef DEBUG 3364 error = xfs_btree_check_block(cur, block, level, bp); 3365 if (error) 3366 goto error0; 3367 #endif 3368 3369 /* 3370 * At this point we know there's room for our new entry in the block 3371 * we're pointing at. 3372 */ 3373 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3374 3375 if (level > 0) { 3376 /* It's a nonleaf. make a hole in the keys and ptrs */ 3377 union xfs_btree_key *kp; 3378 union xfs_btree_ptr *pp; 3379 3380 kp = xfs_btree_key_addr(cur, ptr, block); 3381 pp = xfs_btree_ptr_addr(cur, ptr, block); 3382 3383 for (i = numrecs - ptr; i >= 0; i--) { 3384 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3385 if (error) 3386 goto error0; 3387 } 3388 3389 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3390 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3391 3392 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3393 if (error) 3394 goto error0; 3395 3396 /* Now put the new data in, bump numrecs and log it. */ 3397 xfs_btree_copy_keys(cur, kp, key, 1); 3398 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3399 numrecs++; 3400 xfs_btree_set_numrecs(block, numrecs); 3401 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3402 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3403 #ifdef DEBUG 3404 if (ptr < numrecs) { 3405 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3406 xfs_btree_key_addr(cur, ptr + 1, block))); 3407 } 3408 #endif 3409 } else { 3410 /* It's a leaf. make a hole in the records */ 3411 union xfs_btree_rec *rp; 3412 3413 rp = xfs_btree_rec_addr(cur, ptr, block); 3414 3415 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3416 3417 /* Now put the new data in, bump numrecs and log it. */ 3418 xfs_btree_copy_recs(cur, rp, rec, 1); 3419 xfs_btree_set_numrecs(block, ++numrecs); 3420 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3421 #ifdef DEBUG 3422 if (ptr < numrecs) { 3423 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3424 xfs_btree_rec_addr(cur, ptr + 1, block))); 3425 } 3426 #endif 3427 } 3428 3429 /* Log the new number of records in the btree header. */ 3430 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3431 3432 /* 3433 * If we just inserted into a new tree block, we have to 3434 * recalculate nkey here because nkey is out of date. 3435 * 3436 * Otherwise we're just updating an existing block (having shoved 3437 * some records into the new tree block), so use the regular key 3438 * update mechanism. 3439 */ 3440 if (bp && xfs_buf_daddr(bp) != old_bn) { 3441 xfs_btree_get_keys(cur, block, lkey); 3442 } else if (xfs_btree_needs_key_update(cur, optr)) { 3443 error = xfs_btree_update_keys(cur, level); 3444 if (error) 3445 goto error0; 3446 } 3447 3448 /* 3449 * If we are tracking the last record in the tree and 3450 * we are at the far right edge of the tree, update it. 3451 */ 3452 if (xfs_btree_is_lastrec(cur, block, level)) { 3453 cur->bc_ops->update_lastrec(cur, block, rec, 3454 ptr, LASTREC_INSREC); 3455 } 3456 3457 /* 3458 * Return the new block number, if any. 3459 * If there is one, give back a record value and a cursor too. 3460 */ 3461 *ptrp = nptr; 3462 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3463 xfs_btree_copy_keys(cur, key, lkey, 1); 3464 *curp = ncur; 3465 } 3466 3467 *stat = 1; 3468 return 0; 3469 3470 error0: 3471 if (ncur) 3472 xfs_btree_del_cursor(ncur, error); 3473 return error; 3474 } 3475 3476 /* 3477 * Insert the record at the point referenced by cur. 3478 * 3479 * A multi-level split of the tree on insert will invalidate the original 3480 * cursor. All callers of this function should assume that the cursor is 3481 * no longer valid and revalidate it. 3482 */ 3483 int 3484 xfs_btree_insert( 3485 struct xfs_btree_cur *cur, 3486 int *stat) 3487 { 3488 int error; /* error return value */ 3489 int i; /* result value, 0 for failure */ 3490 int level; /* current level number in btree */ 3491 union xfs_btree_ptr nptr; /* new block number (split result) */ 3492 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3493 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3494 union xfs_btree_key bkey; /* key of block to insert */ 3495 union xfs_btree_key *key; 3496 union xfs_btree_rec rec; /* record to insert */ 3497 3498 level = 0; 3499 ncur = NULL; 3500 pcur = cur; 3501 key = &bkey; 3502 3503 xfs_btree_set_ptr_null(cur, &nptr); 3504 3505 /* Make a key out of the record data to be inserted, and save it. */ 3506 cur->bc_ops->init_rec_from_cur(cur, &rec); 3507 cur->bc_ops->init_key_from_rec(key, &rec); 3508 3509 /* 3510 * Loop going up the tree, starting at the leaf level. 3511 * Stop when we don't get a split block, that must mean that 3512 * the insert is finished with this level. 3513 */ 3514 do { 3515 /* 3516 * Insert nrec/nptr into this level of the tree. 3517 * Note if we fail, nptr will be null. 3518 */ 3519 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3520 &ncur, &i); 3521 if (error) { 3522 if (pcur != cur) 3523 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3524 goto error0; 3525 } 3526 3527 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3528 error = -EFSCORRUPTED; 3529 goto error0; 3530 } 3531 level++; 3532 3533 /* 3534 * See if the cursor we just used is trash. 3535 * Can't trash the caller's cursor, but otherwise we should 3536 * if ncur is a new cursor or we're about to be done. 3537 */ 3538 if (pcur != cur && 3539 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3540 /* Save the state from the cursor before we trash it */ 3541 if (cur->bc_ops->update_cursor) 3542 cur->bc_ops->update_cursor(pcur, cur); 3543 cur->bc_nlevels = pcur->bc_nlevels; 3544 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3545 } 3546 /* If we got a new cursor, switch to it. */ 3547 if (ncur) { 3548 pcur = ncur; 3549 ncur = NULL; 3550 } 3551 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3552 3553 *stat = i; 3554 return 0; 3555 error0: 3556 return error; 3557 } 3558 3559 /* 3560 * Try to merge a non-leaf block back into the inode root. 3561 * 3562 * Note: the killroot names comes from the fact that we're effectively 3563 * killing the old root block. But because we can't just delete the 3564 * inode we have to copy the single block it was pointing to into the 3565 * inode. 3566 */ 3567 STATIC int 3568 xfs_btree_kill_iroot( 3569 struct xfs_btree_cur *cur) 3570 { 3571 int whichfork = cur->bc_ino.whichfork; 3572 struct xfs_inode *ip = cur->bc_ino.ip; 3573 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3574 struct xfs_btree_block *block; 3575 struct xfs_btree_block *cblock; 3576 union xfs_btree_key *kp; 3577 union xfs_btree_key *ckp; 3578 union xfs_btree_ptr *pp; 3579 union xfs_btree_ptr *cpp; 3580 struct xfs_buf *cbp; 3581 int level; 3582 int index; 3583 int numrecs; 3584 int error; 3585 #ifdef DEBUG 3586 union xfs_btree_ptr ptr; 3587 #endif 3588 int i; 3589 3590 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 3591 ASSERT(cur->bc_nlevels > 1); 3592 3593 /* 3594 * Don't deal with the root block needs to be a leaf case. 3595 * We're just going to turn the thing back into extents anyway. 3596 */ 3597 level = cur->bc_nlevels - 1; 3598 if (level == 1) 3599 goto out0; 3600 3601 /* 3602 * Give up if the root has multiple children. 3603 */ 3604 block = xfs_btree_get_iroot(cur); 3605 if (xfs_btree_get_numrecs(block) != 1) 3606 goto out0; 3607 3608 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3609 numrecs = xfs_btree_get_numrecs(cblock); 3610 3611 /* 3612 * Only do this if the next level will fit. 3613 * Then the data must be copied up to the inode, 3614 * instead of freeing the root you free the next level. 3615 */ 3616 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) 3617 goto out0; 3618 3619 XFS_BTREE_STATS_INC(cur, killroot); 3620 3621 #ifdef DEBUG 3622 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 3623 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3624 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 3625 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3626 #endif 3627 3628 index = numrecs - cur->bc_ops->get_maxrecs(cur, level); 3629 if (index) { 3630 xfs_iroot_realloc(cur->bc_ino.ip, index, 3631 cur->bc_ino.whichfork); 3632 block = ifp->if_broot; 3633 } 3634 3635 be16_add_cpu(&block->bb_numrecs, index); 3636 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3637 3638 kp = xfs_btree_key_addr(cur, 1, block); 3639 ckp = xfs_btree_key_addr(cur, 1, cblock); 3640 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3641 3642 pp = xfs_btree_ptr_addr(cur, 1, block); 3643 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3644 3645 for (i = 0; i < numrecs; i++) { 3646 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3647 if (error) 3648 return error; 3649 } 3650 3651 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3652 3653 error = xfs_btree_free_block(cur, cbp); 3654 if (error) 3655 return error; 3656 3657 cur->bc_levels[level - 1].bp = NULL; 3658 be16_add_cpu(&block->bb_level, -1); 3659 xfs_trans_log_inode(cur->bc_tp, ip, 3660 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); 3661 cur->bc_nlevels--; 3662 out0: 3663 return 0; 3664 } 3665 3666 /* 3667 * Kill the current root node, and replace it with it's only child node. 3668 */ 3669 STATIC int 3670 xfs_btree_kill_root( 3671 struct xfs_btree_cur *cur, 3672 struct xfs_buf *bp, 3673 int level, 3674 union xfs_btree_ptr *newroot) 3675 { 3676 int error; 3677 3678 XFS_BTREE_STATS_INC(cur, killroot); 3679 3680 /* 3681 * Update the root pointer, decreasing the level by 1 and then 3682 * free the old root. 3683 */ 3684 cur->bc_ops->set_root(cur, newroot, -1); 3685 3686 error = xfs_btree_free_block(cur, bp); 3687 if (error) 3688 return error; 3689 3690 cur->bc_levels[level].bp = NULL; 3691 cur->bc_levels[level].ra = 0; 3692 cur->bc_nlevels--; 3693 3694 return 0; 3695 } 3696 3697 STATIC int 3698 xfs_btree_dec_cursor( 3699 struct xfs_btree_cur *cur, 3700 int level, 3701 int *stat) 3702 { 3703 int error; 3704 int i; 3705 3706 if (level > 0) { 3707 error = xfs_btree_decrement(cur, level, &i); 3708 if (error) 3709 return error; 3710 } 3711 3712 *stat = 1; 3713 return 0; 3714 } 3715 3716 /* 3717 * Single level of the btree record deletion routine. 3718 * Delete record pointed to by cur/level. 3719 * Remove the record from its block then rebalance the tree. 3720 * Return 0 for error, 1 for done, 2 to go on to the next level. 3721 */ 3722 STATIC int /* error */ 3723 xfs_btree_delrec( 3724 struct xfs_btree_cur *cur, /* btree cursor */ 3725 int level, /* level removing record from */ 3726 int *stat) /* fail/done/go-on */ 3727 { 3728 struct xfs_btree_block *block; /* btree block */ 3729 union xfs_btree_ptr cptr; /* current block ptr */ 3730 struct xfs_buf *bp; /* buffer for block */ 3731 int error; /* error return value */ 3732 int i; /* loop counter */ 3733 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3734 struct xfs_buf *lbp; /* left buffer pointer */ 3735 struct xfs_btree_block *left; /* left btree block */ 3736 int lrecs = 0; /* left record count */ 3737 int ptr; /* key/record index */ 3738 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3739 struct xfs_buf *rbp; /* right buffer pointer */ 3740 struct xfs_btree_block *right; /* right btree block */ 3741 struct xfs_btree_block *rrblock; /* right-right btree block */ 3742 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3743 int rrecs = 0; /* right record count */ 3744 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3745 int numrecs; /* temporary numrec count */ 3746 3747 tcur = NULL; 3748 3749 /* Get the index of the entry being deleted, check for nothing there. */ 3750 ptr = cur->bc_levels[level].ptr; 3751 if (ptr == 0) { 3752 *stat = 0; 3753 return 0; 3754 } 3755 3756 /* Get the buffer & block containing the record or key/ptr. */ 3757 block = xfs_btree_get_block(cur, level, &bp); 3758 numrecs = xfs_btree_get_numrecs(block); 3759 3760 #ifdef DEBUG 3761 error = xfs_btree_check_block(cur, block, level, bp); 3762 if (error) 3763 goto error0; 3764 #endif 3765 3766 /* Fail if we're off the end of the block. */ 3767 if (ptr > numrecs) { 3768 *stat = 0; 3769 return 0; 3770 } 3771 3772 XFS_BTREE_STATS_INC(cur, delrec); 3773 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3774 3775 /* Excise the entries being deleted. */ 3776 if (level > 0) { 3777 /* It's a nonleaf. operate on keys and ptrs */ 3778 union xfs_btree_key *lkp; 3779 union xfs_btree_ptr *lpp; 3780 3781 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3782 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3783 3784 for (i = 0; i < numrecs - ptr; i++) { 3785 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3786 if (error) 3787 goto error0; 3788 } 3789 3790 if (ptr < numrecs) { 3791 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3792 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3793 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3794 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3795 } 3796 } else { 3797 /* It's a leaf. operate on records */ 3798 if (ptr < numrecs) { 3799 xfs_btree_shift_recs(cur, 3800 xfs_btree_rec_addr(cur, ptr + 1, block), 3801 -1, numrecs - ptr); 3802 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3803 } 3804 } 3805 3806 /* 3807 * Decrement and log the number of entries in the block. 3808 */ 3809 xfs_btree_set_numrecs(block, --numrecs); 3810 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3811 3812 /* 3813 * If we are tracking the last record in the tree and 3814 * we are at the far right edge of the tree, update it. 3815 */ 3816 if (xfs_btree_is_lastrec(cur, block, level)) { 3817 cur->bc_ops->update_lastrec(cur, block, NULL, 3818 ptr, LASTREC_DELREC); 3819 } 3820 3821 /* 3822 * We're at the root level. First, shrink the root block in-memory. 3823 * Try to get rid of the next level down. If we can't then there's 3824 * nothing left to do. 3825 */ 3826 if (level == cur->bc_nlevels - 1) { 3827 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3828 xfs_iroot_realloc(cur->bc_ino.ip, -1, 3829 cur->bc_ino.whichfork); 3830 3831 error = xfs_btree_kill_iroot(cur); 3832 if (error) 3833 goto error0; 3834 3835 error = xfs_btree_dec_cursor(cur, level, stat); 3836 if (error) 3837 goto error0; 3838 *stat = 1; 3839 return 0; 3840 } 3841 3842 /* 3843 * If this is the root level, and there's only one entry left, 3844 * and it's NOT the leaf level, then we can get rid of this 3845 * level. 3846 */ 3847 if (numrecs == 1 && level > 0) { 3848 union xfs_btree_ptr *pp; 3849 /* 3850 * pp is still set to the first pointer in the block. 3851 * Make it the new root of the btree. 3852 */ 3853 pp = xfs_btree_ptr_addr(cur, 1, block); 3854 error = xfs_btree_kill_root(cur, bp, level, pp); 3855 if (error) 3856 goto error0; 3857 } else if (level > 0) { 3858 error = xfs_btree_dec_cursor(cur, level, stat); 3859 if (error) 3860 goto error0; 3861 } 3862 *stat = 1; 3863 return 0; 3864 } 3865 3866 /* 3867 * If we deleted the leftmost entry in the block, update the 3868 * key values above us in the tree. 3869 */ 3870 if (xfs_btree_needs_key_update(cur, ptr)) { 3871 error = xfs_btree_update_keys(cur, level); 3872 if (error) 3873 goto error0; 3874 } 3875 3876 /* 3877 * If the number of records remaining in the block is at least 3878 * the minimum, we're done. 3879 */ 3880 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 3881 error = xfs_btree_dec_cursor(cur, level, stat); 3882 if (error) 3883 goto error0; 3884 return 0; 3885 } 3886 3887 /* 3888 * Otherwise, we have to move some records around to keep the 3889 * tree balanced. Look at the left and right sibling blocks to 3890 * see if we can re-balance by moving only one record. 3891 */ 3892 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3893 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 3894 3895 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3896 /* 3897 * One child of root, need to get a chance to copy its contents 3898 * into the root and delete it. Can't go up to next level, 3899 * there's nothing to delete there. 3900 */ 3901 if (xfs_btree_ptr_is_null(cur, &rptr) && 3902 xfs_btree_ptr_is_null(cur, &lptr) && 3903 level == cur->bc_nlevels - 2) { 3904 error = xfs_btree_kill_iroot(cur); 3905 if (!error) 3906 error = xfs_btree_dec_cursor(cur, level, stat); 3907 if (error) 3908 goto error0; 3909 return 0; 3910 } 3911 } 3912 3913 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 3914 !xfs_btree_ptr_is_null(cur, &lptr)); 3915 3916 /* 3917 * Duplicate the cursor so our btree manipulations here won't 3918 * disrupt the next level up. 3919 */ 3920 error = xfs_btree_dup_cursor(cur, &tcur); 3921 if (error) 3922 goto error0; 3923 3924 /* 3925 * If there's a right sibling, see if it's ok to shift an entry 3926 * out of it. 3927 */ 3928 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3929 /* 3930 * Move the temp cursor to the last entry in the next block. 3931 * Actually any entry but the first would suffice. 3932 */ 3933 i = xfs_btree_lastrec(tcur, level); 3934 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3935 error = -EFSCORRUPTED; 3936 goto error0; 3937 } 3938 3939 error = xfs_btree_increment(tcur, level, &i); 3940 if (error) 3941 goto error0; 3942 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3943 error = -EFSCORRUPTED; 3944 goto error0; 3945 } 3946 3947 i = xfs_btree_lastrec(tcur, level); 3948 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3949 error = -EFSCORRUPTED; 3950 goto error0; 3951 } 3952 3953 /* Grab a pointer to the block. */ 3954 right = xfs_btree_get_block(tcur, level, &rbp); 3955 #ifdef DEBUG 3956 error = xfs_btree_check_block(tcur, right, level, rbp); 3957 if (error) 3958 goto error0; 3959 #endif 3960 /* Grab the current block number, for future use. */ 3961 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 3962 3963 /* 3964 * If right block is full enough so that removing one entry 3965 * won't make it too empty, and left-shifting an entry out 3966 * of right to us works, we're done. 3967 */ 3968 if (xfs_btree_get_numrecs(right) - 1 >= 3969 cur->bc_ops->get_minrecs(tcur, level)) { 3970 error = xfs_btree_lshift(tcur, level, &i); 3971 if (error) 3972 goto error0; 3973 if (i) { 3974 ASSERT(xfs_btree_get_numrecs(block) >= 3975 cur->bc_ops->get_minrecs(tcur, level)); 3976 3977 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 3978 tcur = NULL; 3979 3980 error = xfs_btree_dec_cursor(cur, level, stat); 3981 if (error) 3982 goto error0; 3983 return 0; 3984 } 3985 } 3986 3987 /* 3988 * Otherwise, grab the number of records in right for 3989 * future reference, and fix up the temp cursor to point 3990 * to our block again (last record). 3991 */ 3992 rrecs = xfs_btree_get_numrecs(right); 3993 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3994 i = xfs_btree_firstrec(tcur, level); 3995 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3996 error = -EFSCORRUPTED; 3997 goto error0; 3998 } 3999 4000 error = xfs_btree_decrement(tcur, level, &i); 4001 if (error) 4002 goto error0; 4003 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4004 error = -EFSCORRUPTED; 4005 goto error0; 4006 } 4007 } 4008 } 4009 4010 /* 4011 * If there's a left sibling, see if it's ok to shift an entry 4012 * out of it. 4013 */ 4014 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4015 /* 4016 * Move the temp cursor to the first entry in the 4017 * previous block. 4018 */ 4019 i = xfs_btree_firstrec(tcur, level); 4020 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4021 error = -EFSCORRUPTED; 4022 goto error0; 4023 } 4024 4025 error = xfs_btree_decrement(tcur, level, &i); 4026 if (error) 4027 goto error0; 4028 i = xfs_btree_firstrec(tcur, level); 4029 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4030 error = -EFSCORRUPTED; 4031 goto error0; 4032 } 4033 4034 /* Grab a pointer to the block. */ 4035 left = xfs_btree_get_block(tcur, level, &lbp); 4036 #ifdef DEBUG 4037 error = xfs_btree_check_block(cur, left, level, lbp); 4038 if (error) 4039 goto error0; 4040 #endif 4041 /* Grab the current block number, for future use. */ 4042 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4043 4044 /* 4045 * If left block is full enough so that removing one entry 4046 * won't make it too empty, and right-shifting an entry out 4047 * of left to us works, we're done. 4048 */ 4049 if (xfs_btree_get_numrecs(left) - 1 >= 4050 cur->bc_ops->get_minrecs(tcur, level)) { 4051 error = xfs_btree_rshift(tcur, level, &i); 4052 if (error) 4053 goto error0; 4054 if (i) { 4055 ASSERT(xfs_btree_get_numrecs(block) >= 4056 cur->bc_ops->get_minrecs(tcur, level)); 4057 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4058 tcur = NULL; 4059 if (level == 0) 4060 cur->bc_levels[0].ptr++; 4061 4062 *stat = 1; 4063 return 0; 4064 } 4065 } 4066 4067 /* 4068 * Otherwise, grab the number of records in right for 4069 * future reference. 4070 */ 4071 lrecs = xfs_btree_get_numrecs(left); 4072 } 4073 4074 /* Delete the temp cursor, we're done with it. */ 4075 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4076 tcur = NULL; 4077 4078 /* If here, we need to do a join to keep the tree balanced. */ 4079 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4080 4081 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4082 lrecs + xfs_btree_get_numrecs(block) <= 4083 cur->bc_ops->get_maxrecs(cur, level)) { 4084 /* 4085 * Set "right" to be the starting block, 4086 * "left" to be the left neighbor. 4087 */ 4088 rptr = cptr; 4089 right = block; 4090 rbp = bp; 4091 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4092 if (error) 4093 goto error0; 4094 4095 /* 4096 * If that won't work, see if we can join with the right neighbor block. 4097 */ 4098 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4099 rrecs + xfs_btree_get_numrecs(block) <= 4100 cur->bc_ops->get_maxrecs(cur, level)) { 4101 /* 4102 * Set "left" to be the starting block, 4103 * "right" to be the right neighbor. 4104 */ 4105 lptr = cptr; 4106 left = block; 4107 lbp = bp; 4108 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4109 if (error) 4110 goto error0; 4111 4112 /* 4113 * Otherwise, we can't fix the imbalance. 4114 * Just return. This is probably a logic error, but it's not fatal. 4115 */ 4116 } else { 4117 error = xfs_btree_dec_cursor(cur, level, stat); 4118 if (error) 4119 goto error0; 4120 return 0; 4121 } 4122 4123 rrecs = xfs_btree_get_numrecs(right); 4124 lrecs = xfs_btree_get_numrecs(left); 4125 4126 /* 4127 * We're now going to join "left" and "right" by moving all the stuff 4128 * in "right" to "left" and deleting "right". 4129 */ 4130 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4131 if (level > 0) { 4132 /* It's a non-leaf. Move keys and pointers. */ 4133 union xfs_btree_key *lkp; /* left btree key */ 4134 union xfs_btree_ptr *lpp; /* left address pointer */ 4135 union xfs_btree_key *rkp; /* right btree key */ 4136 union xfs_btree_ptr *rpp; /* right address pointer */ 4137 4138 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4139 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4140 rkp = xfs_btree_key_addr(cur, 1, right); 4141 rpp = xfs_btree_ptr_addr(cur, 1, right); 4142 4143 for (i = 1; i < rrecs; i++) { 4144 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4145 if (error) 4146 goto error0; 4147 } 4148 4149 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4150 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4151 4152 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4153 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4154 } else { 4155 /* It's a leaf. Move records. */ 4156 union xfs_btree_rec *lrp; /* left record pointer */ 4157 union xfs_btree_rec *rrp; /* right record pointer */ 4158 4159 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4160 rrp = xfs_btree_rec_addr(cur, 1, right); 4161 4162 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4163 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4164 } 4165 4166 XFS_BTREE_STATS_INC(cur, join); 4167 4168 /* 4169 * Fix up the number of records and right block pointer in the 4170 * surviving block, and log it. 4171 */ 4172 xfs_btree_set_numrecs(left, lrecs + rrecs); 4173 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4174 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4175 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4176 4177 /* If there is a right sibling, point it to the remaining block. */ 4178 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4179 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4180 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4181 if (error) 4182 goto error0; 4183 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4184 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4185 } 4186 4187 /* Free the deleted block. */ 4188 error = xfs_btree_free_block(cur, rbp); 4189 if (error) 4190 goto error0; 4191 4192 /* 4193 * If we joined with the left neighbor, set the buffer in the 4194 * cursor to the left block, and fix up the index. 4195 */ 4196 if (bp != lbp) { 4197 cur->bc_levels[level].bp = lbp; 4198 cur->bc_levels[level].ptr += lrecs; 4199 cur->bc_levels[level].ra = 0; 4200 } 4201 /* 4202 * If we joined with the right neighbor and there's a level above 4203 * us, increment the cursor at that level. 4204 */ 4205 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || 4206 (level + 1 < cur->bc_nlevels)) { 4207 error = xfs_btree_increment(cur, level + 1, &i); 4208 if (error) 4209 goto error0; 4210 } 4211 4212 /* 4213 * Readjust the ptr at this level if it's not a leaf, since it's 4214 * still pointing at the deletion point, which makes the cursor 4215 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4216 * We can't use decrement because it would change the next level up. 4217 */ 4218 if (level > 0) 4219 cur->bc_levels[level].ptr--; 4220 4221 /* 4222 * We combined blocks, so we have to update the parent keys if the 4223 * btree supports overlapped intervals. However, 4224 * bc_levels[level + 1].ptr points to the old block so that the caller 4225 * knows which record to delete. Therefore, the caller must be savvy 4226 * enough to call updkeys for us if we return stat == 2. The other 4227 * exit points from this function don't require deletions further up 4228 * the tree, so they can call updkeys directly. 4229 */ 4230 4231 /* Return value means the next level up has something to do. */ 4232 *stat = 2; 4233 return 0; 4234 4235 error0: 4236 if (tcur) 4237 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4238 return error; 4239 } 4240 4241 /* 4242 * Delete the record pointed to by cur. 4243 * The cursor refers to the place where the record was (could be inserted) 4244 * when the operation returns. 4245 */ 4246 int /* error */ 4247 xfs_btree_delete( 4248 struct xfs_btree_cur *cur, 4249 int *stat) /* success/failure */ 4250 { 4251 int error; /* error return value */ 4252 int level; 4253 int i; 4254 bool joined = false; 4255 4256 /* 4257 * Go up the tree, starting at leaf level. 4258 * 4259 * If 2 is returned then a join was done; go to the next level. 4260 * Otherwise we are done. 4261 */ 4262 for (level = 0, i = 2; i == 2; level++) { 4263 error = xfs_btree_delrec(cur, level, &i); 4264 if (error) 4265 goto error0; 4266 if (i == 2) 4267 joined = true; 4268 } 4269 4270 /* 4271 * If we combined blocks as part of deleting the record, delrec won't 4272 * have updated the parent high keys so we have to do that here. 4273 */ 4274 if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { 4275 error = xfs_btree_updkeys_force(cur, 0); 4276 if (error) 4277 goto error0; 4278 } 4279 4280 if (i == 0) { 4281 for (level = 1; level < cur->bc_nlevels; level++) { 4282 if (cur->bc_levels[level].ptr == 0) { 4283 error = xfs_btree_decrement(cur, level, &i); 4284 if (error) 4285 goto error0; 4286 break; 4287 } 4288 } 4289 } 4290 4291 *stat = i; 4292 return 0; 4293 error0: 4294 return error; 4295 } 4296 4297 /* 4298 * Get the data from the pointed-to record. 4299 */ 4300 int /* error */ 4301 xfs_btree_get_rec( 4302 struct xfs_btree_cur *cur, /* btree cursor */ 4303 union xfs_btree_rec **recp, /* output: btree record */ 4304 int *stat) /* output: success/failure */ 4305 { 4306 struct xfs_btree_block *block; /* btree block */ 4307 struct xfs_buf *bp; /* buffer pointer */ 4308 int ptr; /* record number */ 4309 #ifdef DEBUG 4310 int error; /* error return value */ 4311 #endif 4312 4313 ptr = cur->bc_levels[0].ptr; 4314 block = xfs_btree_get_block(cur, 0, &bp); 4315 4316 #ifdef DEBUG 4317 error = xfs_btree_check_block(cur, block, 0, bp); 4318 if (error) 4319 return error; 4320 #endif 4321 4322 /* 4323 * Off the right end or left end, return failure. 4324 */ 4325 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4326 *stat = 0; 4327 return 0; 4328 } 4329 4330 /* 4331 * Point to the record and extract its data. 4332 */ 4333 *recp = xfs_btree_rec_addr(cur, ptr, block); 4334 *stat = 1; 4335 return 0; 4336 } 4337 4338 /* Visit a block in a btree. */ 4339 STATIC int 4340 xfs_btree_visit_block( 4341 struct xfs_btree_cur *cur, 4342 int level, 4343 xfs_btree_visit_blocks_fn fn, 4344 void *data) 4345 { 4346 struct xfs_btree_block *block; 4347 struct xfs_buf *bp; 4348 union xfs_btree_ptr rptr; 4349 int error; 4350 4351 /* do right sibling readahead */ 4352 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4353 block = xfs_btree_get_block(cur, level, &bp); 4354 4355 /* process the block */ 4356 error = fn(cur, level, data); 4357 if (error) 4358 return error; 4359 4360 /* now read rh sibling block for next iteration */ 4361 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4362 if (xfs_btree_ptr_is_null(cur, &rptr)) 4363 return -ENOENT; 4364 4365 /* 4366 * We only visit blocks once in this walk, so we have to avoid the 4367 * internal xfs_btree_lookup_get_block() optimisation where it will 4368 * return the same block without checking if the right sibling points 4369 * back to us and creates a cyclic reference in the btree. 4370 */ 4371 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 4372 if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, 4373 xfs_buf_daddr(bp))) 4374 return -EFSCORRUPTED; 4375 } else { 4376 if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, 4377 xfs_buf_daddr(bp))) 4378 return -EFSCORRUPTED; 4379 } 4380 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4381 } 4382 4383 4384 /* Visit every block in a btree. */ 4385 int 4386 xfs_btree_visit_blocks( 4387 struct xfs_btree_cur *cur, 4388 xfs_btree_visit_blocks_fn fn, 4389 unsigned int flags, 4390 void *data) 4391 { 4392 union xfs_btree_ptr lptr; 4393 int level; 4394 struct xfs_btree_block *block = NULL; 4395 int error = 0; 4396 4397 cur->bc_ops->init_ptr_from_cur(cur, &lptr); 4398 4399 /* for each level */ 4400 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4401 /* grab the left hand block */ 4402 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4403 if (error) 4404 return error; 4405 4406 /* readahead the left most block for the next level down */ 4407 if (level > 0) { 4408 union xfs_btree_ptr *ptr; 4409 4410 ptr = xfs_btree_ptr_addr(cur, 1, block); 4411 xfs_btree_readahead_ptr(cur, ptr, 1); 4412 4413 /* save for the next iteration of the loop */ 4414 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4415 4416 if (!(flags & XFS_BTREE_VISIT_LEAVES)) 4417 continue; 4418 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { 4419 continue; 4420 } 4421 4422 /* for each buffer in the level */ 4423 do { 4424 error = xfs_btree_visit_block(cur, level, fn, data); 4425 } while (!error); 4426 4427 if (error != -ENOENT) 4428 return error; 4429 } 4430 4431 return 0; 4432 } 4433 4434 /* 4435 * Change the owner of a btree. 4436 * 4437 * The mechanism we use here is ordered buffer logging. Because we don't know 4438 * how many buffers were are going to need to modify, we don't really want to 4439 * have to make transaction reservations for the worst case of every buffer in a 4440 * full size btree as that may be more space that we can fit in the log.... 4441 * 4442 * We do the btree walk in the most optimal manner possible - we have sibling 4443 * pointers so we can just walk all the blocks on each level from left to right 4444 * in a single pass, and then move to the next level and do the same. We can 4445 * also do readahead on the sibling pointers to get IO moving more quickly, 4446 * though for slow disks this is unlikely to make much difference to performance 4447 * as the amount of CPU work we have to do before moving to the next block is 4448 * relatively small. 4449 * 4450 * For each btree block that we load, modify the owner appropriately, set the 4451 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4452 * we mark the region we change dirty so that if the buffer is relogged in 4453 * a subsequent transaction the changes we make here as an ordered buffer are 4454 * correctly relogged in that transaction. If we are in recovery context, then 4455 * just queue the modified buffer as delayed write buffer so the transaction 4456 * recovery completion writes the changes to disk. 4457 */ 4458 struct xfs_btree_block_change_owner_info { 4459 uint64_t new_owner; 4460 struct list_head *buffer_list; 4461 }; 4462 4463 static int 4464 xfs_btree_block_change_owner( 4465 struct xfs_btree_cur *cur, 4466 int level, 4467 void *data) 4468 { 4469 struct xfs_btree_block_change_owner_info *bbcoi = data; 4470 struct xfs_btree_block *block; 4471 struct xfs_buf *bp; 4472 4473 /* modify the owner */ 4474 block = xfs_btree_get_block(cur, level, &bp); 4475 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 4476 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4477 return 0; 4478 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4479 } else { 4480 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4481 return 0; 4482 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4483 } 4484 4485 /* 4486 * If the block is a root block hosted in an inode, we might not have a 4487 * buffer pointer here and we shouldn't attempt to log the change as the 4488 * information is already held in the inode and discarded when the root 4489 * block is formatted into the on-disk inode fork. We still change it, 4490 * though, so everything is consistent in memory. 4491 */ 4492 if (!bp) { 4493 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 4494 ASSERT(level == cur->bc_nlevels - 1); 4495 return 0; 4496 } 4497 4498 if (cur->bc_tp) { 4499 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4500 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4501 return -EAGAIN; 4502 } 4503 } else { 4504 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4505 } 4506 4507 return 0; 4508 } 4509 4510 int 4511 xfs_btree_change_owner( 4512 struct xfs_btree_cur *cur, 4513 uint64_t new_owner, 4514 struct list_head *buffer_list) 4515 { 4516 struct xfs_btree_block_change_owner_info bbcoi; 4517 4518 bbcoi.new_owner = new_owner; 4519 bbcoi.buffer_list = buffer_list; 4520 4521 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4522 XFS_BTREE_VISIT_ALL, &bbcoi); 4523 } 4524 4525 /* Verify the v5 fields of a long-format btree block. */ 4526 xfs_failaddr_t 4527 xfs_btree_lblock_v5hdr_verify( 4528 struct xfs_buf *bp, 4529 uint64_t owner) 4530 { 4531 struct xfs_mount *mp = bp->b_mount; 4532 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4533 4534 if (!xfs_has_crc(mp)) 4535 return __this_address; 4536 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4537 return __this_address; 4538 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4539 return __this_address; 4540 if (owner != XFS_RMAP_OWN_UNKNOWN && 4541 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4542 return __this_address; 4543 return NULL; 4544 } 4545 4546 /* Verify a long-format btree block. */ 4547 xfs_failaddr_t 4548 xfs_btree_lblock_verify( 4549 struct xfs_buf *bp, 4550 unsigned int max_recs) 4551 { 4552 struct xfs_mount *mp = bp->b_mount; 4553 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4554 xfs_fsblock_t fsb; 4555 xfs_failaddr_t fa; 4556 4557 /* numrecs verification */ 4558 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4559 return __this_address; 4560 4561 /* sibling pointer verification */ 4562 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4563 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4564 block->bb_u.l.bb_leftsib); 4565 if (!fa) 4566 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4567 block->bb_u.l.bb_rightsib); 4568 return fa; 4569 } 4570 4571 /** 4572 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format 4573 * btree block 4574 * 4575 * @bp: buffer containing the btree block 4576 */ 4577 xfs_failaddr_t 4578 xfs_btree_sblock_v5hdr_verify( 4579 struct xfs_buf *bp) 4580 { 4581 struct xfs_mount *mp = bp->b_mount; 4582 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4583 struct xfs_perag *pag = bp->b_pag; 4584 4585 if (!xfs_has_crc(mp)) 4586 return __this_address; 4587 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4588 return __this_address; 4589 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4590 return __this_address; 4591 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) 4592 return __this_address; 4593 return NULL; 4594 } 4595 4596 /** 4597 * xfs_btree_sblock_verify() -- verify a short-format btree block 4598 * 4599 * @bp: buffer containing the btree block 4600 * @max_recs: maximum records allowed in this btree node 4601 */ 4602 xfs_failaddr_t 4603 xfs_btree_sblock_verify( 4604 struct xfs_buf *bp, 4605 unsigned int max_recs) 4606 { 4607 struct xfs_mount *mp = bp->b_mount; 4608 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4609 xfs_agblock_t agbno; 4610 xfs_failaddr_t fa; 4611 4612 /* numrecs verification */ 4613 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4614 return __this_address; 4615 4616 /* sibling pointer verification */ 4617 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4618 fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, 4619 block->bb_u.s.bb_leftsib); 4620 if (!fa) 4621 fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, 4622 block->bb_u.s.bb_rightsib); 4623 return fa; 4624 } 4625 4626 /* 4627 * For the given limits on leaf and keyptr records per block, calculate the 4628 * height of the tree needed to index the number of leaf records. 4629 */ 4630 unsigned int 4631 xfs_btree_compute_maxlevels( 4632 const unsigned int *limits, 4633 unsigned long long records) 4634 { 4635 unsigned long long level_blocks = howmany_64(records, limits[0]); 4636 unsigned int height = 1; 4637 4638 while (level_blocks > 1) { 4639 level_blocks = howmany_64(level_blocks, limits[1]); 4640 height++; 4641 } 4642 4643 return height; 4644 } 4645 4646 /* 4647 * For the given limits on leaf and keyptr records per block, calculate the 4648 * number of blocks needed to index the given number of leaf records. 4649 */ 4650 unsigned long long 4651 xfs_btree_calc_size( 4652 const unsigned int *limits, 4653 unsigned long long records) 4654 { 4655 unsigned long long level_blocks = howmany_64(records, limits[0]); 4656 unsigned long long blocks = level_blocks; 4657 4658 while (level_blocks > 1) { 4659 level_blocks = howmany_64(level_blocks, limits[1]); 4660 blocks += level_blocks; 4661 } 4662 4663 return blocks; 4664 } 4665 4666 /* 4667 * Given a number of available blocks for the btree to consume with records and 4668 * pointers, calculate the height of the tree needed to index all the records 4669 * that space can hold based on the number of pointers each interior node 4670 * holds. 4671 * 4672 * We start by assuming a single level tree consumes a single block, then track 4673 * the number of blocks each node level consumes until we no longer have space 4674 * to store the next node level. At this point, we are indexing all the leaf 4675 * blocks in the space, and there's no more free space to split the tree any 4676 * further. That's our maximum btree height. 4677 */ 4678 unsigned int 4679 xfs_btree_space_to_height( 4680 const unsigned int *limits, 4681 unsigned long long leaf_blocks) 4682 { 4683 /* 4684 * The root btree block can have fewer than minrecs pointers in it 4685 * because the tree might not be big enough to require that amount of 4686 * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 4687 */ 4688 unsigned long long node_blocks = 2; 4689 unsigned long long blocks_left = leaf_blocks - 1; 4690 unsigned int height = 1; 4691 4692 if (leaf_blocks < 1) 4693 return 0; 4694 4695 while (node_blocks < blocks_left) { 4696 blocks_left -= node_blocks; 4697 node_blocks *= limits[1]; 4698 height++; 4699 } 4700 4701 return height; 4702 } 4703 4704 /* 4705 * Query a regular btree for all records overlapping a given interval. 4706 * Start with a LE lookup of the key of low_rec and return all records 4707 * until we find a record with a key greater than the key of high_rec. 4708 */ 4709 STATIC int 4710 xfs_btree_simple_query_range( 4711 struct xfs_btree_cur *cur, 4712 const union xfs_btree_key *low_key, 4713 const union xfs_btree_key *high_key, 4714 xfs_btree_query_range_fn fn, 4715 void *priv) 4716 { 4717 union xfs_btree_rec *recp; 4718 union xfs_btree_key rec_key; 4719 int64_t diff; 4720 int stat; 4721 bool firstrec = true; 4722 int error; 4723 4724 ASSERT(cur->bc_ops->init_high_key_from_rec); 4725 ASSERT(cur->bc_ops->diff_two_keys); 4726 4727 /* 4728 * Find the leftmost record. The btree cursor must be set 4729 * to the low record used to generate low_key. 4730 */ 4731 stat = 0; 4732 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 4733 if (error) 4734 goto out; 4735 4736 /* Nothing? See if there's anything to the right. */ 4737 if (!stat) { 4738 error = xfs_btree_increment(cur, 0, &stat); 4739 if (error) 4740 goto out; 4741 } 4742 4743 while (stat) { 4744 /* Find the record. */ 4745 error = xfs_btree_get_rec(cur, &recp, &stat); 4746 if (error || !stat) 4747 break; 4748 4749 /* Skip if high_key(rec) < low_key. */ 4750 if (firstrec) { 4751 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 4752 firstrec = false; 4753 diff = cur->bc_ops->diff_two_keys(cur, low_key, 4754 &rec_key); 4755 if (diff > 0) 4756 goto advloop; 4757 } 4758 4759 /* Stop if high_key < low_key(rec). */ 4760 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4761 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); 4762 if (diff > 0) 4763 break; 4764 4765 /* Callback */ 4766 error = fn(cur, recp, priv); 4767 if (error) 4768 break; 4769 4770 advloop: 4771 /* Move on to the next record. */ 4772 error = xfs_btree_increment(cur, 0, &stat); 4773 if (error) 4774 break; 4775 } 4776 4777 out: 4778 return error; 4779 } 4780 4781 /* 4782 * Query an overlapped interval btree for all records overlapping a given 4783 * interval. This function roughly follows the algorithm given in 4784 * "Interval Trees" of _Introduction to Algorithms_, which is section 4785 * 14.3 in the 2nd and 3rd editions. 4786 * 4787 * First, generate keys for the low and high records passed in. 4788 * 4789 * For any leaf node, generate the high and low keys for the record. 4790 * If the record keys overlap with the query low/high keys, pass the 4791 * record to the function iterator. 4792 * 4793 * For any internal node, compare the low and high keys of each 4794 * pointer against the query low/high keys. If there's an overlap, 4795 * follow the pointer. 4796 * 4797 * As an optimization, we stop scanning a block when we find a low key 4798 * that is greater than the query's high key. 4799 */ 4800 STATIC int 4801 xfs_btree_overlapped_query_range( 4802 struct xfs_btree_cur *cur, 4803 const union xfs_btree_key *low_key, 4804 const union xfs_btree_key *high_key, 4805 xfs_btree_query_range_fn fn, 4806 void *priv) 4807 { 4808 union xfs_btree_ptr ptr; 4809 union xfs_btree_ptr *pp; 4810 union xfs_btree_key rec_key; 4811 union xfs_btree_key rec_hkey; 4812 union xfs_btree_key *lkp; 4813 union xfs_btree_key *hkp; 4814 union xfs_btree_rec *recp; 4815 struct xfs_btree_block *block; 4816 int64_t ldiff; 4817 int64_t hdiff; 4818 int level; 4819 struct xfs_buf *bp; 4820 int i; 4821 int error; 4822 4823 /* Load the root of the btree. */ 4824 level = cur->bc_nlevels - 1; 4825 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 4826 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 4827 if (error) 4828 return error; 4829 xfs_btree_get_block(cur, level, &bp); 4830 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4831 #ifdef DEBUG 4832 error = xfs_btree_check_block(cur, block, level, bp); 4833 if (error) 4834 goto out; 4835 #endif 4836 cur->bc_levels[level].ptr = 1; 4837 4838 while (level < cur->bc_nlevels) { 4839 block = xfs_btree_get_block(cur, level, &bp); 4840 4841 /* End of node, pop back towards the root. */ 4842 if (cur->bc_levels[level].ptr > 4843 be16_to_cpu(block->bb_numrecs)) { 4844 pop_up: 4845 if (level < cur->bc_nlevels - 1) 4846 cur->bc_levels[level + 1].ptr++; 4847 level++; 4848 continue; 4849 } 4850 4851 if (level == 0) { 4852 /* Handle a leaf node. */ 4853 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, 4854 block); 4855 4856 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 4857 ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, 4858 low_key); 4859 4860 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4861 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, 4862 &rec_key); 4863 4864 /* 4865 * If (record's high key >= query's low key) and 4866 * (query's high key >= record's low key), then 4867 * this record overlaps the query range; callback. 4868 */ 4869 if (ldiff >= 0 && hdiff >= 0) { 4870 error = fn(cur, recp, priv); 4871 if (error) 4872 break; 4873 } else if (hdiff < 0) { 4874 /* Record is larger than high key; pop. */ 4875 goto pop_up; 4876 } 4877 cur->bc_levels[level].ptr++; 4878 continue; 4879 } 4880 4881 /* Handle an internal node. */ 4882 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); 4883 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, 4884 block); 4885 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); 4886 4887 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); 4888 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); 4889 4890 /* 4891 * If (pointer's high key >= query's low key) and 4892 * (query's high key >= pointer's low key), then 4893 * this record overlaps the query range; follow pointer. 4894 */ 4895 if (ldiff >= 0 && hdiff >= 0) { 4896 level--; 4897 error = xfs_btree_lookup_get_block(cur, level, pp, 4898 &block); 4899 if (error) 4900 goto out; 4901 xfs_btree_get_block(cur, level, &bp); 4902 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4903 #ifdef DEBUG 4904 error = xfs_btree_check_block(cur, block, level, bp); 4905 if (error) 4906 goto out; 4907 #endif 4908 cur->bc_levels[level].ptr = 1; 4909 continue; 4910 } else if (hdiff < 0) { 4911 /* The low key is larger than the upper range; pop. */ 4912 goto pop_up; 4913 } 4914 cur->bc_levels[level].ptr++; 4915 } 4916 4917 out: 4918 /* 4919 * If we don't end this function with the cursor pointing at a record 4920 * block, a subsequent non-error cursor deletion will not release 4921 * node-level buffers, causing a buffer leak. This is quite possible 4922 * with a zero-results range query, so release the buffers if we 4923 * failed to return any results. 4924 */ 4925 if (cur->bc_levels[0].bp == NULL) { 4926 for (i = 0; i < cur->bc_nlevels; i++) { 4927 if (cur->bc_levels[i].bp) { 4928 xfs_trans_brelse(cur->bc_tp, 4929 cur->bc_levels[i].bp); 4930 cur->bc_levels[i].bp = NULL; 4931 cur->bc_levels[i].ptr = 0; 4932 cur->bc_levels[i].ra = 0; 4933 } 4934 } 4935 } 4936 4937 return error; 4938 } 4939 4940 /* 4941 * Query a btree for all records overlapping a given interval of keys. The 4942 * supplied function will be called with each record found; return one of the 4943 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4944 * code. This function returns -ECANCELED, zero, or a negative error code. 4945 */ 4946 int 4947 xfs_btree_query_range( 4948 struct xfs_btree_cur *cur, 4949 const union xfs_btree_irec *low_rec, 4950 const union xfs_btree_irec *high_rec, 4951 xfs_btree_query_range_fn fn, 4952 void *priv) 4953 { 4954 union xfs_btree_rec rec; 4955 union xfs_btree_key low_key; 4956 union xfs_btree_key high_key; 4957 4958 /* Find the keys of both ends of the interval. */ 4959 cur->bc_rec = *high_rec; 4960 cur->bc_ops->init_rec_from_cur(cur, &rec); 4961 cur->bc_ops->init_key_from_rec(&high_key, &rec); 4962 4963 cur->bc_rec = *low_rec; 4964 cur->bc_ops->init_rec_from_cur(cur, &rec); 4965 cur->bc_ops->init_key_from_rec(&low_key, &rec); 4966 4967 /* Enforce low key < high key. */ 4968 if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) 4969 return -EINVAL; 4970 4971 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) 4972 return xfs_btree_simple_query_range(cur, &low_key, 4973 &high_key, fn, priv); 4974 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 4975 fn, priv); 4976 } 4977 4978 /* Query a btree for all records. */ 4979 int 4980 xfs_btree_query_all( 4981 struct xfs_btree_cur *cur, 4982 xfs_btree_query_range_fn fn, 4983 void *priv) 4984 { 4985 union xfs_btree_key low_key; 4986 union xfs_btree_key high_key; 4987 4988 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 4989 memset(&low_key, 0, sizeof(low_key)); 4990 memset(&high_key, 0xFF, sizeof(high_key)); 4991 4992 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 4993 } 4994 4995 static int 4996 xfs_btree_count_blocks_helper( 4997 struct xfs_btree_cur *cur, 4998 int level, 4999 void *data) 5000 { 5001 xfs_extlen_t *blocks = data; 5002 (*blocks)++; 5003 5004 return 0; 5005 } 5006 5007 /* Count the blocks in a btree and return the result in *blocks. */ 5008 int 5009 xfs_btree_count_blocks( 5010 struct xfs_btree_cur *cur, 5011 xfs_extlen_t *blocks) 5012 { 5013 *blocks = 0; 5014 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 5015 XFS_BTREE_VISIT_ALL, blocks); 5016 } 5017 5018 /* Compare two btree pointers. */ 5019 int64_t 5020 xfs_btree_diff_two_ptrs( 5021 struct xfs_btree_cur *cur, 5022 const union xfs_btree_ptr *a, 5023 const union xfs_btree_ptr *b) 5024 { 5025 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 5026 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); 5027 return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); 5028 } 5029 5030 /* If there's an extent, we're done. */ 5031 STATIC int 5032 xfs_btree_has_record_helper( 5033 struct xfs_btree_cur *cur, 5034 const union xfs_btree_rec *rec, 5035 void *priv) 5036 { 5037 return -ECANCELED; 5038 } 5039 5040 /* Is there a record covering a given range of keys? */ 5041 int 5042 xfs_btree_has_record( 5043 struct xfs_btree_cur *cur, 5044 const union xfs_btree_irec *low, 5045 const union xfs_btree_irec *high, 5046 bool *exists) 5047 { 5048 int error; 5049 5050 error = xfs_btree_query_range(cur, low, high, 5051 &xfs_btree_has_record_helper, NULL); 5052 if (error == -ECANCELED) { 5053 *exists = true; 5054 return 0; 5055 } 5056 *exists = false; 5057 return error; 5058 } 5059 5060 /* Are there more records in this btree? */ 5061 bool 5062 xfs_btree_has_more_records( 5063 struct xfs_btree_cur *cur) 5064 { 5065 struct xfs_btree_block *block; 5066 struct xfs_buf *bp; 5067 5068 block = xfs_btree_get_block(cur, 0, &bp); 5069 5070 /* There are still records in this block. */ 5071 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5072 return true; 5073 5074 /* There are more record blocks. */ 5075 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 5076 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5077 else 5078 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5079 } 5080 5081 /* Set up all the btree cursor caches. */ 5082 int __init 5083 xfs_btree_init_cur_caches(void) 5084 { 5085 int error; 5086 5087 error = xfs_allocbt_init_cur_cache(); 5088 if (error) 5089 return error; 5090 error = xfs_inobt_init_cur_cache(); 5091 if (error) 5092 goto err; 5093 error = xfs_bmbt_init_cur_cache(); 5094 if (error) 5095 goto err; 5096 error = xfs_rmapbt_init_cur_cache(); 5097 if (error) 5098 goto err; 5099 error = xfs_refcountbt_init_cur_cache(); 5100 if (error) 5101 goto err; 5102 5103 return 0; 5104 err: 5105 xfs_btree_destroy_cur_caches(); 5106 return error; 5107 } 5108 5109 /* Destroy all the btree cursor caches, if they've been allocated. */ 5110 void 5111 xfs_btree_destroy_cur_caches(void) 5112 { 5113 xfs_allocbt_destroy_cur_cache(); 5114 xfs_inobt_destroy_cur_cache(); 5115 xfs_bmbt_destroy_cur_cache(); 5116 xfs_rmapbt_destroy_cur_cache(); 5117 xfs_refcountbt_destroy_cur_cache(); 5118 } 5119