1 /* 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_bit.h" 25 #include "xfs_mount.h" 26 #include "xfs_defer.h" 27 #include "xfs_inode.h" 28 #include "xfs_trans.h" 29 #include "xfs_inode_item.h" 30 #include "xfs_buf_item.h" 31 #include "xfs_btree.h" 32 #include "xfs_error.h" 33 #include "xfs_trace.h" 34 #include "xfs_cksum.h" 35 #include "xfs_alloc.h" 36 #include "xfs_log.h" 37 38 /* 39 * Cursor allocation zone. 40 */ 41 kmem_zone_t *xfs_btree_cur_zone; 42 43 /* 44 * Btree magic numbers. 45 */ 46 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { 47 { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, 48 XFS_FIBT_MAGIC, 0 }, 49 { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, 50 XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, 51 XFS_REFC_CRC_MAGIC } 52 }; 53 54 uint32_t 55 xfs_btree_magic( 56 int crc, 57 xfs_btnum_t btnum) 58 { 59 uint32_t magic = xfs_magics[crc][btnum]; 60 61 /* Ensure we asked for crc for crc-only magics. */ 62 ASSERT(magic != 0); 63 return magic; 64 } 65 66 STATIC int /* error (0 or EFSCORRUPTED) */ 67 xfs_btree_check_lblock( 68 struct xfs_btree_cur *cur, /* btree cursor */ 69 struct xfs_btree_block *block, /* btree long form block pointer */ 70 int level, /* level of the btree block */ 71 struct xfs_buf *bp) /* buffer for block, if any */ 72 { 73 int lblock_ok = 1; /* block passes checks */ 74 struct xfs_mount *mp; /* file system mount point */ 75 xfs_btnum_t btnum = cur->bc_btnum; 76 int crc; 77 78 mp = cur->bc_mp; 79 crc = xfs_sb_version_hascrc(&mp->m_sb); 80 81 if (crc) { 82 lblock_ok = lblock_ok && 83 uuid_equal(&block->bb_u.l.bb_uuid, 84 &mp->m_sb.sb_meta_uuid) && 85 block->bb_u.l.bb_blkno == cpu_to_be64( 86 bp ? bp->b_bn : XFS_BUF_DADDR_NULL); 87 } 88 89 lblock_ok = lblock_ok && 90 be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && 91 be16_to_cpu(block->bb_level) == level && 92 be16_to_cpu(block->bb_numrecs) <= 93 cur->bc_ops->get_maxrecs(cur, level) && 94 block->bb_u.l.bb_leftsib && 95 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || 96 XFS_FSB_SANITY_CHECK(mp, 97 be64_to_cpu(block->bb_u.l.bb_leftsib))) && 98 block->bb_u.l.bb_rightsib && 99 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || 100 XFS_FSB_SANITY_CHECK(mp, 101 be64_to_cpu(block->bb_u.l.bb_rightsib))); 102 103 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, 104 XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { 105 if (bp) 106 trace_xfs_btree_corrupt(bp, _RET_IP_); 107 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 108 return -EFSCORRUPTED; 109 } 110 return 0; 111 } 112 113 STATIC int /* error (0 or EFSCORRUPTED) */ 114 xfs_btree_check_sblock( 115 struct xfs_btree_cur *cur, /* btree cursor */ 116 struct xfs_btree_block *block, /* btree short form block pointer */ 117 int level, /* level of the btree block */ 118 struct xfs_buf *bp) /* buffer containing block */ 119 { 120 struct xfs_mount *mp; /* file system mount point */ 121 struct xfs_buf *agbp; /* buffer for ag. freespace struct */ 122 struct xfs_agf *agf; /* ag. freespace structure */ 123 xfs_agblock_t agflen; /* native ag. freespace length */ 124 int sblock_ok = 1; /* block passes checks */ 125 xfs_btnum_t btnum = cur->bc_btnum; 126 int crc; 127 128 mp = cur->bc_mp; 129 crc = xfs_sb_version_hascrc(&mp->m_sb); 130 agbp = cur->bc_private.a.agbp; 131 agf = XFS_BUF_TO_AGF(agbp); 132 agflen = be32_to_cpu(agf->agf_length); 133 134 if (crc) { 135 sblock_ok = sblock_ok && 136 uuid_equal(&block->bb_u.s.bb_uuid, 137 &mp->m_sb.sb_meta_uuid) && 138 block->bb_u.s.bb_blkno == cpu_to_be64( 139 bp ? bp->b_bn : XFS_BUF_DADDR_NULL); 140 } 141 142 sblock_ok = sblock_ok && 143 be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && 144 be16_to_cpu(block->bb_level) == level && 145 be16_to_cpu(block->bb_numrecs) <= 146 cur->bc_ops->get_maxrecs(cur, level) && 147 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || 148 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && 149 block->bb_u.s.bb_leftsib && 150 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || 151 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && 152 block->bb_u.s.bb_rightsib; 153 154 if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, 155 XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { 156 if (bp) 157 trace_xfs_btree_corrupt(bp, _RET_IP_); 158 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 159 return -EFSCORRUPTED; 160 } 161 return 0; 162 } 163 164 /* 165 * Debug routine: check that block header is ok. 166 */ 167 int 168 xfs_btree_check_block( 169 struct xfs_btree_cur *cur, /* btree cursor */ 170 struct xfs_btree_block *block, /* generic btree block pointer */ 171 int level, /* level of the btree block */ 172 struct xfs_buf *bp) /* buffer containing block, if any */ 173 { 174 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 175 return xfs_btree_check_lblock(cur, block, level, bp); 176 else 177 return xfs_btree_check_sblock(cur, block, level, bp); 178 } 179 180 /* 181 * Check that (long) pointer is ok. 182 */ 183 int /* error (0 or EFSCORRUPTED) */ 184 xfs_btree_check_lptr( 185 struct xfs_btree_cur *cur, /* btree cursor */ 186 xfs_fsblock_t bno, /* btree block disk address */ 187 int level) /* btree block level */ 188 { 189 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, 190 level > 0 && 191 bno != NULLFSBLOCK && 192 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 193 return 0; 194 } 195 196 #ifdef DEBUG 197 /* 198 * Check that (short) pointer is ok. 199 */ 200 STATIC int /* error (0 or EFSCORRUPTED) */ 201 xfs_btree_check_sptr( 202 struct xfs_btree_cur *cur, /* btree cursor */ 203 xfs_agblock_t bno, /* btree block disk address */ 204 int level) /* btree block level */ 205 { 206 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; 207 208 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, 209 level > 0 && 210 bno != NULLAGBLOCK && 211 bno != 0 && 212 bno < agblocks); 213 return 0; 214 } 215 216 /* 217 * Check that block ptr is ok. 218 */ 219 STATIC int /* error (0 or EFSCORRUPTED) */ 220 xfs_btree_check_ptr( 221 struct xfs_btree_cur *cur, /* btree cursor */ 222 union xfs_btree_ptr *ptr, /* btree block disk address */ 223 int index, /* offset from ptr to check */ 224 int level) /* btree block level */ 225 { 226 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 227 return xfs_btree_check_lptr(cur, 228 be64_to_cpu((&ptr->l)[index]), level); 229 } else { 230 return xfs_btree_check_sptr(cur, 231 be32_to_cpu((&ptr->s)[index]), level); 232 } 233 } 234 #endif 235 236 /* 237 * Calculate CRC on the whole btree block and stuff it into the 238 * long-form btree header. 239 * 240 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 241 * it into the buffer so recovery knows what the last modification was that made 242 * it to disk. 243 */ 244 void 245 xfs_btree_lblock_calc_crc( 246 struct xfs_buf *bp) 247 { 248 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 249 struct xfs_buf_log_item *bip = bp->b_fspriv; 250 251 if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 252 return; 253 if (bip) 254 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 255 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 256 } 257 258 bool 259 xfs_btree_lblock_verify_crc( 260 struct xfs_buf *bp) 261 { 262 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 263 struct xfs_mount *mp = bp->b_target->bt_mount; 264 265 if (xfs_sb_version_hascrc(&mp->m_sb)) { 266 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) 267 return false; 268 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 269 } 270 271 return true; 272 } 273 274 /* 275 * Calculate CRC on the whole btree block and stuff it into the 276 * short-form btree header. 277 * 278 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 279 * it into the buffer so recovery knows what the last modification was that made 280 * it to disk. 281 */ 282 void 283 xfs_btree_sblock_calc_crc( 284 struct xfs_buf *bp) 285 { 286 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 287 struct xfs_buf_log_item *bip = bp->b_fspriv; 288 289 if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 290 return; 291 if (bip) 292 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 293 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 294 } 295 296 bool 297 xfs_btree_sblock_verify_crc( 298 struct xfs_buf *bp) 299 { 300 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 301 struct xfs_mount *mp = bp->b_target->bt_mount; 302 303 if (xfs_sb_version_hascrc(&mp->m_sb)) { 304 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) 305 return false; 306 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 307 } 308 309 return true; 310 } 311 312 static int 313 xfs_btree_free_block( 314 struct xfs_btree_cur *cur, 315 struct xfs_buf *bp) 316 { 317 int error; 318 319 error = cur->bc_ops->free_block(cur, bp); 320 if (!error) { 321 xfs_trans_binval(cur->bc_tp, bp); 322 XFS_BTREE_STATS_INC(cur, free); 323 } 324 return error; 325 } 326 327 /* 328 * Delete the btree cursor. 329 */ 330 void 331 xfs_btree_del_cursor( 332 xfs_btree_cur_t *cur, /* btree cursor */ 333 int error) /* del because of error */ 334 { 335 int i; /* btree level */ 336 337 /* 338 * Clear the buffer pointers, and release the buffers. 339 * If we're doing this in the face of an error, we 340 * need to make sure to inspect all of the entries 341 * in the bc_bufs array for buffers to be unlocked. 342 * This is because some of the btree code works from 343 * level n down to 0, and if we get an error along 344 * the way we won't have initialized all the entries 345 * down to 0. 346 */ 347 for (i = 0; i < cur->bc_nlevels; i++) { 348 if (cur->bc_bufs[i]) 349 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); 350 else if (!error) 351 break; 352 } 353 /* 354 * Can't free a bmap cursor without having dealt with the 355 * allocated indirect blocks' accounting. 356 */ 357 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || 358 cur->bc_private.b.allocated == 0); 359 /* 360 * Free the cursor. 361 */ 362 kmem_zone_free(xfs_btree_cur_zone, cur); 363 } 364 365 /* 366 * Duplicate the btree cursor. 367 * Allocate a new one, copy the record, re-get the buffers. 368 */ 369 int /* error */ 370 xfs_btree_dup_cursor( 371 xfs_btree_cur_t *cur, /* input cursor */ 372 xfs_btree_cur_t **ncur) /* output cursor */ 373 { 374 xfs_buf_t *bp; /* btree block's buffer pointer */ 375 int error; /* error return value */ 376 int i; /* level number of btree block */ 377 xfs_mount_t *mp; /* mount structure for filesystem */ 378 xfs_btree_cur_t *new; /* new cursor value */ 379 xfs_trans_t *tp; /* transaction pointer, can be NULL */ 380 381 tp = cur->bc_tp; 382 mp = cur->bc_mp; 383 384 /* 385 * Allocate a new cursor like the old one. 386 */ 387 new = cur->bc_ops->dup_cursor(cur); 388 389 /* 390 * Copy the record currently in the cursor. 391 */ 392 new->bc_rec = cur->bc_rec; 393 394 /* 395 * For each level current, re-get the buffer and copy the ptr value. 396 */ 397 for (i = 0; i < new->bc_nlevels; i++) { 398 new->bc_ptrs[i] = cur->bc_ptrs[i]; 399 new->bc_ra[i] = cur->bc_ra[i]; 400 bp = cur->bc_bufs[i]; 401 if (bp) { 402 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 403 XFS_BUF_ADDR(bp), mp->m_bsize, 404 0, &bp, 405 cur->bc_ops->buf_ops); 406 if (error) { 407 xfs_btree_del_cursor(new, error); 408 *ncur = NULL; 409 return error; 410 } 411 } 412 new->bc_bufs[i] = bp; 413 } 414 *ncur = new; 415 return 0; 416 } 417 418 /* 419 * XFS btree block layout and addressing: 420 * 421 * There are two types of blocks in the btree: leaf and non-leaf blocks. 422 * 423 * The leaf record start with a header then followed by records containing 424 * the values. A non-leaf block also starts with the same header, and 425 * then first contains lookup keys followed by an equal number of pointers 426 * to the btree blocks at the previous level. 427 * 428 * +--------+-------+-------+-------+-------+-------+-------+ 429 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | 430 * +--------+-------+-------+-------+-------+-------+-------+ 431 * 432 * +--------+-------+-------+-------+-------+-------+-------+ 433 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | 434 * +--------+-------+-------+-------+-------+-------+-------+ 435 * 436 * The header is called struct xfs_btree_block for reasons better left unknown 437 * and comes in different versions for short (32bit) and long (64bit) block 438 * pointers. The record and key structures are defined by the btree instances 439 * and opaque to the btree core. The block pointers are simple disk endian 440 * integers, available in a short (32bit) and long (64bit) variant. 441 * 442 * The helpers below calculate the offset of a given record, key or pointer 443 * into a btree block (xfs_btree_*_offset) or return a pointer to the given 444 * record, key or pointer (xfs_btree_*_addr). Note that all addressing 445 * inside the btree block is done using indices starting at one, not zero! 446 * 447 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing 448 * overlapping intervals. In such a tree, records are still sorted lowest to 449 * highest and indexed by the smallest key value that refers to the record. 450 * However, nodes are different: each pointer has two associated keys -- one 451 * indexing the lowest key available in the block(s) below (the same behavior 452 * as the key in a regular btree) and another indexing the highest key 453 * available in the block(s) below. Because records are /not/ sorted by the 454 * highest key, all leaf block updates require us to compute the highest key 455 * that matches any record in the leaf and to recursively update the high keys 456 * in the nodes going further up in the tree, if necessary. Nodes look like 457 * this: 458 * 459 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 460 * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | 461 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 462 * 463 * To perform an interval query on an overlapped tree, perform the usual 464 * depth-first search and use the low and high keys to decide if we can skip 465 * that particular node. If a leaf node is reached, return the records that 466 * intersect the interval. Note that an interval query may return numerous 467 * entries. For a non-overlapped tree, simply search for the record associated 468 * with the lowest key and iterate forward until a non-matching record is 469 * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by 470 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in 471 * more detail. 472 * 473 * Why do we care about overlapping intervals? Let's say you have a bunch of 474 * reverse mapping records on a reflink filesystem: 475 * 476 * 1: +- file A startblock B offset C length D -----------+ 477 * 2: +- file E startblock F offset G length H --------------+ 478 * 3: +- file I startblock F offset J length K --+ 479 * 4: +- file L... --+ 480 * 481 * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, 482 * we'd simply increment the length of record 1. But how do we find the record 483 * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return 484 * record 3 because the keys are ordered first by startblock. An interval 485 * query would return records 1 and 2 because they both overlap (B+D-1), and 486 * from that we can pick out record 1 as the appropriate left neighbor. 487 * 488 * In the non-overlapped case you can do a LE lookup and decrement the cursor 489 * because a record's interval must end before the next record. 490 */ 491 492 /* 493 * Return size of the btree block header for this btree instance. 494 */ 495 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) 496 { 497 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 498 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 499 return XFS_BTREE_LBLOCK_CRC_LEN; 500 return XFS_BTREE_LBLOCK_LEN; 501 } 502 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 503 return XFS_BTREE_SBLOCK_CRC_LEN; 504 return XFS_BTREE_SBLOCK_LEN; 505 } 506 507 /* 508 * Return size of btree block pointers for this btree instance. 509 */ 510 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) 511 { 512 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 513 sizeof(__be64) : sizeof(__be32); 514 } 515 516 /* 517 * Calculate offset of the n-th record in a btree block. 518 */ 519 STATIC size_t 520 xfs_btree_rec_offset( 521 struct xfs_btree_cur *cur, 522 int n) 523 { 524 return xfs_btree_block_len(cur) + 525 (n - 1) * cur->bc_ops->rec_len; 526 } 527 528 /* 529 * Calculate offset of the n-th key in a btree block. 530 */ 531 STATIC size_t 532 xfs_btree_key_offset( 533 struct xfs_btree_cur *cur, 534 int n) 535 { 536 return xfs_btree_block_len(cur) + 537 (n - 1) * cur->bc_ops->key_len; 538 } 539 540 /* 541 * Calculate offset of the n-th high key in a btree block. 542 */ 543 STATIC size_t 544 xfs_btree_high_key_offset( 545 struct xfs_btree_cur *cur, 546 int n) 547 { 548 return xfs_btree_block_len(cur) + 549 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 550 } 551 552 /* 553 * Calculate offset of the n-th block pointer in a btree block. 554 */ 555 STATIC size_t 556 xfs_btree_ptr_offset( 557 struct xfs_btree_cur *cur, 558 int n, 559 int level) 560 { 561 return xfs_btree_block_len(cur) + 562 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 563 (n - 1) * xfs_btree_ptr_len(cur); 564 } 565 566 /* 567 * Return a pointer to the n-th record in the btree block. 568 */ 569 union xfs_btree_rec * 570 xfs_btree_rec_addr( 571 struct xfs_btree_cur *cur, 572 int n, 573 struct xfs_btree_block *block) 574 { 575 return (union xfs_btree_rec *) 576 ((char *)block + xfs_btree_rec_offset(cur, n)); 577 } 578 579 /* 580 * Return a pointer to the n-th key in the btree block. 581 */ 582 union xfs_btree_key * 583 xfs_btree_key_addr( 584 struct xfs_btree_cur *cur, 585 int n, 586 struct xfs_btree_block *block) 587 { 588 return (union xfs_btree_key *) 589 ((char *)block + xfs_btree_key_offset(cur, n)); 590 } 591 592 /* 593 * Return a pointer to the n-th high key in the btree block. 594 */ 595 union xfs_btree_key * 596 xfs_btree_high_key_addr( 597 struct xfs_btree_cur *cur, 598 int n, 599 struct xfs_btree_block *block) 600 { 601 return (union xfs_btree_key *) 602 ((char *)block + xfs_btree_high_key_offset(cur, n)); 603 } 604 605 /* 606 * Return a pointer to the n-th block pointer in the btree block. 607 */ 608 union xfs_btree_ptr * 609 xfs_btree_ptr_addr( 610 struct xfs_btree_cur *cur, 611 int n, 612 struct xfs_btree_block *block) 613 { 614 int level = xfs_btree_get_level(block); 615 616 ASSERT(block->bb_level != 0); 617 618 return (union xfs_btree_ptr *) 619 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 620 } 621 622 /* 623 * Get the root block which is stored in the inode. 624 * 625 * For now this btree implementation assumes the btree root is always 626 * stored in the if_broot field of an inode fork. 627 */ 628 STATIC struct xfs_btree_block * 629 xfs_btree_get_iroot( 630 struct xfs_btree_cur *cur) 631 { 632 struct xfs_ifork *ifp; 633 634 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); 635 return (struct xfs_btree_block *)ifp->if_broot; 636 } 637 638 /* 639 * Retrieve the block pointer from the cursor at the given level. 640 * This may be an inode btree root or from a buffer. 641 */ 642 struct xfs_btree_block * /* generic btree block pointer */ 643 xfs_btree_get_block( 644 struct xfs_btree_cur *cur, /* btree cursor */ 645 int level, /* level in btree */ 646 struct xfs_buf **bpp) /* buffer containing the block */ 647 { 648 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 649 (level == cur->bc_nlevels - 1)) { 650 *bpp = NULL; 651 return xfs_btree_get_iroot(cur); 652 } 653 654 *bpp = cur->bc_bufs[level]; 655 return XFS_BUF_TO_BLOCK(*bpp); 656 } 657 658 /* 659 * Get a buffer for the block, return it with no data read. 660 * Long-form addressing. 661 */ 662 xfs_buf_t * /* buffer for fsbno */ 663 xfs_btree_get_bufl( 664 xfs_mount_t *mp, /* file system mount point */ 665 xfs_trans_t *tp, /* transaction pointer */ 666 xfs_fsblock_t fsbno, /* file system block number */ 667 uint lock) /* lock flags for get_buf */ 668 { 669 xfs_daddr_t d; /* real disk block address */ 670 671 ASSERT(fsbno != NULLFSBLOCK); 672 d = XFS_FSB_TO_DADDR(mp, fsbno); 673 return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 674 } 675 676 /* 677 * Get a buffer for the block, return it with no data read. 678 * Short-form addressing. 679 */ 680 xfs_buf_t * /* buffer for agno/agbno */ 681 xfs_btree_get_bufs( 682 xfs_mount_t *mp, /* file system mount point */ 683 xfs_trans_t *tp, /* transaction pointer */ 684 xfs_agnumber_t agno, /* allocation group number */ 685 xfs_agblock_t agbno, /* allocation group block number */ 686 uint lock) /* lock flags for get_buf */ 687 { 688 xfs_daddr_t d; /* real disk block address */ 689 690 ASSERT(agno != NULLAGNUMBER); 691 ASSERT(agbno != NULLAGBLOCK); 692 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 693 return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 694 } 695 696 /* 697 * Check for the cursor referring to the last block at the given level. 698 */ 699 int /* 1=is last block, 0=not last block */ 700 xfs_btree_islastblock( 701 xfs_btree_cur_t *cur, /* btree cursor */ 702 int level) /* level to check */ 703 { 704 struct xfs_btree_block *block; /* generic btree block pointer */ 705 xfs_buf_t *bp; /* buffer containing block */ 706 707 block = xfs_btree_get_block(cur, level, &bp); 708 xfs_btree_check_block(cur, block, level, bp); 709 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 710 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); 711 else 712 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); 713 } 714 715 /* 716 * Change the cursor to point to the first record at the given level. 717 * Other levels are unaffected. 718 */ 719 STATIC int /* success=1, failure=0 */ 720 xfs_btree_firstrec( 721 xfs_btree_cur_t *cur, /* btree cursor */ 722 int level) /* level to change */ 723 { 724 struct xfs_btree_block *block; /* generic btree block pointer */ 725 xfs_buf_t *bp; /* buffer containing block */ 726 727 /* 728 * Get the block pointer for this level. 729 */ 730 block = xfs_btree_get_block(cur, level, &bp); 731 xfs_btree_check_block(cur, block, level, bp); 732 /* 733 * It's empty, there is no such record. 734 */ 735 if (!block->bb_numrecs) 736 return 0; 737 /* 738 * Set the ptr value to 1, that's the first record/key. 739 */ 740 cur->bc_ptrs[level] = 1; 741 return 1; 742 } 743 744 /* 745 * Change the cursor to point to the last record in the current block 746 * at the given level. Other levels are unaffected. 747 */ 748 STATIC int /* success=1, failure=0 */ 749 xfs_btree_lastrec( 750 xfs_btree_cur_t *cur, /* btree cursor */ 751 int level) /* level to change */ 752 { 753 struct xfs_btree_block *block; /* generic btree block pointer */ 754 xfs_buf_t *bp; /* buffer containing block */ 755 756 /* 757 * Get the block pointer for this level. 758 */ 759 block = xfs_btree_get_block(cur, level, &bp); 760 xfs_btree_check_block(cur, block, level, bp); 761 /* 762 * It's empty, there is no such record. 763 */ 764 if (!block->bb_numrecs) 765 return 0; 766 /* 767 * Set the ptr value to numrecs, that's the last record/key. 768 */ 769 cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); 770 return 1; 771 } 772 773 /* 774 * Compute first and last byte offsets for the fields given. 775 * Interprets the offsets table, which contains struct field offsets. 776 */ 777 void 778 xfs_btree_offsets( 779 int64_t fields, /* bitmask of fields */ 780 const short *offsets, /* table of field offsets */ 781 int nbits, /* number of bits to inspect */ 782 int *first, /* output: first byte offset */ 783 int *last) /* output: last byte offset */ 784 { 785 int i; /* current bit number */ 786 int64_t imask; /* mask for current bit number */ 787 788 ASSERT(fields != 0); 789 /* 790 * Find the lowest bit, so the first byte offset. 791 */ 792 for (i = 0, imask = 1LL; ; i++, imask <<= 1) { 793 if (imask & fields) { 794 *first = offsets[i]; 795 break; 796 } 797 } 798 /* 799 * Find the highest bit, so the last byte offset. 800 */ 801 for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { 802 if (imask & fields) { 803 *last = offsets[i + 1] - 1; 804 break; 805 } 806 } 807 } 808 809 /* 810 * Get a buffer for the block, return it read in. 811 * Long-form addressing. 812 */ 813 int 814 xfs_btree_read_bufl( 815 struct xfs_mount *mp, /* file system mount point */ 816 struct xfs_trans *tp, /* transaction pointer */ 817 xfs_fsblock_t fsbno, /* file system block number */ 818 uint lock, /* lock flags for read_buf */ 819 struct xfs_buf **bpp, /* buffer for fsbno */ 820 int refval, /* ref count value for buffer */ 821 const struct xfs_buf_ops *ops) 822 { 823 struct xfs_buf *bp; /* return value */ 824 xfs_daddr_t d; /* real disk block address */ 825 int error; 826 827 if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) 828 return -EFSCORRUPTED; 829 d = XFS_FSB_TO_DADDR(mp, fsbno); 830 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 831 mp->m_bsize, lock, &bp, ops); 832 if (error) 833 return error; 834 if (bp) 835 xfs_buf_set_ref(bp, refval); 836 *bpp = bp; 837 return 0; 838 } 839 840 /* 841 * Read-ahead the block, don't wait for it, don't return a buffer. 842 * Long-form addressing. 843 */ 844 /* ARGSUSED */ 845 void 846 xfs_btree_reada_bufl( 847 struct xfs_mount *mp, /* file system mount point */ 848 xfs_fsblock_t fsbno, /* file system block number */ 849 xfs_extlen_t count, /* count of filesystem blocks */ 850 const struct xfs_buf_ops *ops) 851 { 852 xfs_daddr_t d; 853 854 ASSERT(fsbno != NULLFSBLOCK); 855 d = XFS_FSB_TO_DADDR(mp, fsbno); 856 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 857 } 858 859 /* 860 * Read-ahead the block, don't wait for it, don't return a buffer. 861 * Short-form addressing. 862 */ 863 /* ARGSUSED */ 864 void 865 xfs_btree_reada_bufs( 866 struct xfs_mount *mp, /* file system mount point */ 867 xfs_agnumber_t agno, /* allocation group number */ 868 xfs_agblock_t agbno, /* allocation group block number */ 869 xfs_extlen_t count, /* count of filesystem blocks */ 870 const struct xfs_buf_ops *ops) 871 { 872 xfs_daddr_t d; 873 874 ASSERT(agno != NULLAGNUMBER); 875 ASSERT(agbno != NULLAGBLOCK); 876 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 877 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 878 } 879 880 STATIC int 881 xfs_btree_readahead_lblock( 882 struct xfs_btree_cur *cur, 883 int lr, 884 struct xfs_btree_block *block) 885 { 886 int rval = 0; 887 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 888 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 889 890 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 891 xfs_btree_reada_bufl(cur->bc_mp, left, 1, 892 cur->bc_ops->buf_ops); 893 rval++; 894 } 895 896 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 897 xfs_btree_reada_bufl(cur->bc_mp, right, 1, 898 cur->bc_ops->buf_ops); 899 rval++; 900 } 901 902 return rval; 903 } 904 905 STATIC int 906 xfs_btree_readahead_sblock( 907 struct xfs_btree_cur *cur, 908 int lr, 909 struct xfs_btree_block *block) 910 { 911 int rval = 0; 912 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); 913 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); 914 915 916 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 917 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 918 left, 1, cur->bc_ops->buf_ops); 919 rval++; 920 } 921 922 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 923 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 924 right, 1, cur->bc_ops->buf_ops); 925 rval++; 926 } 927 928 return rval; 929 } 930 931 /* 932 * Read-ahead btree blocks, at the given level. 933 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 934 */ 935 STATIC int 936 xfs_btree_readahead( 937 struct xfs_btree_cur *cur, /* btree cursor */ 938 int lev, /* level in btree */ 939 int lr) /* left/right bits */ 940 { 941 struct xfs_btree_block *block; 942 943 /* 944 * No readahead needed if we are at the root level and the 945 * btree root is stored in the inode. 946 */ 947 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 948 (lev == cur->bc_nlevels - 1)) 949 return 0; 950 951 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) 952 return 0; 953 954 cur->bc_ra[lev] |= lr; 955 block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); 956 957 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 958 return xfs_btree_readahead_lblock(cur, lr, block); 959 return xfs_btree_readahead_sblock(cur, lr, block); 960 } 961 962 STATIC xfs_daddr_t 963 xfs_btree_ptr_to_daddr( 964 struct xfs_btree_cur *cur, 965 union xfs_btree_ptr *ptr) 966 { 967 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 968 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK)); 969 970 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 971 } else { 972 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); 973 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); 974 975 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, 976 be32_to_cpu(ptr->s)); 977 } 978 } 979 980 /* 981 * Readahead @count btree blocks at the given @ptr location. 982 * 983 * We don't need to care about long or short form btrees here as we have a 984 * method of converting the ptr directly to a daddr available to us. 985 */ 986 STATIC void 987 xfs_btree_readahead_ptr( 988 struct xfs_btree_cur *cur, 989 union xfs_btree_ptr *ptr, 990 xfs_extlen_t count) 991 { 992 xfs_buf_readahead(cur->bc_mp->m_ddev_targp, 993 xfs_btree_ptr_to_daddr(cur, ptr), 994 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); 995 } 996 997 /* 998 * Set the buffer for level "lev" in the cursor to bp, releasing 999 * any previous buffer. 1000 */ 1001 STATIC void 1002 xfs_btree_setbuf( 1003 xfs_btree_cur_t *cur, /* btree cursor */ 1004 int lev, /* level in btree */ 1005 xfs_buf_t *bp) /* new buffer to set */ 1006 { 1007 struct xfs_btree_block *b; /* btree block */ 1008 1009 if (cur->bc_bufs[lev]) 1010 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); 1011 cur->bc_bufs[lev] = bp; 1012 cur->bc_ra[lev] = 0; 1013 1014 b = XFS_BUF_TO_BLOCK(bp); 1015 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1016 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) 1017 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 1018 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) 1019 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 1020 } else { 1021 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 1022 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 1023 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) 1024 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 1025 } 1026 } 1027 1028 STATIC int 1029 xfs_btree_ptr_is_null( 1030 struct xfs_btree_cur *cur, 1031 union xfs_btree_ptr *ptr) 1032 { 1033 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1034 return ptr->l == cpu_to_be64(NULLFSBLOCK); 1035 else 1036 return ptr->s == cpu_to_be32(NULLAGBLOCK); 1037 } 1038 1039 STATIC void 1040 xfs_btree_set_ptr_null( 1041 struct xfs_btree_cur *cur, 1042 union xfs_btree_ptr *ptr) 1043 { 1044 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1045 ptr->l = cpu_to_be64(NULLFSBLOCK); 1046 else 1047 ptr->s = cpu_to_be32(NULLAGBLOCK); 1048 } 1049 1050 /* 1051 * Get/set/init sibling pointers 1052 */ 1053 STATIC void 1054 xfs_btree_get_sibling( 1055 struct xfs_btree_cur *cur, 1056 struct xfs_btree_block *block, 1057 union xfs_btree_ptr *ptr, 1058 int lr) 1059 { 1060 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1061 1062 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1063 if (lr == XFS_BB_RIGHTSIB) 1064 ptr->l = block->bb_u.l.bb_rightsib; 1065 else 1066 ptr->l = block->bb_u.l.bb_leftsib; 1067 } else { 1068 if (lr == XFS_BB_RIGHTSIB) 1069 ptr->s = block->bb_u.s.bb_rightsib; 1070 else 1071 ptr->s = block->bb_u.s.bb_leftsib; 1072 } 1073 } 1074 1075 STATIC void 1076 xfs_btree_set_sibling( 1077 struct xfs_btree_cur *cur, 1078 struct xfs_btree_block *block, 1079 union xfs_btree_ptr *ptr, 1080 int lr) 1081 { 1082 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1083 1084 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1085 if (lr == XFS_BB_RIGHTSIB) 1086 block->bb_u.l.bb_rightsib = ptr->l; 1087 else 1088 block->bb_u.l.bb_leftsib = ptr->l; 1089 } else { 1090 if (lr == XFS_BB_RIGHTSIB) 1091 block->bb_u.s.bb_rightsib = ptr->s; 1092 else 1093 block->bb_u.s.bb_leftsib = ptr->s; 1094 } 1095 } 1096 1097 void 1098 xfs_btree_init_block_int( 1099 struct xfs_mount *mp, 1100 struct xfs_btree_block *buf, 1101 xfs_daddr_t blkno, 1102 xfs_btnum_t btnum, 1103 __u16 level, 1104 __u16 numrecs, 1105 __u64 owner, 1106 unsigned int flags) 1107 { 1108 int crc = xfs_sb_version_hascrc(&mp->m_sb); 1109 __u32 magic = xfs_btree_magic(crc, btnum); 1110 1111 buf->bb_magic = cpu_to_be32(magic); 1112 buf->bb_level = cpu_to_be16(level); 1113 buf->bb_numrecs = cpu_to_be16(numrecs); 1114 1115 if (flags & XFS_BTREE_LONG_PTRS) { 1116 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 1117 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 1118 if (crc) { 1119 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1120 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1121 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); 1122 buf->bb_u.l.bb_pad = 0; 1123 buf->bb_u.l.bb_lsn = 0; 1124 } 1125 } else { 1126 /* owner is a 32 bit value on short blocks */ 1127 __u32 __owner = (__u32)owner; 1128 1129 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 1130 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 1131 if (crc) { 1132 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1133 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 1134 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1135 buf->bb_u.s.bb_lsn = 0; 1136 } 1137 } 1138 } 1139 1140 void 1141 xfs_btree_init_block( 1142 struct xfs_mount *mp, 1143 struct xfs_buf *bp, 1144 xfs_btnum_t btnum, 1145 __u16 level, 1146 __u16 numrecs, 1147 __u64 owner, 1148 unsigned int flags) 1149 { 1150 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, 1151 btnum, level, numrecs, owner, flags); 1152 } 1153 1154 STATIC void 1155 xfs_btree_init_block_cur( 1156 struct xfs_btree_cur *cur, 1157 struct xfs_buf *bp, 1158 int level, 1159 int numrecs) 1160 { 1161 __u64 owner; 1162 1163 /* 1164 * we can pull the owner from the cursor right now as the different 1165 * owners align directly with the pointer size of the btree. This may 1166 * change in future, but is safe for current users of the generic btree 1167 * code. 1168 */ 1169 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1170 owner = cur->bc_private.b.ip->i_ino; 1171 else 1172 owner = cur->bc_private.a.agno; 1173 1174 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, 1175 cur->bc_btnum, level, numrecs, 1176 owner, cur->bc_flags); 1177 } 1178 1179 /* 1180 * Return true if ptr is the last record in the btree and 1181 * we need to track updates to this record. The decision 1182 * will be further refined in the update_lastrec method. 1183 */ 1184 STATIC int 1185 xfs_btree_is_lastrec( 1186 struct xfs_btree_cur *cur, 1187 struct xfs_btree_block *block, 1188 int level) 1189 { 1190 union xfs_btree_ptr ptr; 1191 1192 if (level > 0) 1193 return 0; 1194 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) 1195 return 0; 1196 1197 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1198 if (!xfs_btree_ptr_is_null(cur, &ptr)) 1199 return 0; 1200 return 1; 1201 } 1202 1203 STATIC void 1204 xfs_btree_buf_to_ptr( 1205 struct xfs_btree_cur *cur, 1206 struct xfs_buf *bp, 1207 union xfs_btree_ptr *ptr) 1208 { 1209 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1210 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1211 XFS_BUF_ADDR(bp))); 1212 else { 1213 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1214 XFS_BUF_ADDR(bp))); 1215 } 1216 } 1217 1218 STATIC void 1219 xfs_btree_set_refs( 1220 struct xfs_btree_cur *cur, 1221 struct xfs_buf *bp) 1222 { 1223 switch (cur->bc_btnum) { 1224 case XFS_BTNUM_BNO: 1225 case XFS_BTNUM_CNT: 1226 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); 1227 break; 1228 case XFS_BTNUM_INO: 1229 case XFS_BTNUM_FINO: 1230 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); 1231 break; 1232 case XFS_BTNUM_BMAP: 1233 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); 1234 break; 1235 case XFS_BTNUM_RMAP: 1236 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); 1237 break; 1238 case XFS_BTNUM_REFC: 1239 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); 1240 break; 1241 default: 1242 ASSERT(0); 1243 } 1244 } 1245 1246 STATIC int 1247 xfs_btree_get_buf_block( 1248 struct xfs_btree_cur *cur, 1249 union xfs_btree_ptr *ptr, 1250 int flags, 1251 struct xfs_btree_block **block, 1252 struct xfs_buf **bpp) 1253 { 1254 struct xfs_mount *mp = cur->bc_mp; 1255 xfs_daddr_t d; 1256 1257 /* need to sort out how callers deal with failures first */ 1258 ASSERT(!(flags & XBF_TRYLOCK)); 1259 1260 d = xfs_btree_ptr_to_daddr(cur, ptr); 1261 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 1262 mp->m_bsize, flags); 1263 1264 if (!*bpp) 1265 return -ENOMEM; 1266 1267 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1268 *block = XFS_BUF_TO_BLOCK(*bpp); 1269 return 0; 1270 } 1271 1272 /* 1273 * Read in the buffer at the given ptr and return the buffer and 1274 * the block pointer within the buffer. 1275 */ 1276 STATIC int 1277 xfs_btree_read_buf_block( 1278 struct xfs_btree_cur *cur, 1279 union xfs_btree_ptr *ptr, 1280 int flags, 1281 struct xfs_btree_block **block, 1282 struct xfs_buf **bpp) 1283 { 1284 struct xfs_mount *mp = cur->bc_mp; 1285 xfs_daddr_t d; 1286 int error; 1287 1288 /* need to sort out how callers deal with failures first */ 1289 ASSERT(!(flags & XBF_TRYLOCK)); 1290 1291 d = xfs_btree_ptr_to_daddr(cur, ptr); 1292 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1293 mp->m_bsize, flags, bpp, 1294 cur->bc_ops->buf_ops); 1295 if (error) 1296 return error; 1297 1298 xfs_btree_set_refs(cur, *bpp); 1299 *block = XFS_BUF_TO_BLOCK(*bpp); 1300 return 0; 1301 } 1302 1303 /* 1304 * Copy keys from one btree block to another. 1305 */ 1306 STATIC void 1307 xfs_btree_copy_keys( 1308 struct xfs_btree_cur *cur, 1309 union xfs_btree_key *dst_key, 1310 union xfs_btree_key *src_key, 1311 int numkeys) 1312 { 1313 ASSERT(numkeys >= 0); 1314 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1315 } 1316 1317 /* 1318 * Copy records from one btree block to another. 1319 */ 1320 STATIC void 1321 xfs_btree_copy_recs( 1322 struct xfs_btree_cur *cur, 1323 union xfs_btree_rec *dst_rec, 1324 union xfs_btree_rec *src_rec, 1325 int numrecs) 1326 { 1327 ASSERT(numrecs >= 0); 1328 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1329 } 1330 1331 /* 1332 * Copy block pointers from one btree block to another. 1333 */ 1334 STATIC void 1335 xfs_btree_copy_ptrs( 1336 struct xfs_btree_cur *cur, 1337 union xfs_btree_ptr *dst_ptr, 1338 union xfs_btree_ptr *src_ptr, 1339 int numptrs) 1340 { 1341 ASSERT(numptrs >= 0); 1342 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); 1343 } 1344 1345 /* 1346 * Shift keys one index left/right inside a single btree block. 1347 */ 1348 STATIC void 1349 xfs_btree_shift_keys( 1350 struct xfs_btree_cur *cur, 1351 union xfs_btree_key *key, 1352 int dir, 1353 int numkeys) 1354 { 1355 char *dst_key; 1356 1357 ASSERT(numkeys >= 0); 1358 ASSERT(dir == 1 || dir == -1); 1359 1360 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1361 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1362 } 1363 1364 /* 1365 * Shift records one index left/right inside a single btree block. 1366 */ 1367 STATIC void 1368 xfs_btree_shift_recs( 1369 struct xfs_btree_cur *cur, 1370 union xfs_btree_rec *rec, 1371 int dir, 1372 int numrecs) 1373 { 1374 char *dst_rec; 1375 1376 ASSERT(numrecs >= 0); 1377 ASSERT(dir == 1 || dir == -1); 1378 1379 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1380 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1381 } 1382 1383 /* 1384 * Shift block pointers one index left/right inside a single btree block. 1385 */ 1386 STATIC void 1387 xfs_btree_shift_ptrs( 1388 struct xfs_btree_cur *cur, 1389 union xfs_btree_ptr *ptr, 1390 int dir, 1391 int numptrs) 1392 { 1393 char *dst_ptr; 1394 1395 ASSERT(numptrs >= 0); 1396 ASSERT(dir == 1 || dir == -1); 1397 1398 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); 1399 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); 1400 } 1401 1402 /* 1403 * Log key values from the btree block. 1404 */ 1405 STATIC void 1406 xfs_btree_log_keys( 1407 struct xfs_btree_cur *cur, 1408 struct xfs_buf *bp, 1409 int first, 1410 int last) 1411 { 1412 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1413 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); 1414 1415 if (bp) { 1416 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1417 xfs_trans_log_buf(cur->bc_tp, bp, 1418 xfs_btree_key_offset(cur, first), 1419 xfs_btree_key_offset(cur, last + 1) - 1); 1420 } else { 1421 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, 1422 xfs_ilog_fbroot(cur->bc_private.b.whichfork)); 1423 } 1424 1425 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1426 } 1427 1428 /* 1429 * Log record values from the btree block. 1430 */ 1431 void 1432 xfs_btree_log_recs( 1433 struct xfs_btree_cur *cur, 1434 struct xfs_buf *bp, 1435 int first, 1436 int last) 1437 { 1438 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1439 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); 1440 1441 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1442 xfs_trans_log_buf(cur->bc_tp, bp, 1443 xfs_btree_rec_offset(cur, first), 1444 xfs_btree_rec_offset(cur, last + 1) - 1); 1445 1446 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1447 } 1448 1449 /* 1450 * Log block pointer fields from a btree block (nonleaf). 1451 */ 1452 STATIC void 1453 xfs_btree_log_ptrs( 1454 struct xfs_btree_cur *cur, /* btree cursor */ 1455 struct xfs_buf *bp, /* buffer containing btree block */ 1456 int first, /* index of first pointer to log */ 1457 int last) /* index of last pointer to log */ 1458 { 1459 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1460 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); 1461 1462 if (bp) { 1463 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1464 int level = xfs_btree_get_level(block); 1465 1466 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1467 xfs_trans_log_buf(cur->bc_tp, bp, 1468 xfs_btree_ptr_offset(cur, first, level), 1469 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1470 } else { 1471 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, 1472 xfs_ilog_fbroot(cur->bc_private.b.whichfork)); 1473 } 1474 1475 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1476 } 1477 1478 /* 1479 * Log fields from a btree block header. 1480 */ 1481 void 1482 xfs_btree_log_block( 1483 struct xfs_btree_cur *cur, /* btree cursor */ 1484 struct xfs_buf *bp, /* buffer containing btree block */ 1485 int fields) /* mask of fields: XFS_BB_... */ 1486 { 1487 int first; /* first byte offset logged */ 1488 int last; /* last byte offset logged */ 1489 static const short soffsets[] = { /* table of offsets (short) */ 1490 offsetof(struct xfs_btree_block, bb_magic), 1491 offsetof(struct xfs_btree_block, bb_level), 1492 offsetof(struct xfs_btree_block, bb_numrecs), 1493 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1494 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1495 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1496 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1497 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1498 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1499 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1500 XFS_BTREE_SBLOCK_CRC_LEN 1501 }; 1502 static const short loffsets[] = { /* table of offsets (long) */ 1503 offsetof(struct xfs_btree_block, bb_magic), 1504 offsetof(struct xfs_btree_block, bb_level), 1505 offsetof(struct xfs_btree_block, bb_numrecs), 1506 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1507 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1508 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1509 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1510 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1511 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1512 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1513 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1514 XFS_BTREE_LBLOCK_CRC_LEN 1515 }; 1516 1517 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1518 XFS_BTREE_TRACE_ARGBI(cur, bp, fields); 1519 1520 if (bp) { 1521 int nbits; 1522 1523 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 1524 /* 1525 * We don't log the CRC when updating a btree 1526 * block but instead recreate it during log 1527 * recovery. As the log buffers have checksums 1528 * of their own this is safe and avoids logging a crc 1529 * update in a lot of places. 1530 */ 1531 if (fields == XFS_BB_ALL_BITS) 1532 fields = XFS_BB_ALL_BITS_CRC; 1533 nbits = XFS_BB_NUM_BITS_CRC; 1534 } else { 1535 nbits = XFS_BB_NUM_BITS; 1536 } 1537 xfs_btree_offsets(fields, 1538 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 1539 loffsets : soffsets, 1540 nbits, &first, &last); 1541 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1542 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1543 } else { 1544 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, 1545 xfs_ilog_fbroot(cur->bc_private.b.whichfork)); 1546 } 1547 1548 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1549 } 1550 1551 /* 1552 * Increment cursor by one record at the level. 1553 * For nonzero levels the leaf-ward information is untouched. 1554 */ 1555 int /* error */ 1556 xfs_btree_increment( 1557 struct xfs_btree_cur *cur, 1558 int level, 1559 int *stat) /* success/failure */ 1560 { 1561 struct xfs_btree_block *block; 1562 union xfs_btree_ptr ptr; 1563 struct xfs_buf *bp; 1564 int error; /* error return value */ 1565 int lev; 1566 1567 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1568 XFS_BTREE_TRACE_ARGI(cur, level); 1569 1570 ASSERT(level < cur->bc_nlevels); 1571 1572 /* Read-ahead to the right at this level. */ 1573 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1574 1575 /* Get a pointer to the btree block. */ 1576 block = xfs_btree_get_block(cur, level, &bp); 1577 1578 #ifdef DEBUG 1579 error = xfs_btree_check_block(cur, block, level, bp); 1580 if (error) 1581 goto error0; 1582 #endif 1583 1584 /* We're done if we remain in the block after the increment. */ 1585 if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) 1586 goto out1; 1587 1588 /* Fail if we just went off the right edge of the tree. */ 1589 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1590 if (xfs_btree_ptr_is_null(cur, &ptr)) 1591 goto out0; 1592 1593 XFS_BTREE_STATS_INC(cur, increment); 1594 1595 /* 1596 * March up the tree incrementing pointers. 1597 * Stop when we don't go off the right edge of a block. 1598 */ 1599 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1600 block = xfs_btree_get_block(cur, lev, &bp); 1601 1602 #ifdef DEBUG 1603 error = xfs_btree_check_block(cur, block, lev, bp); 1604 if (error) 1605 goto error0; 1606 #endif 1607 1608 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) 1609 break; 1610 1611 /* Read-ahead the right block for the next loop. */ 1612 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1613 } 1614 1615 /* 1616 * If we went off the root then we are either seriously 1617 * confused or have the tree root in an inode. 1618 */ 1619 if (lev == cur->bc_nlevels) { 1620 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1621 goto out0; 1622 ASSERT(0); 1623 error = -EFSCORRUPTED; 1624 goto error0; 1625 } 1626 ASSERT(lev < cur->bc_nlevels); 1627 1628 /* 1629 * Now walk back down the tree, fixing up the cursor's buffer 1630 * pointers and key numbers. 1631 */ 1632 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1633 union xfs_btree_ptr *ptrp; 1634 1635 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); 1636 --lev; 1637 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1638 if (error) 1639 goto error0; 1640 1641 xfs_btree_setbuf(cur, lev, bp); 1642 cur->bc_ptrs[lev] = 1; 1643 } 1644 out1: 1645 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1646 *stat = 1; 1647 return 0; 1648 1649 out0: 1650 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1651 *stat = 0; 1652 return 0; 1653 1654 error0: 1655 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 1656 return error; 1657 } 1658 1659 /* 1660 * Decrement cursor by one record at the level. 1661 * For nonzero levels the leaf-ward information is untouched. 1662 */ 1663 int /* error */ 1664 xfs_btree_decrement( 1665 struct xfs_btree_cur *cur, 1666 int level, 1667 int *stat) /* success/failure */ 1668 { 1669 struct xfs_btree_block *block; 1670 xfs_buf_t *bp; 1671 int error; /* error return value */ 1672 int lev; 1673 union xfs_btree_ptr ptr; 1674 1675 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1676 XFS_BTREE_TRACE_ARGI(cur, level); 1677 1678 ASSERT(level < cur->bc_nlevels); 1679 1680 /* Read-ahead to the left at this level. */ 1681 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1682 1683 /* We're done if we remain in the block after the decrement. */ 1684 if (--cur->bc_ptrs[level] > 0) 1685 goto out1; 1686 1687 /* Get a pointer to the btree block. */ 1688 block = xfs_btree_get_block(cur, level, &bp); 1689 1690 #ifdef DEBUG 1691 error = xfs_btree_check_block(cur, block, level, bp); 1692 if (error) 1693 goto error0; 1694 #endif 1695 1696 /* Fail if we just went off the left edge of the tree. */ 1697 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1698 if (xfs_btree_ptr_is_null(cur, &ptr)) 1699 goto out0; 1700 1701 XFS_BTREE_STATS_INC(cur, decrement); 1702 1703 /* 1704 * March up the tree decrementing pointers. 1705 * Stop when we don't go off the left edge of a block. 1706 */ 1707 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1708 if (--cur->bc_ptrs[lev] > 0) 1709 break; 1710 /* Read-ahead the left block for the next loop. */ 1711 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1712 } 1713 1714 /* 1715 * If we went off the root then we are seriously confused. 1716 * or the root of the tree is in an inode. 1717 */ 1718 if (lev == cur->bc_nlevels) { 1719 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1720 goto out0; 1721 ASSERT(0); 1722 error = -EFSCORRUPTED; 1723 goto error0; 1724 } 1725 ASSERT(lev < cur->bc_nlevels); 1726 1727 /* 1728 * Now walk back down the tree, fixing up the cursor's buffer 1729 * pointers and key numbers. 1730 */ 1731 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1732 union xfs_btree_ptr *ptrp; 1733 1734 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); 1735 --lev; 1736 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1737 if (error) 1738 goto error0; 1739 xfs_btree_setbuf(cur, lev, bp); 1740 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); 1741 } 1742 out1: 1743 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1744 *stat = 1; 1745 return 0; 1746 1747 out0: 1748 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1749 *stat = 0; 1750 return 0; 1751 1752 error0: 1753 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 1754 return error; 1755 } 1756 1757 int 1758 xfs_btree_lookup_get_block( 1759 struct xfs_btree_cur *cur, /* btree cursor */ 1760 int level, /* level in the btree */ 1761 union xfs_btree_ptr *pp, /* ptr to btree block */ 1762 struct xfs_btree_block **blkp) /* return btree block */ 1763 { 1764 struct xfs_buf *bp; /* buffer pointer for btree block */ 1765 int error = 0; 1766 1767 /* special case the root block if in an inode */ 1768 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 1769 (level == cur->bc_nlevels - 1)) { 1770 *blkp = xfs_btree_get_iroot(cur); 1771 return 0; 1772 } 1773 1774 /* 1775 * If the old buffer at this level for the disk address we are 1776 * looking for re-use it. 1777 * 1778 * Otherwise throw it away and get a new one. 1779 */ 1780 bp = cur->bc_bufs[level]; 1781 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { 1782 *blkp = XFS_BUF_TO_BLOCK(bp); 1783 return 0; 1784 } 1785 1786 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1787 if (error) 1788 return error; 1789 1790 /* Check the inode owner since the verifiers don't. */ 1791 if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && 1792 (cur->bc_flags & XFS_BTREE_LONG_PTRS) && 1793 be64_to_cpu((*blkp)->bb_u.l.bb_owner) != 1794 cur->bc_private.b.ip->i_ino) 1795 goto out_bad; 1796 1797 /* Did we get the level we were looking for? */ 1798 if (be16_to_cpu((*blkp)->bb_level) != level) 1799 goto out_bad; 1800 1801 /* Check that internal nodes have at least one record. */ 1802 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1803 goto out_bad; 1804 1805 xfs_btree_setbuf(cur, level, bp); 1806 return 0; 1807 1808 out_bad: 1809 *blkp = NULL; 1810 xfs_trans_brelse(cur->bc_tp, bp); 1811 return -EFSCORRUPTED; 1812 } 1813 1814 /* 1815 * Get current search key. For level 0 we don't actually have a key 1816 * structure so we make one up from the record. For all other levels 1817 * we just return the right key. 1818 */ 1819 STATIC union xfs_btree_key * 1820 xfs_lookup_get_search_key( 1821 struct xfs_btree_cur *cur, 1822 int level, 1823 int keyno, 1824 struct xfs_btree_block *block, 1825 union xfs_btree_key *kp) 1826 { 1827 if (level == 0) { 1828 cur->bc_ops->init_key_from_rec(kp, 1829 xfs_btree_rec_addr(cur, keyno, block)); 1830 return kp; 1831 } 1832 1833 return xfs_btree_key_addr(cur, keyno, block); 1834 } 1835 1836 /* 1837 * Lookup the record. The cursor is made to point to it, based on dir. 1838 * stat is set to 0 if can't find any such record, 1 for success. 1839 */ 1840 int /* error */ 1841 xfs_btree_lookup( 1842 struct xfs_btree_cur *cur, /* btree cursor */ 1843 xfs_lookup_t dir, /* <=, ==, or >= */ 1844 int *stat) /* success/failure */ 1845 { 1846 struct xfs_btree_block *block; /* current btree block */ 1847 int64_t diff; /* difference for the current key */ 1848 int error; /* error return value */ 1849 int keyno; /* current key number */ 1850 int level; /* level in the btree */ 1851 union xfs_btree_ptr *pp; /* ptr to btree block */ 1852 union xfs_btree_ptr ptr; /* ptr to btree block */ 1853 1854 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 1855 XFS_BTREE_TRACE_ARGI(cur, dir); 1856 1857 XFS_BTREE_STATS_INC(cur, lookup); 1858 1859 /* No such thing as a zero-level tree. */ 1860 if (cur->bc_nlevels == 0) 1861 return -EFSCORRUPTED; 1862 1863 block = NULL; 1864 keyno = 0; 1865 1866 /* initialise start pointer from cursor */ 1867 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 1868 pp = &ptr; 1869 1870 /* 1871 * Iterate over each level in the btree, starting at the root. 1872 * For each level above the leaves, find the key we need, based 1873 * on the lookup record, then follow the corresponding block 1874 * pointer down to the next level. 1875 */ 1876 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { 1877 /* Get the block we need to do the lookup on. */ 1878 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 1879 if (error) 1880 goto error0; 1881 1882 if (diff == 0) { 1883 /* 1884 * If we already had a key match at a higher level, we 1885 * know we need to use the first entry in this block. 1886 */ 1887 keyno = 1; 1888 } else { 1889 /* Otherwise search this block. Do a binary search. */ 1890 1891 int high; /* high entry number */ 1892 int low; /* low entry number */ 1893 1894 /* Set low and high entry numbers, 1-based. */ 1895 low = 1; 1896 high = xfs_btree_get_numrecs(block); 1897 if (!high) { 1898 /* Block is empty, must be an empty leaf. */ 1899 ASSERT(level == 0 && cur->bc_nlevels == 1); 1900 1901 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; 1902 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1903 *stat = 0; 1904 return 0; 1905 } 1906 1907 /* Binary search the block. */ 1908 while (low <= high) { 1909 union xfs_btree_key key; 1910 union xfs_btree_key *kp; 1911 1912 XFS_BTREE_STATS_INC(cur, compare); 1913 1914 /* keyno is average of low and high. */ 1915 keyno = (low + high) >> 1; 1916 1917 /* Get current search key */ 1918 kp = xfs_lookup_get_search_key(cur, level, 1919 keyno, block, &key); 1920 1921 /* 1922 * Compute difference to get next direction: 1923 * - less than, move right 1924 * - greater than, move left 1925 * - equal, we're done 1926 */ 1927 diff = cur->bc_ops->key_diff(cur, kp); 1928 if (diff < 0) 1929 low = keyno + 1; 1930 else if (diff > 0) 1931 high = keyno - 1; 1932 else 1933 break; 1934 } 1935 } 1936 1937 /* 1938 * If there are more levels, set up for the next level 1939 * by getting the block number and filling in the cursor. 1940 */ 1941 if (level > 0) { 1942 /* 1943 * If we moved left, need the previous key number, 1944 * unless there isn't one. 1945 */ 1946 if (diff > 0 && --keyno < 1) 1947 keyno = 1; 1948 pp = xfs_btree_ptr_addr(cur, keyno, block); 1949 1950 #ifdef DEBUG 1951 error = xfs_btree_check_ptr(cur, pp, 0, level); 1952 if (error) 1953 goto error0; 1954 #endif 1955 cur->bc_ptrs[level] = keyno; 1956 } 1957 } 1958 1959 /* Done with the search. See if we need to adjust the results. */ 1960 if (dir != XFS_LOOKUP_LE && diff < 0) { 1961 keyno++; 1962 /* 1963 * If ge search and we went off the end of the block, but it's 1964 * not the last block, we're in the wrong block. 1965 */ 1966 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1967 if (dir == XFS_LOOKUP_GE && 1968 keyno > xfs_btree_get_numrecs(block) && 1969 !xfs_btree_ptr_is_null(cur, &ptr)) { 1970 int i; 1971 1972 cur->bc_ptrs[0] = keyno; 1973 error = xfs_btree_increment(cur, 0, &i); 1974 if (error) 1975 goto error0; 1976 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); 1977 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1978 *stat = 1; 1979 return 0; 1980 } 1981 } else if (dir == XFS_LOOKUP_LE && diff > 0) 1982 keyno--; 1983 cur->bc_ptrs[0] = keyno; 1984 1985 /* Return if we succeeded or not. */ 1986 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 1987 *stat = 0; 1988 else if (dir != XFS_LOOKUP_EQ || diff == 0) 1989 *stat = 1; 1990 else 1991 *stat = 0; 1992 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1993 return 0; 1994 1995 error0: 1996 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 1997 return error; 1998 } 1999 2000 /* Find the high key storage area from a regular key. */ 2001 STATIC union xfs_btree_key * 2002 xfs_btree_high_key_from_key( 2003 struct xfs_btree_cur *cur, 2004 union xfs_btree_key *key) 2005 { 2006 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2007 return (union xfs_btree_key *)((char *)key + 2008 (cur->bc_ops->key_len / 2)); 2009 } 2010 2011 /* Determine the low (and high if overlapped) keys of a leaf block */ 2012 STATIC void 2013 xfs_btree_get_leaf_keys( 2014 struct xfs_btree_cur *cur, 2015 struct xfs_btree_block *block, 2016 union xfs_btree_key *key) 2017 { 2018 union xfs_btree_key max_hkey; 2019 union xfs_btree_key hkey; 2020 union xfs_btree_rec *rec; 2021 union xfs_btree_key *high; 2022 int n; 2023 2024 rec = xfs_btree_rec_addr(cur, 1, block); 2025 cur->bc_ops->init_key_from_rec(key, rec); 2026 2027 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2028 2029 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2030 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2031 rec = xfs_btree_rec_addr(cur, n, block); 2032 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2033 if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) 2034 > 0) 2035 max_hkey = hkey; 2036 } 2037 2038 high = xfs_btree_high_key_from_key(cur, key); 2039 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2040 } 2041 } 2042 2043 /* Determine the low (and high if overlapped) keys of a node block */ 2044 STATIC void 2045 xfs_btree_get_node_keys( 2046 struct xfs_btree_cur *cur, 2047 struct xfs_btree_block *block, 2048 union xfs_btree_key *key) 2049 { 2050 union xfs_btree_key *hkey; 2051 union xfs_btree_key *max_hkey; 2052 union xfs_btree_key *high; 2053 int n; 2054 2055 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2056 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2057 cur->bc_ops->key_len / 2); 2058 2059 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2060 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2061 hkey = xfs_btree_high_key_addr(cur, n, block); 2062 if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) 2063 max_hkey = hkey; 2064 } 2065 2066 high = xfs_btree_high_key_from_key(cur, key); 2067 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2068 } else { 2069 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2070 cur->bc_ops->key_len); 2071 } 2072 } 2073 2074 /* Derive the keys for any btree block. */ 2075 STATIC void 2076 xfs_btree_get_keys( 2077 struct xfs_btree_cur *cur, 2078 struct xfs_btree_block *block, 2079 union xfs_btree_key *key) 2080 { 2081 if (be16_to_cpu(block->bb_level) == 0) 2082 xfs_btree_get_leaf_keys(cur, block, key); 2083 else 2084 xfs_btree_get_node_keys(cur, block, key); 2085 } 2086 2087 /* 2088 * Decide if we need to update the parent keys of a btree block. For 2089 * a standard btree this is only necessary if we're updating the first 2090 * record/key. For an overlapping btree, we must always update the 2091 * keys because the highest key can be in any of the records or keys 2092 * in the block. 2093 */ 2094 static inline bool 2095 xfs_btree_needs_key_update( 2096 struct xfs_btree_cur *cur, 2097 int ptr) 2098 { 2099 return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; 2100 } 2101 2102 /* 2103 * Update the low and high parent keys of the given level, progressing 2104 * towards the root. If force_all is false, stop if the keys for a given 2105 * level do not need updating. 2106 */ 2107 STATIC int 2108 __xfs_btree_updkeys( 2109 struct xfs_btree_cur *cur, 2110 int level, 2111 struct xfs_btree_block *block, 2112 struct xfs_buf *bp0, 2113 bool force_all) 2114 { 2115 union xfs_btree_key key; /* keys from current level */ 2116 union xfs_btree_key *lkey; /* keys from the next level up */ 2117 union xfs_btree_key *hkey; 2118 union xfs_btree_key *nlkey; /* keys from the next level up */ 2119 union xfs_btree_key *nhkey; 2120 struct xfs_buf *bp; 2121 int ptr; 2122 2123 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2124 2125 /* Exit if there aren't any parent levels to update. */ 2126 if (level + 1 >= cur->bc_nlevels) 2127 return 0; 2128 2129 trace_xfs_btree_updkeys(cur, level, bp0); 2130 2131 lkey = &key; 2132 hkey = xfs_btree_high_key_from_key(cur, lkey); 2133 xfs_btree_get_keys(cur, block, lkey); 2134 for (level++; level < cur->bc_nlevels; level++) { 2135 #ifdef DEBUG 2136 int error; 2137 #endif 2138 block = xfs_btree_get_block(cur, level, &bp); 2139 trace_xfs_btree_updkeys(cur, level, bp); 2140 #ifdef DEBUG 2141 error = xfs_btree_check_block(cur, block, level, bp); 2142 if (error) { 2143 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2144 return error; 2145 } 2146 #endif 2147 ptr = cur->bc_ptrs[level]; 2148 nlkey = xfs_btree_key_addr(cur, ptr, block); 2149 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2150 if (!force_all && 2151 !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || 2152 cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) 2153 break; 2154 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2155 xfs_btree_log_keys(cur, bp, ptr, ptr); 2156 if (level + 1 >= cur->bc_nlevels) 2157 break; 2158 xfs_btree_get_node_keys(cur, block, lkey); 2159 } 2160 2161 return 0; 2162 } 2163 2164 /* Update all the keys from some level in cursor back to the root. */ 2165 STATIC int 2166 xfs_btree_updkeys_force( 2167 struct xfs_btree_cur *cur, 2168 int level) 2169 { 2170 struct xfs_buf *bp; 2171 struct xfs_btree_block *block; 2172 2173 block = xfs_btree_get_block(cur, level, &bp); 2174 return __xfs_btree_updkeys(cur, level, block, bp, true); 2175 } 2176 2177 /* 2178 * Update the parent keys of the given level, progressing towards the root. 2179 */ 2180 STATIC int 2181 xfs_btree_update_keys( 2182 struct xfs_btree_cur *cur, 2183 int level) 2184 { 2185 struct xfs_btree_block *block; 2186 struct xfs_buf *bp; 2187 union xfs_btree_key *kp; 2188 union xfs_btree_key key; 2189 int ptr; 2190 2191 ASSERT(level >= 0); 2192 2193 block = xfs_btree_get_block(cur, level, &bp); 2194 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) 2195 return __xfs_btree_updkeys(cur, level, block, bp, false); 2196 2197 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2198 XFS_BTREE_TRACE_ARGIK(cur, level, keyp); 2199 2200 /* 2201 * Go up the tree from this level toward the root. 2202 * At each level, update the key value to the value input. 2203 * Stop when we reach a level where the cursor isn't pointing 2204 * at the first entry in the block. 2205 */ 2206 xfs_btree_get_keys(cur, block, &key); 2207 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2208 #ifdef DEBUG 2209 int error; 2210 #endif 2211 block = xfs_btree_get_block(cur, level, &bp); 2212 #ifdef DEBUG 2213 error = xfs_btree_check_block(cur, block, level, bp); 2214 if (error) { 2215 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2216 return error; 2217 } 2218 #endif 2219 ptr = cur->bc_ptrs[level]; 2220 kp = xfs_btree_key_addr(cur, ptr, block); 2221 xfs_btree_copy_keys(cur, kp, &key, 1); 2222 xfs_btree_log_keys(cur, bp, ptr, ptr); 2223 } 2224 2225 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2226 return 0; 2227 } 2228 2229 /* 2230 * Update the record referred to by cur to the value in the 2231 * given record. This either works (return 0) or gets an 2232 * EFSCORRUPTED error. 2233 */ 2234 int 2235 xfs_btree_update( 2236 struct xfs_btree_cur *cur, 2237 union xfs_btree_rec *rec) 2238 { 2239 struct xfs_btree_block *block; 2240 struct xfs_buf *bp; 2241 int error; 2242 int ptr; 2243 union xfs_btree_rec *rp; 2244 2245 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2246 XFS_BTREE_TRACE_ARGR(cur, rec); 2247 2248 /* Pick up the current block. */ 2249 block = xfs_btree_get_block(cur, 0, &bp); 2250 2251 #ifdef DEBUG 2252 error = xfs_btree_check_block(cur, block, 0, bp); 2253 if (error) 2254 goto error0; 2255 #endif 2256 /* Get the address of the rec to be updated. */ 2257 ptr = cur->bc_ptrs[0]; 2258 rp = xfs_btree_rec_addr(cur, ptr, block); 2259 2260 /* Fill in the new contents and log them. */ 2261 xfs_btree_copy_recs(cur, rp, rec, 1); 2262 xfs_btree_log_recs(cur, bp, ptr, ptr); 2263 2264 /* 2265 * If we are tracking the last record in the tree and 2266 * we are at the far right edge of the tree, update it. 2267 */ 2268 if (xfs_btree_is_lastrec(cur, block, 0)) { 2269 cur->bc_ops->update_lastrec(cur, block, rec, 2270 ptr, LASTREC_UPDATE); 2271 } 2272 2273 /* Pass new key value up to our parent. */ 2274 if (xfs_btree_needs_key_update(cur, ptr)) { 2275 error = xfs_btree_update_keys(cur, 0); 2276 if (error) 2277 goto error0; 2278 } 2279 2280 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2281 return 0; 2282 2283 error0: 2284 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2285 return error; 2286 } 2287 2288 /* 2289 * Move 1 record left from cur/level if possible. 2290 * Update cur to reflect the new path. 2291 */ 2292 STATIC int /* error */ 2293 xfs_btree_lshift( 2294 struct xfs_btree_cur *cur, 2295 int level, 2296 int *stat) /* success/failure */ 2297 { 2298 struct xfs_buf *lbp; /* left buffer pointer */ 2299 struct xfs_btree_block *left; /* left btree block */ 2300 int lrecs; /* left record count */ 2301 struct xfs_buf *rbp; /* right buffer pointer */ 2302 struct xfs_btree_block *right; /* right btree block */ 2303 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2304 int rrecs; /* right record count */ 2305 union xfs_btree_ptr lptr; /* left btree pointer */ 2306 union xfs_btree_key *rkp = NULL; /* right btree key */ 2307 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2308 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2309 int error; /* error return value */ 2310 int i; 2311 2312 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2313 XFS_BTREE_TRACE_ARGI(cur, level); 2314 2315 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2316 level == cur->bc_nlevels - 1) 2317 goto out0; 2318 2319 /* Set up variables for this block as "right". */ 2320 right = xfs_btree_get_block(cur, level, &rbp); 2321 2322 #ifdef DEBUG 2323 error = xfs_btree_check_block(cur, right, level, rbp); 2324 if (error) 2325 goto error0; 2326 #endif 2327 2328 /* If we've got no left sibling then we can't shift an entry left. */ 2329 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2330 if (xfs_btree_ptr_is_null(cur, &lptr)) 2331 goto out0; 2332 2333 /* 2334 * If the cursor entry is the one that would be moved, don't 2335 * do it... it's too complicated. 2336 */ 2337 if (cur->bc_ptrs[level] <= 1) 2338 goto out0; 2339 2340 /* Set up the left neighbor as "left". */ 2341 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2342 if (error) 2343 goto error0; 2344 2345 /* If it's full, it can't take another entry. */ 2346 lrecs = xfs_btree_get_numrecs(left); 2347 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2348 goto out0; 2349 2350 rrecs = xfs_btree_get_numrecs(right); 2351 2352 /* 2353 * We add one entry to the left side and remove one for the right side. 2354 * Account for it here, the changes will be updated on disk and logged 2355 * later. 2356 */ 2357 lrecs++; 2358 rrecs--; 2359 2360 XFS_BTREE_STATS_INC(cur, lshift); 2361 XFS_BTREE_STATS_ADD(cur, moves, 1); 2362 2363 /* 2364 * If non-leaf, copy a key and a ptr to the left block. 2365 * Log the changes to the left block. 2366 */ 2367 if (level > 0) { 2368 /* It's a non-leaf. Move keys and pointers. */ 2369 union xfs_btree_key *lkp; /* left btree key */ 2370 union xfs_btree_ptr *lpp; /* left address pointer */ 2371 2372 lkp = xfs_btree_key_addr(cur, lrecs, left); 2373 rkp = xfs_btree_key_addr(cur, 1, right); 2374 2375 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2376 rpp = xfs_btree_ptr_addr(cur, 1, right); 2377 #ifdef DEBUG 2378 error = xfs_btree_check_ptr(cur, rpp, 0, level); 2379 if (error) 2380 goto error0; 2381 #endif 2382 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2383 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2384 2385 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2386 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2387 2388 ASSERT(cur->bc_ops->keys_inorder(cur, 2389 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2390 } else { 2391 /* It's a leaf. Move records. */ 2392 union xfs_btree_rec *lrp; /* left record pointer */ 2393 2394 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2395 rrp = xfs_btree_rec_addr(cur, 1, right); 2396 2397 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2398 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2399 2400 ASSERT(cur->bc_ops->recs_inorder(cur, 2401 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2402 } 2403 2404 xfs_btree_set_numrecs(left, lrecs); 2405 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2406 2407 xfs_btree_set_numrecs(right, rrecs); 2408 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2409 2410 /* 2411 * Slide the contents of right down one entry. 2412 */ 2413 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2414 if (level > 0) { 2415 /* It's a nonleaf. operate on keys and ptrs */ 2416 #ifdef DEBUG 2417 int i; /* loop index */ 2418 2419 for (i = 0; i < rrecs; i++) { 2420 error = xfs_btree_check_ptr(cur, rpp, i + 1, level); 2421 if (error) 2422 goto error0; 2423 } 2424 #endif 2425 xfs_btree_shift_keys(cur, 2426 xfs_btree_key_addr(cur, 2, right), 2427 -1, rrecs); 2428 xfs_btree_shift_ptrs(cur, 2429 xfs_btree_ptr_addr(cur, 2, right), 2430 -1, rrecs); 2431 2432 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2433 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2434 } else { 2435 /* It's a leaf. operate on records */ 2436 xfs_btree_shift_recs(cur, 2437 xfs_btree_rec_addr(cur, 2, right), 2438 -1, rrecs); 2439 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2440 } 2441 2442 /* 2443 * Using a temporary cursor, update the parent key values of the 2444 * block on the left. 2445 */ 2446 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2447 error = xfs_btree_dup_cursor(cur, &tcur); 2448 if (error) 2449 goto error0; 2450 i = xfs_btree_firstrec(tcur, level); 2451 XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); 2452 2453 error = xfs_btree_decrement(tcur, level, &i); 2454 if (error) 2455 goto error1; 2456 2457 /* Update the parent high keys of the left block, if needed. */ 2458 error = xfs_btree_update_keys(tcur, level); 2459 if (error) 2460 goto error1; 2461 2462 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2463 } 2464 2465 /* Update the parent keys of the right block. */ 2466 error = xfs_btree_update_keys(cur, level); 2467 if (error) 2468 goto error0; 2469 2470 /* Slide the cursor value left one. */ 2471 cur->bc_ptrs[level]--; 2472 2473 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2474 *stat = 1; 2475 return 0; 2476 2477 out0: 2478 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2479 *stat = 0; 2480 return 0; 2481 2482 error0: 2483 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2484 return error; 2485 2486 error1: 2487 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); 2488 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2489 return error; 2490 } 2491 2492 /* 2493 * Move 1 record right from cur/level if possible. 2494 * Update cur to reflect the new path. 2495 */ 2496 STATIC int /* error */ 2497 xfs_btree_rshift( 2498 struct xfs_btree_cur *cur, 2499 int level, 2500 int *stat) /* success/failure */ 2501 { 2502 struct xfs_buf *lbp; /* left buffer pointer */ 2503 struct xfs_btree_block *left; /* left btree block */ 2504 struct xfs_buf *rbp; /* right buffer pointer */ 2505 struct xfs_btree_block *right; /* right btree block */ 2506 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2507 union xfs_btree_ptr rptr; /* right block pointer */ 2508 union xfs_btree_key *rkp; /* right btree key */ 2509 int rrecs; /* right record count */ 2510 int lrecs; /* left record count */ 2511 int error; /* error return value */ 2512 int i; /* loop counter */ 2513 2514 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2515 XFS_BTREE_TRACE_ARGI(cur, level); 2516 2517 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2518 (level == cur->bc_nlevels - 1)) 2519 goto out0; 2520 2521 /* Set up variables for this block as "left". */ 2522 left = xfs_btree_get_block(cur, level, &lbp); 2523 2524 #ifdef DEBUG 2525 error = xfs_btree_check_block(cur, left, level, lbp); 2526 if (error) 2527 goto error0; 2528 #endif 2529 2530 /* If we've got no right sibling then we can't shift an entry right. */ 2531 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2532 if (xfs_btree_ptr_is_null(cur, &rptr)) 2533 goto out0; 2534 2535 /* 2536 * If the cursor entry is the one that would be moved, don't 2537 * do it... it's too complicated. 2538 */ 2539 lrecs = xfs_btree_get_numrecs(left); 2540 if (cur->bc_ptrs[level] >= lrecs) 2541 goto out0; 2542 2543 /* Set up the right neighbor as "right". */ 2544 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2545 if (error) 2546 goto error0; 2547 2548 /* If it's full, it can't take another entry. */ 2549 rrecs = xfs_btree_get_numrecs(right); 2550 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2551 goto out0; 2552 2553 XFS_BTREE_STATS_INC(cur, rshift); 2554 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2555 2556 /* 2557 * Make a hole at the start of the right neighbor block, then 2558 * copy the last left block entry to the hole. 2559 */ 2560 if (level > 0) { 2561 /* It's a nonleaf. make a hole in the keys and ptrs */ 2562 union xfs_btree_key *lkp; 2563 union xfs_btree_ptr *lpp; 2564 union xfs_btree_ptr *rpp; 2565 2566 lkp = xfs_btree_key_addr(cur, lrecs, left); 2567 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2568 rkp = xfs_btree_key_addr(cur, 1, right); 2569 rpp = xfs_btree_ptr_addr(cur, 1, right); 2570 2571 #ifdef DEBUG 2572 for (i = rrecs - 1; i >= 0; i--) { 2573 error = xfs_btree_check_ptr(cur, rpp, i, level); 2574 if (error) 2575 goto error0; 2576 } 2577 #endif 2578 2579 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2580 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2581 2582 #ifdef DEBUG 2583 error = xfs_btree_check_ptr(cur, lpp, 0, level); 2584 if (error) 2585 goto error0; 2586 #endif 2587 2588 /* Now put the new data in, and log it. */ 2589 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2590 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2591 2592 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2593 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2594 2595 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2596 xfs_btree_key_addr(cur, 2, right))); 2597 } else { 2598 /* It's a leaf. make a hole in the records */ 2599 union xfs_btree_rec *lrp; 2600 union xfs_btree_rec *rrp; 2601 2602 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2603 rrp = xfs_btree_rec_addr(cur, 1, right); 2604 2605 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2606 2607 /* Now put the new data in, and log it. */ 2608 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2609 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2610 } 2611 2612 /* 2613 * Decrement and log left's numrecs, bump and log right's numrecs. 2614 */ 2615 xfs_btree_set_numrecs(left, --lrecs); 2616 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2617 2618 xfs_btree_set_numrecs(right, ++rrecs); 2619 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2620 2621 /* 2622 * Using a temporary cursor, update the parent key values of the 2623 * block on the right. 2624 */ 2625 error = xfs_btree_dup_cursor(cur, &tcur); 2626 if (error) 2627 goto error0; 2628 i = xfs_btree_lastrec(tcur, level); 2629 XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); 2630 2631 error = xfs_btree_increment(tcur, level, &i); 2632 if (error) 2633 goto error1; 2634 2635 /* Update the parent high keys of the left block, if needed. */ 2636 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2637 error = xfs_btree_update_keys(cur, level); 2638 if (error) 2639 goto error1; 2640 } 2641 2642 /* Update the parent keys of the right block. */ 2643 error = xfs_btree_update_keys(tcur, level); 2644 if (error) 2645 goto error1; 2646 2647 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2648 2649 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2650 *stat = 1; 2651 return 0; 2652 2653 out0: 2654 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2655 *stat = 0; 2656 return 0; 2657 2658 error0: 2659 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2660 return error; 2661 2662 error1: 2663 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); 2664 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2665 return error; 2666 } 2667 2668 /* 2669 * Split cur/level block in half. 2670 * Return new block number and the key to its first 2671 * record (to be inserted into parent). 2672 */ 2673 STATIC int /* error */ 2674 __xfs_btree_split( 2675 struct xfs_btree_cur *cur, 2676 int level, 2677 union xfs_btree_ptr *ptrp, 2678 union xfs_btree_key *key, 2679 struct xfs_btree_cur **curp, 2680 int *stat) /* success/failure */ 2681 { 2682 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2683 struct xfs_buf *lbp; /* left buffer pointer */ 2684 struct xfs_btree_block *left; /* left btree block */ 2685 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2686 struct xfs_buf *rbp; /* right buffer pointer */ 2687 struct xfs_btree_block *right; /* right btree block */ 2688 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2689 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2690 struct xfs_btree_block *rrblock; /* right-right btree block */ 2691 int lrecs; 2692 int rrecs; 2693 int src_index; 2694 int error; /* error return value */ 2695 #ifdef DEBUG 2696 int i; 2697 #endif 2698 2699 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2700 XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); 2701 2702 XFS_BTREE_STATS_INC(cur, split); 2703 2704 /* Set up left block (current one). */ 2705 left = xfs_btree_get_block(cur, level, &lbp); 2706 2707 #ifdef DEBUG 2708 error = xfs_btree_check_block(cur, left, level, lbp); 2709 if (error) 2710 goto error0; 2711 #endif 2712 2713 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2714 2715 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2716 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); 2717 if (error) 2718 goto error0; 2719 if (*stat == 0) 2720 goto out0; 2721 XFS_BTREE_STATS_INC(cur, alloc); 2722 2723 /* Set up the new block as "right". */ 2724 error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); 2725 if (error) 2726 goto error0; 2727 2728 /* Fill in the btree header for the new right block. */ 2729 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2730 2731 /* 2732 * Split the entries between the old and the new block evenly. 2733 * Make sure that if there's an odd number of entries now, that 2734 * each new block will have the same number of entries. 2735 */ 2736 lrecs = xfs_btree_get_numrecs(left); 2737 rrecs = lrecs / 2; 2738 if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) 2739 rrecs++; 2740 src_index = (lrecs - rrecs + 1); 2741 2742 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2743 2744 /* Adjust numrecs for the later get_*_keys() calls. */ 2745 lrecs -= rrecs; 2746 xfs_btree_set_numrecs(left, lrecs); 2747 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2748 2749 /* 2750 * Copy btree block entries from the left block over to the 2751 * new block, the right. Update the right block and log the 2752 * changes. 2753 */ 2754 if (level > 0) { 2755 /* It's a non-leaf. Move keys and pointers. */ 2756 union xfs_btree_key *lkp; /* left btree key */ 2757 union xfs_btree_ptr *lpp; /* left address pointer */ 2758 union xfs_btree_key *rkp; /* right btree key */ 2759 union xfs_btree_ptr *rpp; /* right address pointer */ 2760 2761 lkp = xfs_btree_key_addr(cur, src_index, left); 2762 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2763 rkp = xfs_btree_key_addr(cur, 1, right); 2764 rpp = xfs_btree_ptr_addr(cur, 1, right); 2765 2766 #ifdef DEBUG 2767 for (i = src_index; i < rrecs; i++) { 2768 error = xfs_btree_check_ptr(cur, lpp, i, level); 2769 if (error) 2770 goto error0; 2771 } 2772 #endif 2773 2774 /* Copy the keys & pointers to the new block. */ 2775 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2776 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2777 2778 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2779 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2780 2781 /* Stash the keys of the new block for later insertion. */ 2782 xfs_btree_get_node_keys(cur, right, key); 2783 } else { 2784 /* It's a leaf. Move records. */ 2785 union xfs_btree_rec *lrp; /* left record pointer */ 2786 union xfs_btree_rec *rrp; /* right record pointer */ 2787 2788 lrp = xfs_btree_rec_addr(cur, src_index, left); 2789 rrp = xfs_btree_rec_addr(cur, 1, right); 2790 2791 /* Copy records to the new block. */ 2792 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2793 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2794 2795 /* Stash the keys of the new block for later insertion. */ 2796 xfs_btree_get_leaf_keys(cur, right, key); 2797 } 2798 2799 /* 2800 * Find the left block number by looking in the buffer. 2801 * Adjust sibling pointers. 2802 */ 2803 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2804 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2805 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2806 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2807 2808 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2809 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2810 2811 /* 2812 * If there's a block to the new block's right, make that block 2813 * point back to right instead of to left. 2814 */ 2815 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2816 error = xfs_btree_read_buf_block(cur, &rrptr, 2817 0, &rrblock, &rrbp); 2818 if (error) 2819 goto error0; 2820 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2821 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2822 } 2823 2824 /* Update the parent high keys of the left block, if needed. */ 2825 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2826 error = xfs_btree_update_keys(cur, level); 2827 if (error) 2828 goto error0; 2829 } 2830 2831 /* 2832 * If the cursor is really in the right block, move it there. 2833 * If it's just pointing past the last entry in left, then we'll 2834 * insert there, so don't change anything in that case. 2835 */ 2836 if (cur->bc_ptrs[level] > lrecs + 1) { 2837 xfs_btree_setbuf(cur, level, rbp); 2838 cur->bc_ptrs[level] -= lrecs; 2839 } 2840 /* 2841 * If there are more levels, we'll need another cursor which refers 2842 * the right block, no matter where this cursor was. 2843 */ 2844 if (level + 1 < cur->bc_nlevels) { 2845 error = xfs_btree_dup_cursor(cur, curp); 2846 if (error) 2847 goto error0; 2848 (*curp)->bc_ptrs[level + 1]++; 2849 } 2850 *ptrp = rptr; 2851 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2852 *stat = 1; 2853 return 0; 2854 out0: 2855 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2856 *stat = 0; 2857 return 0; 2858 2859 error0: 2860 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 2861 return error; 2862 } 2863 2864 struct xfs_btree_split_args { 2865 struct xfs_btree_cur *cur; 2866 int level; 2867 union xfs_btree_ptr *ptrp; 2868 union xfs_btree_key *key; 2869 struct xfs_btree_cur **curp; 2870 int *stat; /* success/failure */ 2871 int result; 2872 bool kswapd; /* allocation in kswapd context */ 2873 struct completion *done; 2874 struct work_struct work; 2875 }; 2876 2877 /* 2878 * Stack switching interfaces for allocation 2879 */ 2880 static void 2881 xfs_btree_split_worker( 2882 struct work_struct *work) 2883 { 2884 struct xfs_btree_split_args *args = container_of(work, 2885 struct xfs_btree_split_args, work); 2886 unsigned long pflags; 2887 unsigned long new_pflags = PF_MEMALLOC_NOFS; 2888 2889 /* 2890 * we are in a transaction context here, but may also be doing work 2891 * in kswapd context, and hence we may need to inherit that state 2892 * temporarily to ensure that we don't block waiting for memory reclaim 2893 * in any way. 2894 */ 2895 if (args->kswapd) 2896 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2897 2898 current_set_flags_nested(&pflags, new_pflags); 2899 2900 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 2901 args->key, args->curp, args->stat); 2902 complete(args->done); 2903 2904 current_restore_flags_nested(&pflags, new_pflags); 2905 } 2906 2907 /* 2908 * BMBT split requests often come in with little stack to work on. Push 2909 * them off to a worker thread so there is lots of stack to use. For the other 2910 * btree types, just call directly to avoid the context switch overhead here. 2911 */ 2912 STATIC int /* error */ 2913 xfs_btree_split( 2914 struct xfs_btree_cur *cur, 2915 int level, 2916 union xfs_btree_ptr *ptrp, 2917 union xfs_btree_key *key, 2918 struct xfs_btree_cur **curp, 2919 int *stat) /* success/failure */ 2920 { 2921 struct xfs_btree_split_args args; 2922 DECLARE_COMPLETION_ONSTACK(done); 2923 2924 if (cur->bc_btnum != XFS_BTNUM_BMAP) 2925 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 2926 2927 args.cur = cur; 2928 args.level = level; 2929 args.ptrp = ptrp; 2930 args.key = key; 2931 args.curp = curp; 2932 args.stat = stat; 2933 args.done = &done; 2934 args.kswapd = current_is_kswapd(); 2935 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 2936 queue_work(xfs_alloc_wq, &args.work); 2937 wait_for_completion(&done); 2938 destroy_work_on_stack(&args.work); 2939 return args.result; 2940 } 2941 2942 2943 /* 2944 * Copy the old inode root contents into a real block and make the 2945 * broot point to it. 2946 */ 2947 int /* error */ 2948 xfs_btree_new_iroot( 2949 struct xfs_btree_cur *cur, /* btree cursor */ 2950 int *logflags, /* logging flags for inode */ 2951 int *stat) /* return status - 0 fail */ 2952 { 2953 struct xfs_buf *cbp; /* buffer for cblock */ 2954 struct xfs_btree_block *block; /* btree block */ 2955 struct xfs_btree_block *cblock; /* child btree block */ 2956 union xfs_btree_key *ckp; /* child key pointer */ 2957 union xfs_btree_ptr *cpp; /* child ptr pointer */ 2958 union xfs_btree_key *kp; /* pointer to btree key */ 2959 union xfs_btree_ptr *pp; /* pointer to block addr */ 2960 union xfs_btree_ptr nptr; /* new block addr */ 2961 int level; /* btree level */ 2962 int error; /* error return code */ 2963 #ifdef DEBUG 2964 int i; /* loop counter */ 2965 #endif 2966 2967 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 2968 XFS_BTREE_STATS_INC(cur, newroot); 2969 2970 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 2971 2972 level = cur->bc_nlevels - 1; 2973 2974 block = xfs_btree_get_iroot(cur); 2975 pp = xfs_btree_ptr_addr(cur, 1, block); 2976 2977 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2978 error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); 2979 if (error) 2980 goto error0; 2981 if (*stat == 0) { 2982 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2983 return 0; 2984 } 2985 XFS_BTREE_STATS_INC(cur, alloc); 2986 2987 /* Copy the root into a real block. */ 2988 error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); 2989 if (error) 2990 goto error0; 2991 2992 /* 2993 * we can't just memcpy() the root in for CRC enabled btree blocks. 2994 * In that case have to also ensure the blkno remains correct 2995 */ 2996 memcpy(cblock, block, xfs_btree_block_len(cur)); 2997 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 2998 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 2999 cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); 3000 else 3001 cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); 3002 } 3003 3004 be16_add_cpu(&block->bb_level, 1); 3005 xfs_btree_set_numrecs(block, 1); 3006 cur->bc_nlevels++; 3007 cur->bc_ptrs[level + 1] = 1; 3008 3009 kp = xfs_btree_key_addr(cur, 1, block); 3010 ckp = xfs_btree_key_addr(cur, 1, cblock); 3011 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); 3012 3013 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3014 #ifdef DEBUG 3015 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 3016 error = xfs_btree_check_ptr(cur, pp, i, level); 3017 if (error) 3018 goto error0; 3019 } 3020 #endif 3021 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); 3022 3023 #ifdef DEBUG 3024 error = xfs_btree_check_ptr(cur, &nptr, 0, level); 3025 if (error) 3026 goto error0; 3027 #endif 3028 xfs_btree_copy_ptrs(cur, pp, &nptr, 1); 3029 3030 xfs_iroot_realloc(cur->bc_private.b.ip, 3031 1 - xfs_btree_get_numrecs(cblock), 3032 cur->bc_private.b.whichfork); 3033 3034 xfs_btree_setbuf(cur, level, cbp); 3035 3036 /* 3037 * Do all this logging at the end so that 3038 * the root is at the right level. 3039 */ 3040 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3041 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3042 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3043 3044 *logflags |= 3045 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); 3046 *stat = 1; 3047 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3048 return 0; 3049 error0: 3050 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3051 return error; 3052 } 3053 3054 /* 3055 * Allocate a new root block, fill it in. 3056 */ 3057 STATIC int /* error */ 3058 xfs_btree_new_root( 3059 struct xfs_btree_cur *cur, /* btree cursor */ 3060 int *stat) /* success/failure */ 3061 { 3062 struct xfs_btree_block *block; /* one half of the old root block */ 3063 struct xfs_buf *bp; /* buffer containing block */ 3064 int error; /* error return value */ 3065 struct xfs_buf *lbp; /* left buffer pointer */ 3066 struct xfs_btree_block *left; /* left btree block */ 3067 struct xfs_buf *nbp; /* new (root) buffer */ 3068 struct xfs_btree_block *new; /* new (root) btree block */ 3069 int nptr; /* new value for key index, 1 or 2 */ 3070 struct xfs_buf *rbp; /* right buffer pointer */ 3071 struct xfs_btree_block *right; /* right btree block */ 3072 union xfs_btree_ptr rptr; 3073 union xfs_btree_ptr lptr; 3074 3075 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 3076 XFS_BTREE_STATS_INC(cur, newroot); 3077 3078 /* initialise our start point from the cursor */ 3079 cur->bc_ops->init_ptr_from_cur(cur, &rptr); 3080 3081 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3082 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); 3083 if (error) 3084 goto error0; 3085 if (*stat == 0) 3086 goto out0; 3087 XFS_BTREE_STATS_INC(cur, alloc); 3088 3089 /* Set up the new block. */ 3090 error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); 3091 if (error) 3092 goto error0; 3093 3094 /* Set the root in the holding structure increasing the level by 1. */ 3095 cur->bc_ops->set_root(cur, &lptr, 1); 3096 3097 /* 3098 * At the previous root level there are now two blocks: the old root, 3099 * and the new block generated when it was split. We don't know which 3100 * one the cursor is pointing at, so we set up variables "left" and 3101 * "right" for each case. 3102 */ 3103 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3104 3105 #ifdef DEBUG 3106 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3107 if (error) 3108 goto error0; 3109 #endif 3110 3111 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3112 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3113 /* Our block is left, pick up the right block. */ 3114 lbp = bp; 3115 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3116 left = block; 3117 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3118 if (error) 3119 goto error0; 3120 bp = rbp; 3121 nptr = 1; 3122 } else { 3123 /* Our block is right, pick up the left block. */ 3124 rbp = bp; 3125 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3126 right = block; 3127 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3128 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3129 if (error) 3130 goto error0; 3131 bp = lbp; 3132 nptr = 2; 3133 } 3134 3135 /* Fill in the new block's btree header and log it. */ 3136 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3137 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3138 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3139 !xfs_btree_ptr_is_null(cur, &rptr)); 3140 3141 /* Fill in the key data in the new root. */ 3142 if (xfs_btree_get_level(left) > 0) { 3143 /* 3144 * Get the keys for the left block's keys and put them directly 3145 * in the parent block. Do the same for the right block. 3146 */ 3147 xfs_btree_get_node_keys(cur, left, 3148 xfs_btree_key_addr(cur, 1, new)); 3149 xfs_btree_get_node_keys(cur, right, 3150 xfs_btree_key_addr(cur, 2, new)); 3151 } else { 3152 /* 3153 * Get the keys for the left block's records and put them 3154 * directly in the parent block. Do the same for the right 3155 * block. 3156 */ 3157 xfs_btree_get_leaf_keys(cur, left, 3158 xfs_btree_key_addr(cur, 1, new)); 3159 xfs_btree_get_leaf_keys(cur, right, 3160 xfs_btree_key_addr(cur, 2, new)); 3161 } 3162 xfs_btree_log_keys(cur, nbp, 1, 2); 3163 3164 /* Fill in the pointer data in the new root. */ 3165 xfs_btree_copy_ptrs(cur, 3166 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3167 xfs_btree_copy_ptrs(cur, 3168 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3169 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3170 3171 /* Fix up the cursor. */ 3172 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3173 cur->bc_ptrs[cur->bc_nlevels] = nptr; 3174 cur->bc_nlevels++; 3175 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3176 *stat = 1; 3177 return 0; 3178 error0: 3179 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3180 return error; 3181 out0: 3182 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3183 *stat = 0; 3184 return 0; 3185 } 3186 3187 STATIC int 3188 xfs_btree_make_block_unfull( 3189 struct xfs_btree_cur *cur, /* btree cursor */ 3190 int level, /* btree level */ 3191 int numrecs,/* # of recs in block */ 3192 int *oindex,/* old tree index */ 3193 int *index, /* new tree index */ 3194 union xfs_btree_ptr *nptr, /* new btree ptr */ 3195 struct xfs_btree_cur **ncur, /* new btree cursor */ 3196 union xfs_btree_key *key, /* key of new block */ 3197 int *stat) 3198 { 3199 int error = 0; 3200 3201 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3202 level == cur->bc_nlevels - 1) { 3203 struct xfs_inode *ip = cur->bc_private.b.ip; 3204 3205 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3206 /* A root block that can be made bigger. */ 3207 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); 3208 *stat = 1; 3209 } else { 3210 /* A root block that needs replacing */ 3211 int logflags = 0; 3212 3213 error = xfs_btree_new_iroot(cur, &logflags, stat); 3214 if (error || *stat == 0) 3215 return error; 3216 3217 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3218 } 3219 3220 return 0; 3221 } 3222 3223 /* First, try shifting an entry to the right neighbor. */ 3224 error = xfs_btree_rshift(cur, level, stat); 3225 if (error || *stat) 3226 return error; 3227 3228 /* Next, try shifting an entry to the left neighbor. */ 3229 error = xfs_btree_lshift(cur, level, stat); 3230 if (error) 3231 return error; 3232 3233 if (*stat) { 3234 *oindex = *index = cur->bc_ptrs[level]; 3235 return 0; 3236 } 3237 3238 /* 3239 * Next, try splitting the current block in half. 3240 * 3241 * If this works we have to re-set our variables because we 3242 * could be in a different block now. 3243 */ 3244 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3245 if (error || *stat == 0) 3246 return error; 3247 3248 3249 *index = cur->bc_ptrs[level]; 3250 return 0; 3251 } 3252 3253 /* 3254 * Insert one record/level. Return information to the caller 3255 * allowing the next level up to proceed if necessary. 3256 */ 3257 STATIC int 3258 xfs_btree_insrec( 3259 struct xfs_btree_cur *cur, /* btree cursor */ 3260 int level, /* level to insert record at */ 3261 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3262 union xfs_btree_rec *rec, /* record to insert */ 3263 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3264 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3265 int *stat) /* success/failure */ 3266 { 3267 struct xfs_btree_block *block; /* btree block */ 3268 struct xfs_buf *bp; /* buffer for block */ 3269 union xfs_btree_ptr nptr; /* new block ptr */ 3270 struct xfs_btree_cur *ncur; /* new btree cursor */ 3271 union xfs_btree_key nkey; /* new block key */ 3272 union xfs_btree_key *lkey; 3273 int optr; /* old key/record index */ 3274 int ptr; /* key/record index */ 3275 int numrecs;/* number of records */ 3276 int error; /* error return value */ 3277 #ifdef DEBUG 3278 int i; 3279 #endif 3280 xfs_daddr_t old_bn; 3281 3282 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 3283 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); 3284 3285 ncur = NULL; 3286 lkey = &nkey; 3287 3288 /* 3289 * If we have an external root pointer, and we've made it to the 3290 * root level, allocate a new root block and we're done. 3291 */ 3292 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3293 (level >= cur->bc_nlevels)) { 3294 error = xfs_btree_new_root(cur, stat); 3295 xfs_btree_set_ptr_null(cur, ptrp); 3296 3297 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3298 return error; 3299 } 3300 3301 /* If we're off the left edge, return failure. */ 3302 ptr = cur->bc_ptrs[level]; 3303 if (ptr == 0) { 3304 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3305 *stat = 0; 3306 return 0; 3307 } 3308 3309 optr = ptr; 3310 3311 XFS_BTREE_STATS_INC(cur, insrec); 3312 3313 /* Get pointers to the btree buffer and block. */ 3314 block = xfs_btree_get_block(cur, level, &bp); 3315 old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL; 3316 numrecs = xfs_btree_get_numrecs(block); 3317 3318 #ifdef DEBUG 3319 error = xfs_btree_check_block(cur, block, level, bp); 3320 if (error) 3321 goto error0; 3322 3323 /* Check that the new entry is being inserted in the right place. */ 3324 if (ptr <= numrecs) { 3325 if (level == 0) { 3326 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3327 xfs_btree_rec_addr(cur, ptr, block))); 3328 } else { 3329 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3330 xfs_btree_key_addr(cur, ptr, block))); 3331 } 3332 } 3333 #endif 3334 3335 /* 3336 * If the block is full, we can't insert the new entry until we 3337 * make the block un-full. 3338 */ 3339 xfs_btree_set_ptr_null(cur, &nptr); 3340 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3341 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3342 &optr, &ptr, &nptr, &ncur, lkey, stat); 3343 if (error || *stat == 0) 3344 goto error0; 3345 } 3346 3347 /* 3348 * The current block may have changed if the block was 3349 * previously full and we have just made space in it. 3350 */ 3351 block = xfs_btree_get_block(cur, level, &bp); 3352 numrecs = xfs_btree_get_numrecs(block); 3353 3354 #ifdef DEBUG 3355 error = xfs_btree_check_block(cur, block, level, bp); 3356 if (error) 3357 return error; 3358 #endif 3359 3360 /* 3361 * At this point we know there's room for our new entry in the block 3362 * we're pointing at. 3363 */ 3364 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3365 3366 if (level > 0) { 3367 /* It's a nonleaf. make a hole in the keys and ptrs */ 3368 union xfs_btree_key *kp; 3369 union xfs_btree_ptr *pp; 3370 3371 kp = xfs_btree_key_addr(cur, ptr, block); 3372 pp = xfs_btree_ptr_addr(cur, ptr, block); 3373 3374 #ifdef DEBUG 3375 for (i = numrecs - ptr; i >= 0; i--) { 3376 error = xfs_btree_check_ptr(cur, pp, i, level); 3377 if (error) 3378 return error; 3379 } 3380 #endif 3381 3382 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3383 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3384 3385 #ifdef DEBUG 3386 error = xfs_btree_check_ptr(cur, ptrp, 0, level); 3387 if (error) 3388 goto error0; 3389 #endif 3390 3391 /* Now put the new data in, bump numrecs and log it. */ 3392 xfs_btree_copy_keys(cur, kp, key, 1); 3393 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3394 numrecs++; 3395 xfs_btree_set_numrecs(block, numrecs); 3396 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3397 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3398 #ifdef DEBUG 3399 if (ptr < numrecs) { 3400 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3401 xfs_btree_key_addr(cur, ptr + 1, block))); 3402 } 3403 #endif 3404 } else { 3405 /* It's a leaf. make a hole in the records */ 3406 union xfs_btree_rec *rp; 3407 3408 rp = xfs_btree_rec_addr(cur, ptr, block); 3409 3410 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3411 3412 /* Now put the new data in, bump numrecs and log it. */ 3413 xfs_btree_copy_recs(cur, rp, rec, 1); 3414 xfs_btree_set_numrecs(block, ++numrecs); 3415 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3416 #ifdef DEBUG 3417 if (ptr < numrecs) { 3418 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3419 xfs_btree_rec_addr(cur, ptr + 1, block))); 3420 } 3421 #endif 3422 } 3423 3424 /* Log the new number of records in the btree header. */ 3425 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3426 3427 /* 3428 * If we just inserted into a new tree block, we have to 3429 * recalculate nkey here because nkey is out of date. 3430 * 3431 * Otherwise we're just updating an existing block (having shoved 3432 * some records into the new tree block), so use the regular key 3433 * update mechanism. 3434 */ 3435 if (bp && bp->b_bn != old_bn) { 3436 xfs_btree_get_keys(cur, block, lkey); 3437 } else if (xfs_btree_needs_key_update(cur, optr)) { 3438 error = xfs_btree_update_keys(cur, level); 3439 if (error) 3440 goto error0; 3441 } 3442 3443 /* 3444 * If we are tracking the last record in the tree and 3445 * we are at the far right edge of the tree, update it. 3446 */ 3447 if (xfs_btree_is_lastrec(cur, block, level)) { 3448 cur->bc_ops->update_lastrec(cur, block, rec, 3449 ptr, LASTREC_INSREC); 3450 } 3451 3452 /* 3453 * Return the new block number, if any. 3454 * If there is one, give back a record value and a cursor too. 3455 */ 3456 *ptrp = nptr; 3457 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3458 xfs_btree_copy_keys(cur, key, lkey, 1); 3459 *curp = ncur; 3460 } 3461 3462 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3463 *stat = 1; 3464 return 0; 3465 3466 error0: 3467 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3468 return error; 3469 } 3470 3471 /* 3472 * Insert the record at the point referenced by cur. 3473 * 3474 * A multi-level split of the tree on insert will invalidate the original 3475 * cursor. All callers of this function should assume that the cursor is 3476 * no longer valid and revalidate it. 3477 */ 3478 int 3479 xfs_btree_insert( 3480 struct xfs_btree_cur *cur, 3481 int *stat) 3482 { 3483 int error; /* error return value */ 3484 int i; /* result value, 0 for failure */ 3485 int level; /* current level number in btree */ 3486 union xfs_btree_ptr nptr; /* new block number (split result) */ 3487 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3488 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3489 union xfs_btree_key bkey; /* key of block to insert */ 3490 union xfs_btree_key *key; 3491 union xfs_btree_rec rec; /* record to insert */ 3492 3493 level = 0; 3494 ncur = NULL; 3495 pcur = cur; 3496 key = &bkey; 3497 3498 xfs_btree_set_ptr_null(cur, &nptr); 3499 3500 /* Make a key out of the record data to be inserted, and save it. */ 3501 cur->bc_ops->init_rec_from_cur(cur, &rec); 3502 cur->bc_ops->init_key_from_rec(key, &rec); 3503 3504 /* 3505 * Loop going up the tree, starting at the leaf level. 3506 * Stop when we don't get a split block, that must mean that 3507 * the insert is finished with this level. 3508 */ 3509 do { 3510 /* 3511 * Insert nrec/nptr into this level of the tree. 3512 * Note if we fail, nptr will be null. 3513 */ 3514 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3515 &ncur, &i); 3516 if (error) { 3517 if (pcur != cur) 3518 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3519 goto error0; 3520 } 3521 3522 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 3523 level++; 3524 3525 /* 3526 * See if the cursor we just used is trash. 3527 * Can't trash the caller's cursor, but otherwise we should 3528 * if ncur is a new cursor or we're about to be done. 3529 */ 3530 if (pcur != cur && 3531 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3532 /* Save the state from the cursor before we trash it */ 3533 if (cur->bc_ops->update_cursor) 3534 cur->bc_ops->update_cursor(pcur, cur); 3535 cur->bc_nlevels = pcur->bc_nlevels; 3536 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3537 } 3538 /* If we got a new cursor, switch to it. */ 3539 if (ncur) { 3540 pcur = ncur; 3541 ncur = NULL; 3542 } 3543 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3544 3545 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3546 *stat = i; 3547 return 0; 3548 error0: 3549 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3550 return error; 3551 } 3552 3553 /* 3554 * Try to merge a non-leaf block back into the inode root. 3555 * 3556 * Note: the killroot names comes from the fact that we're effectively 3557 * killing the old root block. But because we can't just delete the 3558 * inode we have to copy the single block it was pointing to into the 3559 * inode. 3560 */ 3561 STATIC int 3562 xfs_btree_kill_iroot( 3563 struct xfs_btree_cur *cur) 3564 { 3565 int whichfork = cur->bc_private.b.whichfork; 3566 struct xfs_inode *ip = cur->bc_private.b.ip; 3567 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); 3568 struct xfs_btree_block *block; 3569 struct xfs_btree_block *cblock; 3570 union xfs_btree_key *kp; 3571 union xfs_btree_key *ckp; 3572 union xfs_btree_ptr *pp; 3573 union xfs_btree_ptr *cpp; 3574 struct xfs_buf *cbp; 3575 int level; 3576 int index; 3577 int numrecs; 3578 int error; 3579 #ifdef DEBUG 3580 union xfs_btree_ptr ptr; 3581 int i; 3582 #endif 3583 3584 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 3585 3586 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 3587 ASSERT(cur->bc_nlevels > 1); 3588 3589 /* 3590 * Don't deal with the root block needs to be a leaf case. 3591 * We're just going to turn the thing back into extents anyway. 3592 */ 3593 level = cur->bc_nlevels - 1; 3594 if (level == 1) 3595 goto out0; 3596 3597 /* 3598 * Give up if the root has multiple children. 3599 */ 3600 block = xfs_btree_get_iroot(cur); 3601 if (xfs_btree_get_numrecs(block) != 1) 3602 goto out0; 3603 3604 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3605 numrecs = xfs_btree_get_numrecs(cblock); 3606 3607 /* 3608 * Only do this if the next level will fit. 3609 * Then the data must be copied up to the inode, 3610 * instead of freeing the root you free the next level. 3611 */ 3612 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) 3613 goto out0; 3614 3615 XFS_BTREE_STATS_INC(cur, killroot); 3616 3617 #ifdef DEBUG 3618 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 3619 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3620 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 3621 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3622 #endif 3623 3624 index = numrecs - cur->bc_ops->get_maxrecs(cur, level); 3625 if (index) { 3626 xfs_iroot_realloc(cur->bc_private.b.ip, index, 3627 cur->bc_private.b.whichfork); 3628 block = ifp->if_broot; 3629 } 3630 3631 be16_add_cpu(&block->bb_numrecs, index); 3632 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3633 3634 kp = xfs_btree_key_addr(cur, 1, block); 3635 ckp = xfs_btree_key_addr(cur, 1, cblock); 3636 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3637 3638 pp = xfs_btree_ptr_addr(cur, 1, block); 3639 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3640 #ifdef DEBUG 3641 for (i = 0; i < numrecs; i++) { 3642 error = xfs_btree_check_ptr(cur, cpp, i, level - 1); 3643 if (error) { 3644 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3645 return error; 3646 } 3647 } 3648 #endif 3649 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3650 3651 error = xfs_btree_free_block(cur, cbp); 3652 if (error) { 3653 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3654 return error; 3655 } 3656 3657 cur->bc_bufs[level - 1] = NULL; 3658 be16_add_cpu(&block->bb_level, -1); 3659 xfs_trans_log_inode(cur->bc_tp, ip, 3660 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); 3661 cur->bc_nlevels--; 3662 out0: 3663 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3664 return 0; 3665 } 3666 3667 /* 3668 * Kill the current root node, and replace it with it's only child node. 3669 */ 3670 STATIC int 3671 xfs_btree_kill_root( 3672 struct xfs_btree_cur *cur, 3673 struct xfs_buf *bp, 3674 int level, 3675 union xfs_btree_ptr *newroot) 3676 { 3677 int error; 3678 3679 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 3680 XFS_BTREE_STATS_INC(cur, killroot); 3681 3682 /* 3683 * Update the root pointer, decreasing the level by 1 and then 3684 * free the old root. 3685 */ 3686 cur->bc_ops->set_root(cur, newroot, -1); 3687 3688 error = xfs_btree_free_block(cur, bp); 3689 if (error) { 3690 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3691 return error; 3692 } 3693 3694 cur->bc_bufs[level] = NULL; 3695 cur->bc_ra[level] = 0; 3696 cur->bc_nlevels--; 3697 3698 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3699 return 0; 3700 } 3701 3702 STATIC int 3703 xfs_btree_dec_cursor( 3704 struct xfs_btree_cur *cur, 3705 int level, 3706 int *stat) 3707 { 3708 int error; 3709 int i; 3710 3711 if (level > 0) { 3712 error = xfs_btree_decrement(cur, level, &i); 3713 if (error) 3714 return error; 3715 } 3716 3717 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3718 *stat = 1; 3719 return 0; 3720 } 3721 3722 /* 3723 * Single level of the btree record deletion routine. 3724 * Delete record pointed to by cur/level. 3725 * Remove the record from its block then rebalance the tree. 3726 * Return 0 for error, 1 for done, 2 to go on to the next level. 3727 */ 3728 STATIC int /* error */ 3729 xfs_btree_delrec( 3730 struct xfs_btree_cur *cur, /* btree cursor */ 3731 int level, /* level removing record from */ 3732 int *stat) /* fail/done/go-on */ 3733 { 3734 struct xfs_btree_block *block; /* btree block */ 3735 union xfs_btree_ptr cptr; /* current block ptr */ 3736 struct xfs_buf *bp; /* buffer for block */ 3737 int error; /* error return value */ 3738 int i; /* loop counter */ 3739 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3740 struct xfs_buf *lbp; /* left buffer pointer */ 3741 struct xfs_btree_block *left; /* left btree block */ 3742 int lrecs = 0; /* left record count */ 3743 int ptr; /* key/record index */ 3744 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3745 struct xfs_buf *rbp; /* right buffer pointer */ 3746 struct xfs_btree_block *right; /* right btree block */ 3747 struct xfs_btree_block *rrblock; /* right-right btree block */ 3748 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3749 int rrecs = 0; /* right record count */ 3750 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3751 int numrecs; /* temporary numrec count */ 3752 3753 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 3754 XFS_BTREE_TRACE_ARGI(cur, level); 3755 3756 tcur = NULL; 3757 3758 /* Get the index of the entry being deleted, check for nothing there. */ 3759 ptr = cur->bc_ptrs[level]; 3760 if (ptr == 0) { 3761 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3762 *stat = 0; 3763 return 0; 3764 } 3765 3766 /* Get the buffer & block containing the record or key/ptr. */ 3767 block = xfs_btree_get_block(cur, level, &bp); 3768 numrecs = xfs_btree_get_numrecs(block); 3769 3770 #ifdef DEBUG 3771 error = xfs_btree_check_block(cur, block, level, bp); 3772 if (error) 3773 goto error0; 3774 #endif 3775 3776 /* Fail if we're off the end of the block. */ 3777 if (ptr > numrecs) { 3778 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3779 *stat = 0; 3780 return 0; 3781 } 3782 3783 XFS_BTREE_STATS_INC(cur, delrec); 3784 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3785 3786 /* Excise the entries being deleted. */ 3787 if (level > 0) { 3788 /* It's a nonleaf. operate on keys and ptrs */ 3789 union xfs_btree_key *lkp; 3790 union xfs_btree_ptr *lpp; 3791 3792 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3793 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3794 3795 #ifdef DEBUG 3796 for (i = 0; i < numrecs - ptr; i++) { 3797 error = xfs_btree_check_ptr(cur, lpp, i, level); 3798 if (error) 3799 goto error0; 3800 } 3801 #endif 3802 3803 if (ptr < numrecs) { 3804 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3805 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3806 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3807 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3808 } 3809 } else { 3810 /* It's a leaf. operate on records */ 3811 if (ptr < numrecs) { 3812 xfs_btree_shift_recs(cur, 3813 xfs_btree_rec_addr(cur, ptr + 1, block), 3814 -1, numrecs - ptr); 3815 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3816 } 3817 } 3818 3819 /* 3820 * Decrement and log the number of entries in the block. 3821 */ 3822 xfs_btree_set_numrecs(block, --numrecs); 3823 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3824 3825 /* 3826 * If we are tracking the last record in the tree and 3827 * we are at the far right edge of the tree, update it. 3828 */ 3829 if (xfs_btree_is_lastrec(cur, block, level)) { 3830 cur->bc_ops->update_lastrec(cur, block, NULL, 3831 ptr, LASTREC_DELREC); 3832 } 3833 3834 /* 3835 * We're at the root level. First, shrink the root block in-memory. 3836 * Try to get rid of the next level down. If we can't then there's 3837 * nothing left to do. 3838 */ 3839 if (level == cur->bc_nlevels - 1) { 3840 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3841 xfs_iroot_realloc(cur->bc_private.b.ip, -1, 3842 cur->bc_private.b.whichfork); 3843 3844 error = xfs_btree_kill_iroot(cur); 3845 if (error) 3846 goto error0; 3847 3848 error = xfs_btree_dec_cursor(cur, level, stat); 3849 if (error) 3850 goto error0; 3851 *stat = 1; 3852 return 0; 3853 } 3854 3855 /* 3856 * If this is the root level, and there's only one entry left, 3857 * and it's NOT the leaf level, then we can get rid of this 3858 * level. 3859 */ 3860 if (numrecs == 1 && level > 0) { 3861 union xfs_btree_ptr *pp; 3862 /* 3863 * pp is still set to the first pointer in the block. 3864 * Make it the new root of the btree. 3865 */ 3866 pp = xfs_btree_ptr_addr(cur, 1, block); 3867 error = xfs_btree_kill_root(cur, bp, level, pp); 3868 if (error) 3869 goto error0; 3870 } else if (level > 0) { 3871 error = xfs_btree_dec_cursor(cur, level, stat); 3872 if (error) 3873 goto error0; 3874 } 3875 *stat = 1; 3876 return 0; 3877 } 3878 3879 /* 3880 * If we deleted the leftmost entry in the block, update the 3881 * key values above us in the tree. 3882 */ 3883 if (xfs_btree_needs_key_update(cur, ptr)) { 3884 error = xfs_btree_update_keys(cur, level); 3885 if (error) 3886 goto error0; 3887 } 3888 3889 /* 3890 * If the number of records remaining in the block is at least 3891 * the minimum, we're done. 3892 */ 3893 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 3894 error = xfs_btree_dec_cursor(cur, level, stat); 3895 if (error) 3896 goto error0; 3897 return 0; 3898 } 3899 3900 /* 3901 * Otherwise, we have to move some records around to keep the 3902 * tree balanced. Look at the left and right sibling blocks to 3903 * see if we can re-balance by moving only one record. 3904 */ 3905 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3906 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 3907 3908 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3909 /* 3910 * One child of root, need to get a chance to copy its contents 3911 * into the root and delete it. Can't go up to next level, 3912 * there's nothing to delete there. 3913 */ 3914 if (xfs_btree_ptr_is_null(cur, &rptr) && 3915 xfs_btree_ptr_is_null(cur, &lptr) && 3916 level == cur->bc_nlevels - 2) { 3917 error = xfs_btree_kill_iroot(cur); 3918 if (!error) 3919 error = xfs_btree_dec_cursor(cur, level, stat); 3920 if (error) 3921 goto error0; 3922 return 0; 3923 } 3924 } 3925 3926 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 3927 !xfs_btree_ptr_is_null(cur, &lptr)); 3928 3929 /* 3930 * Duplicate the cursor so our btree manipulations here won't 3931 * disrupt the next level up. 3932 */ 3933 error = xfs_btree_dup_cursor(cur, &tcur); 3934 if (error) 3935 goto error0; 3936 3937 /* 3938 * If there's a right sibling, see if it's ok to shift an entry 3939 * out of it. 3940 */ 3941 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3942 /* 3943 * Move the temp cursor to the last entry in the next block. 3944 * Actually any entry but the first would suffice. 3945 */ 3946 i = xfs_btree_lastrec(tcur, level); 3947 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 3948 3949 error = xfs_btree_increment(tcur, level, &i); 3950 if (error) 3951 goto error0; 3952 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 3953 3954 i = xfs_btree_lastrec(tcur, level); 3955 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 3956 3957 /* Grab a pointer to the block. */ 3958 right = xfs_btree_get_block(tcur, level, &rbp); 3959 #ifdef DEBUG 3960 error = xfs_btree_check_block(tcur, right, level, rbp); 3961 if (error) 3962 goto error0; 3963 #endif 3964 /* Grab the current block number, for future use. */ 3965 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 3966 3967 /* 3968 * If right block is full enough so that removing one entry 3969 * won't make it too empty, and left-shifting an entry out 3970 * of right to us works, we're done. 3971 */ 3972 if (xfs_btree_get_numrecs(right) - 1 >= 3973 cur->bc_ops->get_minrecs(tcur, level)) { 3974 error = xfs_btree_lshift(tcur, level, &i); 3975 if (error) 3976 goto error0; 3977 if (i) { 3978 ASSERT(xfs_btree_get_numrecs(block) >= 3979 cur->bc_ops->get_minrecs(tcur, level)); 3980 3981 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 3982 tcur = NULL; 3983 3984 error = xfs_btree_dec_cursor(cur, level, stat); 3985 if (error) 3986 goto error0; 3987 return 0; 3988 } 3989 } 3990 3991 /* 3992 * Otherwise, grab the number of records in right for 3993 * future reference, and fix up the temp cursor to point 3994 * to our block again (last record). 3995 */ 3996 rrecs = xfs_btree_get_numrecs(right); 3997 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3998 i = xfs_btree_firstrec(tcur, level); 3999 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 4000 4001 error = xfs_btree_decrement(tcur, level, &i); 4002 if (error) 4003 goto error0; 4004 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 4005 } 4006 } 4007 4008 /* 4009 * If there's a left sibling, see if it's ok to shift an entry 4010 * out of it. 4011 */ 4012 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4013 /* 4014 * Move the temp cursor to the first entry in the 4015 * previous block. 4016 */ 4017 i = xfs_btree_firstrec(tcur, level); 4018 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 4019 4020 error = xfs_btree_decrement(tcur, level, &i); 4021 if (error) 4022 goto error0; 4023 i = xfs_btree_firstrec(tcur, level); 4024 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); 4025 4026 /* Grab a pointer to the block. */ 4027 left = xfs_btree_get_block(tcur, level, &lbp); 4028 #ifdef DEBUG 4029 error = xfs_btree_check_block(cur, left, level, lbp); 4030 if (error) 4031 goto error0; 4032 #endif 4033 /* Grab the current block number, for future use. */ 4034 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4035 4036 /* 4037 * If left block is full enough so that removing one entry 4038 * won't make it too empty, and right-shifting an entry out 4039 * of left to us works, we're done. 4040 */ 4041 if (xfs_btree_get_numrecs(left) - 1 >= 4042 cur->bc_ops->get_minrecs(tcur, level)) { 4043 error = xfs_btree_rshift(tcur, level, &i); 4044 if (error) 4045 goto error0; 4046 if (i) { 4047 ASSERT(xfs_btree_get_numrecs(block) >= 4048 cur->bc_ops->get_minrecs(tcur, level)); 4049 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4050 tcur = NULL; 4051 if (level == 0) 4052 cur->bc_ptrs[0]++; 4053 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 4054 *stat = 1; 4055 return 0; 4056 } 4057 } 4058 4059 /* 4060 * Otherwise, grab the number of records in right for 4061 * future reference. 4062 */ 4063 lrecs = xfs_btree_get_numrecs(left); 4064 } 4065 4066 /* Delete the temp cursor, we're done with it. */ 4067 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4068 tcur = NULL; 4069 4070 /* If here, we need to do a join to keep the tree balanced. */ 4071 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4072 4073 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4074 lrecs + xfs_btree_get_numrecs(block) <= 4075 cur->bc_ops->get_maxrecs(cur, level)) { 4076 /* 4077 * Set "right" to be the starting block, 4078 * "left" to be the left neighbor. 4079 */ 4080 rptr = cptr; 4081 right = block; 4082 rbp = bp; 4083 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4084 if (error) 4085 goto error0; 4086 4087 /* 4088 * If that won't work, see if we can join with the right neighbor block. 4089 */ 4090 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4091 rrecs + xfs_btree_get_numrecs(block) <= 4092 cur->bc_ops->get_maxrecs(cur, level)) { 4093 /* 4094 * Set "left" to be the starting block, 4095 * "right" to be the right neighbor. 4096 */ 4097 lptr = cptr; 4098 left = block; 4099 lbp = bp; 4100 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4101 if (error) 4102 goto error0; 4103 4104 /* 4105 * Otherwise, we can't fix the imbalance. 4106 * Just return. This is probably a logic error, but it's not fatal. 4107 */ 4108 } else { 4109 error = xfs_btree_dec_cursor(cur, level, stat); 4110 if (error) 4111 goto error0; 4112 return 0; 4113 } 4114 4115 rrecs = xfs_btree_get_numrecs(right); 4116 lrecs = xfs_btree_get_numrecs(left); 4117 4118 /* 4119 * We're now going to join "left" and "right" by moving all the stuff 4120 * in "right" to "left" and deleting "right". 4121 */ 4122 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4123 if (level > 0) { 4124 /* It's a non-leaf. Move keys and pointers. */ 4125 union xfs_btree_key *lkp; /* left btree key */ 4126 union xfs_btree_ptr *lpp; /* left address pointer */ 4127 union xfs_btree_key *rkp; /* right btree key */ 4128 union xfs_btree_ptr *rpp; /* right address pointer */ 4129 4130 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4131 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4132 rkp = xfs_btree_key_addr(cur, 1, right); 4133 rpp = xfs_btree_ptr_addr(cur, 1, right); 4134 #ifdef DEBUG 4135 for (i = 1; i < rrecs; i++) { 4136 error = xfs_btree_check_ptr(cur, rpp, i, level); 4137 if (error) 4138 goto error0; 4139 } 4140 #endif 4141 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4142 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4143 4144 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4145 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4146 } else { 4147 /* It's a leaf. Move records. */ 4148 union xfs_btree_rec *lrp; /* left record pointer */ 4149 union xfs_btree_rec *rrp; /* right record pointer */ 4150 4151 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4152 rrp = xfs_btree_rec_addr(cur, 1, right); 4153 4154 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4155 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4156 } 4157 4158 XFS_BTREE_STATS_INC(cur, join); 4159 4160 /* 4161 * Fix up the number of records and right block pointer in the 4162 * surviving block, and log it. 4163 */ 4164 xfs_btree_set_numrecs(left, lrecs + rrecs); 4165 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), 4166 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4167 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4168 4169 /* If there is a right sibling, point it to the remaining block. */ 4170 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4171 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4172 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4173 if (error) 4174 goto error0; 4175 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4176 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4177 } 4178 4179 /* Free the deleted block. */ 4180 error = xfs_btree_free_block(cur, rbp); 4181 if (error) 4182 goto error0; 4183 4184 /* 4185 * If we joined with the left neighbor, set the buffer in the 4186 * cursor to the left block, and fix up the index. 4187 */ 4188 if (bp != lbp) { 4189 cur->bc_bufs[level] = lbp; 4190 cur->bc_ptrs[level] += lrecs; 4191 cur->bc_ra[level] = 0; 4192 } 4193 /* 4194 * If we joined with the right neighbor and there's a level above 4195 * us, increment the cursor at that level. 4196 */ 4197 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || 4198 (level + 1 < cur->bc_nlevels)) { 4199 error = xfs_btree_increment(cur, level + 1, &i); 4200 if (error) 4201 goto error0; 4202 } 4203 4204 /* 4205 * Readjust the ptr at this level if it's not a leaf, since it's 4206 * still pointing at the deletion point, which makes the cursor 4207 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4208 * We can't use decrement because it would change the next level up. 4209 */ 4210 if (level > 0) 4211 cur->bc_ptrs[level]--; 4212 4213 /* 4214 * We combined blocks, so we have to update the parent keys if the 4215 * btree supports overlapped intervals. However, bc_ptrs[level + 1] 4216 * points to the old block so that the caller knows which record to 4217 * delete. Therefore, the caller must be savvy enough to call updkeys 4218 * for us if we return stat == 2. The other exit points from this 4219 * function don't require deletions further up the tree, so they can 4220 * call updkeys directly. 4221 */ 4222 4223 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 4224 /* Return value means the next level up has something to do. */ 4225 *stat = 2; 4226 return 0; 4227 4228 error0: 4229 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 4230 if (tcur) 4231 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4232 return error; 4233 } 4234 4235 /* 4236 * Delete the record pointed to by cur. 4237 * The cursor refers to the place where the record was (could be inserted) 4238 * when the operation returns. 4239 */ 4240 int /* error */ 4241 xfs_btree_delete( 4242 struct xfs_btree_cur *cur, 4243 int *stat) /* success/failure */ 4244 { 4245 int error; /* error return value */ 4246 int level; 4247 int i; 4248 bool joined = false; 4249 4250 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 4251 4252 /* 4253 * Go up the tree, starting at leaf level. 4254 * 4255 * If 2 is returned then a join was done; go to the next level. 4256 * Otherwise we are done. 4257 */ 4258 for (level = 0, i = 2; i == 2; level++) { 4259 error = xfs_btree_delrec(cur, level, &i); 4260 if (error) 4261 goto error0; 4262 if (i == 2) 4263 joined = true; 4264 } 4265 4266 /* 4267 * If we combined blocks as part of deleting the record, delrec won't 4268 * have updated the parent high keys so we have to do that here. 4269 */ 4270 if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { 4271 error = xfs_btree_updkeys_force(cur, 0); 4272 if (error) 4273 goto error0; 4274 } 4275 4276 if (i == 0) { 4277 for (level = 1; level < cur->bc_nlevels; level++) { 4278 if (cur->bc_ptrs[level] == 0) { 4279 error = xfs_btree_decrement(cur, level, &i); 4280 if (error) 4281 goto error0; 4282 break; 4283 } 4284 } 4285 } 4286 4287 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 4288 *stat = i; 4289 return 0; 4290 error0: 4291 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 4292 return error; 4293 } 4294 4295 /* 4296 * Get the data from the pointed-to record. 4297 */ 4298 int /* error */ 4299 xfs_btree_get_rec( 4300 struct xfs_btree_cur *cur, /* btree cursor */ 4301 union xfs_btree_rec **recp, /* output: btree record */ 4302 int *stat) /* output: success/failure */ 4303 { 4304 struct xfs_btree_block *block; /* btree block */ 4305 struct xfs_buf *bp; /* buffer pointer */ 4306 int ptr; /* record number */ 4307 #ifdef DEBUG 4308 int error; /* error return value */ 4309 #endif 4310 4311 ptr = cur->bc_ptrs[0]; 4312 block = xfs_btree_get_block(cur, 0, &bp); 4313 4314 #ifdef DEBUG 4315 error = xfs_btree_check_block(cur, block, 0, bp); 4316 if (error) 4317 return error; 4318 #endif 4319 4320 /* 4321 * Off the right end or left end, return failure. 4322 */ 4323 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4324 *stat = 0; 4325 return 0; 4326 } 4327 4328 /* 4329 * Point to the record and extract its data. 4330 */ 4331 *recp = xfs_btree_rec_addr(cur, ptr, block); 4332 *stat = 1; 4333 return 0; 4334 } 4335 4336 /* Visit a block in a btree. */ 4337 STATIC int 4338 xfs_btree_visit_block( 4339 struct xfs_btree_cur *cur, 4340 int level, 4341 xfs_btree_visit_blocks_fn fn, 4342 void *data) 4343 { 4344 struct xfs_btree_block *block; 4345 struct xfs_buf *bp; 4346 union xfs_btree_ptr rptr; 4347 int error; 4348 4349 /* do right sibling readahead */ 4350 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4351 block = xfs_btree_get_block(cur, level, &bp); 4352 4353 /* process the block */ 4354 error = fn(cur, level, data); 4355 if (error) 4356 return error; 4357 4358 /* now read rh sibling block for next iteration */ 4359 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4360 if (xfs_btree_ptr_is_null(cur, &rptr)) 4361 return -ENOENT; 4362 4363 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4364 } 4365 4366 4367 /* Visit every block in a btree. */ 4368 int 4369 xfs_btree_visit_blocks( 4370 struct xfs_btree_cur *cur, 4371 xfs_btree_visit_blocks_fn fn, 4372 void *data) 4373 { 4374 union xfs_btree_ptr lptr; 4375 int level; 4376 struct xfs_btree_block *block = NULL; 4377 int error = 0; 4378 4379 cur->bc_ops->init_ptr_from_cur(cur, &lptr); 4380 4381 /* for each level */ 4382 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4383 /* grab the left hand block */ 4384 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4385 if (error) 4386 return error; 4387 4388 /* readahead the left most block for the next level down */ 4389 if (level > 0) { 4390 union xfs_btree_ptr *ptr; 4391 4392 ptr = xfs_btree_ptr_addr(cur, 1, block); 4393 xfs_btree_readahead_ptr(cur, ptr, 1); 4394 4395 /* save for the next iteration of the loop */ 4396 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4397 } 4398 4399 /* for each buffer in the level */ 4400 do { 4401 error = xfs_btree_visit_block(cur, level, fn, data); 4402 } while (!error); 4403 4404 if (error != -ENOENT) 4405 return error; 4406 } 4407 4408 return 0; 4409 } 4410 4411 /* 4412 * Change the owner of a btree. 4413 * 4414 * The mechanism we use here is ordered buffer logging. Because we don't know 4415 * how many buffers were are going to need to modify, we don't really want to 4416 * have to make transaction reservations for the worst case of every buffer in a 4417 * full size btree as that may be more space that we can fit in the log.... 4418 * 4419 * We do the btree walk in the most optimal manner possible - we have sibling 4420 * pointers so we can just walk all the blocks on each level from left to right 4421 * in a single pass, and then move to the next level and do the same. We can 4422 * also do readahead on the sibling pointers to get IO moving more quickly, 4423 * though for slow disks this is unlikely to make much difference to performance 4424 * as the amount of CPU work we have to do before moving to the next block is 4425 * relatively small. 4426 * 4427 * For each btree block that we load, modify the owner appropriately, set the 4428 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4429 * we mark the region we change dirty so that if the buffer is relogged in 4430 * a subsequent transaction the changes we make here as an ordered buffer are 4431 * correctly relogged in that transaction. If we are in recovery context, then 4432 * just queue the modified buffer as delayed write buffer so the transaction 4433 * recovery completion writes the changes to disk. 4434 */ 4435 struct xfs_btree_block_change_owner_info { 4436 uint64_t new_owner; 4437 struct list_head *buffer_list; 4438 }; 4439 4440 static int 4441 xfs_btree_block_change_owner( 4442 struct xfs_btree_cur *cur, 4443 int level, 4444 void *data) 4445 { 4446 struct xfs_btree_block_change_owner_info *bbcoi = data; 4447 struct xfs_btree_block *block; 4448 struct xfs_buf *bp; 4449 4450 /* modify the owner */ 4451 block = xfs_btree_get_block(cur, level, &bp); 4452 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 4453 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4454 else 4455 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4456 4457 /* 4458 * If the block is a root block hosted in an inode, we might not have a 4459 * buffer pointer here and we shouldn't attempt to log the change as the 4460 * information is already held in the inode and discarded when the root 4461 * block is formatted into the on-disk inode fork. We still change it, 4462 * though, so everything is consistent in memory. 4463 */ 4464 if (bp) { 4465 if (cur->bc_tp) { 4466 xfs_trans_ordered_buf(cur->bc_tp, bp); 4467 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4468 } else { 4469 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4470 } 4471 } else { 4472 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 4473 ASSERT(level == cur->bc_nlevels - 1); 4474 } 4475 4476 return 0; 4477 } 4478 4479 int 4480 xfs_btree_change_owner( 4481 struct xfs_btree_cur *cur, 4482 uint64_t new_owner, 4483 struct list_head *buffer_list) 4484 { 4485 struct xfs_btree_block_change_owner_info bbcoi; 4486 4487 bbcoi.new_owner = new_owner; 4488 bbcoi.buffer_list = buffer_list; 4489 4490 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4491 &bbcoi); 4492 } 4493 4494 /** 4495 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format 4496 * btree block 4497 * 4498 * @bp: buffer containing the btree block 4499 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount 4500 * @pag_max_level: pointer to the per-ag max level field 4501 */ 4502 bool 4503 xfs_btree_sblock_v5hdr_verify( 4504 struct xfs_buf *bp) 4505 { 4506 struct xfs_mount *mp = bp->b_target->bt_mount; 4507 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4508 struct xfs_perag *pag = bp->b_pag; 4509 4510 if (!xfs_sb_version_hascrc(&mp->m_sb)) 4511 return false; 4512 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4513 return false; 4514 if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) 4515 return false; 4516 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) 4517 return false; 4518 return true; 4519 } 4520 4521 /** 4522 * xfs_btree_sblock_verify() -- verify a short-format btree block 4523 * 4524 * @bp: buffer containing the btree block 4525 * @max_recs: maximum records allowed in this btree node 4526 */ 4527 bool 4528 xfs_btree_sblock_verify( 4529 struct xfs_buf *bp, 4530 unsigned int max_recs) 4531 { 4532 struct xfs_mount *mp = bp->b_target->bt_mount; 4533 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4534 4535 /* numrecs verification */ 4536 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4537 return false; 4538 4539 /* sibling pointer verification */ 4540 if (!block->bb_u.s.bb_leftsib || 4541 (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && 4542 block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) 4543 return false; 4544 if (!block->bb_u.s.bb_rightsib || 4545 (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && 4546 block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) 4547 return false; 4548 4549 return true; 4550 } 4551 4552 /* 4553 * Calculate the number of btree levels needed to store a given number of 4554 * records in a short-format btree. 4555 */ 4556 uint 4557 xfs_btree_compute_maxlevels( 4558 struct xfs_mount *mp, 4559 uint *limits, 4560 unsigned long len) 4561 { 4562 uint level; 4563 unsigned long maxblocks; 4564 4565 maxblocks = (len + limits[0] - 1) / limits[0]; 4566 for (level = 1; maxblocks > 1; level++) 4567 maxblocks = (maxblocks + limits[1] - 1) / limits[1]; 4568 return level; 4569 } 4570 4571 /* 4572 * Query a regular btree for all records overlapping a given interval. 4573 * Start with a LE lookup of the key of low_rec and return all records 4574 * until we find a record with a key greater than the key of high_rec. 4575 */ 4576 STATIC int 4577 xfs_btree_simple_query_range( 4578 struct xfs_btree_cur *cur, 4579 union xfs_btree_key *low_key, 4580 union xfs_btree_key *high_key, 4581 xfs_btree_query_range_fn fn, 4582 void *priv) 4583 { 4584 union xfs_btree_rec *recp; 4585 union xfs_btree_key rec_key; 4586 int64_t diff; 4587 int stat; 4588 bool firstrec = true; 4589 int error; 4590 4591 ASSERT(cur->bc_ops->init_high_key_from_rec); 4592 ASSERT(cur->bc_ops->diff_two_keys); 4593 4594 /* 4595 * Find the leftmost record. The btree cursor must be set 4596 * to the low record used to generate low_key. 4597 */ 4598 stat = 0; 4599 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 4600 if (error) 4601 goto out; 4602 4603 /* Nothing? See if there's anything to the right. */ 4604 if (!stat) { 4605 error = xfs_btree_increment(cur, 0, &stat); 4606 if (error) 4607 goto out; 4608 } 4609 4610 while (stat) { 4611 /* Find the record. */ 4612 error = xfs_btree_get_rec(cur, &recp, &stat); 4613 if (error || !stat) 4614 break; 4615 4616 /* Skip if high_key(rec) < low_key. */ 4617 if (firstrec) { 4618 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 4619 firstrec = false; 4620 diff = cur->bc_ops->diff_two_keys(cur, low_key, 4621 &rec_key); 4622 if (diff > 0) 4623 goto advloop; 4624 } 4625 4626 /* Stop if high_key < low_key(rec). */ 4627 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4628 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); 4629 if (diff > 0) 4630 break; 4631 4632 /* Callback */ 4633 error = fn(cur, recp, priv); 4634 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) 4635 break; 4636 4637 advloop: 4638 /* Move on to the next record. */ 4639 error = xfs_btree_increment(cur, 0, &stat); 4640 if (error) 4641 break; 4642 } 4643 4644 out: 4645 return error; 4646 } 4647 4648 /* 4649 * Query an overlapped interval btree for all records overlapping a given 4650 * interval. This function roughly follows the algorithm given in 4651 * "Interval Trees" of _Introduction to Algorithms_, which is section 4652 * 14.3 in the 2nd and 3rd editions. 4653 * 4654 * First, generate keys for the low and high records passed in. 4655 * 4656 * For any leaf node, generate the high and low keys for the record. 4657 * If the record keys overlap with the query low/high keys, pass the 4658 * record to the function iterator. 4659 * 4660 * For any internal node, compare the low and high keys of each 4661 * pointer against the query low/high keys. If there's an overlap, 4662 * follow the pointer. 4663 * 4664 * As an optimization, we stop scanning a block when we find a low key 4665 * that is greater than the query's high key. 4666 */ 4667 STATIC int 4668 xfs_btree_overlapped_query_range( 4669 struct xfs_btree_cur *cur, 4670 union xfs_btree_key *low_key, 4671 union xfs_btree_key *high_key, 4672 xfs_btree_query_range_fn fn, 4673 void *priv) 4674 { 4675 union xfs_btree_ptr ptr; 4676 union xfs_btree_ptr *pp; 4677 union xfs_btree_key rec_key; 4678 union xfs_btree_key rec_hkey; 4679 union xfs_btree_key *lkp; 4680 union xfs_btree_key *hkp; 4681 union xfs_btree_rec *recp; 4682 struct xfs_btree_block *block; 4683 int64_t ldiff; 4684 int64_t hdiff; 4685 int level; 4686 struct xfs_buf *bp; 4687 int i; 4688 int error; 4689 4690 /* Load the root of the btree. */ 4691 level = cur->bc_nlevels - 1; 4692 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 4693 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 4694 if (error) 4695 return error; 4696 xfs_btree_get_block(cur, level, &bp); 4697 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4698 #ifdef DEBUG 4699 error = xfs_btree_check_block(cur, block, level, bp); 4700 if (error) 4701 goto out; 4702 #endif 4703 cur->bc_ptrs[level] = 1; 4704 4705 while (level < cur->bc_nlevels) { 4706 block = xfs_btree_get_block(cur, level, &bp); 4707 4708 /* End of node, pop back towards the root. */ 4709 if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { 4710 pop_up: 4711 if (level < cur->bc_nlevels - 1) 4712 cur->bc_ptrs[level + 1]++; 4713 level++; 4714 continue; 4715 } 4716 4717 if (level == 0) { 4718 /* Handle a leaf node. */ 4719 recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); 4720 4721 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 4722 ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, 4723 low_key); 4724 4725 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4726 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, 4727 &rec_key); 4728 4729 /* 4730 * If (record's high key >= query's low key) and 4731 * (query's high key >= record's low key), then 4732 * this record overlaps the query range; callback. 4733 */ 4734 if (ldiff >= 0 && hdiff >= 0) { 4735 error = fn(cur, recp, priv); 4736 if (error < 0 || 4737 error == XFS_BTREE_QUERY_RANGE_ABORT) 4738 break; 4739 } else if (hdiff < 0) { 4740 /* Record is larger than high key; pop. */ 4741 goto pop_up; 4742 } 4743 cur->bc_ptrs[level]++; 4744 continue; 4745 } 4746 4747 /* Handle an internal node. */ 4748 lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); 4749 hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); 4750 pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); 4751 4752 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); 4753 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); 4754 4755 /* 4756 * If (pointer's high key >= query's low key) and 4757 * (query's high key >= pointer's low key), then 4758 * this record overlaps the query range; follow pointer. 4759 */ 4760 if (ldiff >= 0 && hdiff >= 0) { 4761 level--; 4762 error = xfs_btree_lookup_get_block(cur, level, pp, 4763 &block); 4764 if (error) 4765 goto out; 4766 xfs_btree_get_block(cur, level, &bp); 4767 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4768 #ifdef DEBUG 4769 error = xfs_btree_check_block(cur, block, level, bp); 4770 if (error) 4771 goto out; 4772 #endif 4773 cur->bc_ptrs[level] = 1; 4774 continue; 4775 } else if (hdiff < 0) { 4776 /* The low key is larger than the upper range; pop. */ 4777 goto pop_up; 4778 } 4779 cur->bc_ptrs[level]++; 4780 } 4781 4782 out: 4783 /* 4784 * If we don't end this function with the cursor pointing at a record 4785 * block, a subsequent non-error cursor deletion will not release 4786 * node-level buffers, causing a buffer leak. This is quite possible 4787 * with a zero-results range query, so release the buffers if we 4788 * failed to return any results. 4789 */ 4790 if (cur->bc_bufs[0] == NULL) { 4791 for (i = 0; i < cur->bc_nlevels; i++) { 4792 if (cur->bc_bufs[i]) { 4793 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); 4794 cur->bc_bufs[i] = NULL; 4795 cur->bc_ptrs[i] = 0; 4796 cur->bc_ra[i] = 0; 4797 } 4798 } 4799 } 4800 4801 return error; 4802 } 4803 4804 /* 4805 * Query a btree for all records overlapping a given interval of keys. The 4806 * supplied function will be called with each record found; return one of the 4807 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4808 * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a 4809 * negative error code. 4810 */ 4811 int 4812 xfs_btree_query_range( 4813 struct xfs_btree_cur *cur, 4814 union xfs_btree_irec *low_rec, 4815 union xfs_btree_irec *high_rec, 4816 xfs_btree_query_range_fn fn, 4817 void *priv) 4818 { 4819 union xfs_btree_rec rec; 4820 union xfs_btree_key low_key; 4821 union xfs_btree_key high_key; 4822 4823 /* Find the keys of both ends of the interval. */ 4824 cur->bc_rec = *high_rec; 4825 cur->bc_ops->init_rec_from_cur(cur, &rec); 4826 cur->bc_ops->init_key_from_rec(&high_key, &rec); 4827 4828 cur->bc_rec = *low_rec; 4829 cur->bc_ops->init_rec_from_cur(cur, &rec); 4830 cur->bc_ops->init_key_from_rec(&low_key, &rec); 4831 4832 /* Enforce low key < high key. */ 4833 if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) 4834 return -EINVAL; 4835 4836 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) 4837 return xfs_btree_simple_query_range(cur, &low_key, 4838 &high_key, fn, priv); 4839 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 4840 fn, priv); 4841 } 4842 4843 /* Query a btree for all records. */ 4844 int 4845 xfs_btree_query_all( 4846 struct xfs_btree_cur *cur, 4847 xfs_btree_query_range_fn fn, 4848 void *priv) 4849 { 4850 union xfs_btree_key low_key; 4851 union xfs_btree_key high_key; 4852 4853 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 4854 memset(&low_key, 0, sizeof(low_key)); 4855 memset(&high_key, 0xFF, sizeof(high_key)); 4856 4857 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 4858 } 4859 4860 /* 4861 * Calculate the number of blocks needed to store a given number of records 4862 * in a short-format (per-AG metadata) btree. 4863 */ 4864 xfs_extlen_t 4865 xfs_btree_calc_size( 4866 struct xfs_mount *mp, 4867 uint *limits, 4868 unsigned long long len) 4869 { 4870 int level; 4871 int maxrecs; 4872 xfs_extlen_t rval; 4873 4874 maxrecs = limits[0]; 4875 for (level = 0, rval = 0; len > 1; level++) { 4876 len += maxrecs - 1; 4877 do_div(len, maxrecs); 4878 maxrecs = limits[1]; 4879 rval += len; 4880 } 4881 return rval; 4882 } 4883 4884 static int 4885 xfs_btree_count_blocks_helper( 4886 struct xfs_btree_cur *cur, 4887 int level, 4888 void *data) 4889 { 4890 xfs_extlen_t *blocks = data; 4891 (*blocks)++; 4892 4893 return 0; 4894 } 4895 4896 /* Count the blocks in a btree and return the result in *blocks. */ 4897 int 4898 xfs_btree_count_blocks( 4899 struct xfs_btree_cur *cur, 4900 xfs_extlen_t *blocks) 4901 { 4902 *blocks = 0; 4903 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 4904 blocks); 4905 } 4906