1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2014 Red Hat, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_trans.h" 14 #include "xfs_alloc.h" 15 #include "xfs_btree.h" 16 #include "xfs_btree_staging.h" 17 #include "xfs_rmap.h" 18 #include "xfs_rmap_btree.h" 19 #include "xfs_trace.h" 20 #include "xfs_error.h" 21 #include "xfs_extent_busy.h" 22 #include "xfs_ag.h" 23 #include "xfs_ag_resv.h" 24 25 /* 26 * Reverse map btree. 27 * 28 * This is a per-ag tree used to track the owner(s) of a given extent. With 29 * reflink it is possible for there to be multiple owners, which is a departure 30 * from classic XFS. Owner records for data extents are inserted when the 31 * extent is mapped and removed when an extent is unmapped. Owner records for 32 * all other block types (i.e. metadata) are inserted when an extent is 33 * allocated and removed when an extent is freed. There can only be one owner 34 * of a metadata extent, usually an inode or some other metadata structure like 35 * an AG btree. 36 * 37 * The rmap btree is part of the free space management, so blocks for the tree 38 * are sourced from the agfl. Hence we need transaction reservation support for 39 * this tree so that the freelist is always large enough. This also impacts on 40 * the minimum space we need to leave free in the AG. 41 * 42 * The tree is ordered by [ag block, owner, offset]. This is a large key size, 43 * but it is the only way to enforce unique keys when a block can be owned by 44 * multiple files at any offset. There's no need to order/search by extent 45 * size for online updating/management of the tree. It is intended that most 46 * reverse lookups will be to find the owner(s) of a particular block, or to 47 * try to recover tree and file data from corrupt primary metadata. 48 */ 49 50 static struct xfs_btree_cur * 51 xfs_rmapbt_dup_cursor( 52 struct xfs_btree_cur *cur) 53 { 54 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, 55 cur->bc_ag.agbp, cur->bc_ag.pag); 56 } 57 58 STATIC void 59 xfs_rmapbt_set_root( 60 struct xfs_btree_cur *cur, 61 const union xfs_btree_ptr *ptr, 62 int inc) 63 { 64 struct xfs_buf *agbp = cur->bc_ag.agbp; 65 struct xfs_agf *agf = agbp->b_addr; 66 int btnum = cur->bc_btnum; 67 68 ASSERT(ptr->s != 0); 69 70 agf->agf_roots[btnum] = ptr->s; 71 be32_add_cpu(&agf->agf_levels[btnum], inc); 72 cur->bc_ag.pag->pagf_levels[btnum] += inc; 73 74 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 75 } 76 77 STATIC int 78 xfs_rmapbt_alloc_block( 79 struct xfs_btree_cur *cur, 80 const union xfs_btree_ptr *start, 81 union xfs_btree_ptr *new, 82 int *stat) 83 { 84 struct xfs_buf *agbp = cur->bc_ag.agbp; 85 struct xfs_agf *agf = agbp->b_addr; 86 struct xfs_perag *pag = cur->bc_ag.pag; 87 int error; 88 xfs_agblock_t bno; 89 90 /* Allocate the new block from the freelist. If we can't, give up. */ 91 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, 92 &bno, 1); 93 if (error) 94 return error; 95 96 trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); 97 if (bno == NULLAGBLOCK) { 98 *stat = 0; 99 return 0; 100 } 101 102 xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); 103 104 new->s = cpu_to_be32(bno); 105 be32_add_cpu(&agf->agf_rmap_blocks, 1); 106 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 107 108 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); 109 110 *stat = 1; 111 return 0; 112 } 113 114 STATIC int 115 xfs_rmapbt_free_block( 116 struct xfs_btree_cur *cur, 117 struct xfs_buf *bp) 118 { 119 struct xfs_buf *agbp = cur->bc_ag.agbp; 120 struct xfs_agf *agf = agbp->b_addr; 121 struct xfs_perag *pag = cur->bc_ag.pag; 122 xfs_agblock_t bno; 123 int error; 124 125 bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); 126 trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, 127 bno, 1); 128 be32_add_cpu(&agf->agf_rmap_blocks, -1); 129 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 130 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 131 if (error) 132 return error; 133 134 xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, 135 XFS_EXTENT_BUSY_SKIP_DISCARD); 136 137 xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); 138 return 0; 139 } 140 141 STATIC int 142 xfs_rmapbt_get_minrecs( 143 struct xfs_btree_cur *cur, 144 int level) 145 { 146 return cur->bc_mp->m_rmap_mnr[level != 0]; 147 } 148 149 STATIC int 150 xfs_rmapbt_get_maxrecs( 151 struct xfs_btree_cur *cur, 152 int level) 153 { 154 return cur->bc_mp->m_rmap_mxr[level != 0]; 155 } 156 157 STATIC void 158 xfs_rmapbt_init_key_from_rec( 159 union xfs_btree_key *key, 160 const union xfs_btree_rec *rec) 161 { 162 key->rmap.rm_startblock = rec->rmap.rm_startblock; 163 key->rmap.rm_owner = rec->rmap.rm_owner; 164 key->rmap.rm_offset = rec->rmap.rm_offset; 165 } 166 167 /* 168 * The high key for a reverse mapping record can be computed by shifting 169 * the startblock and offset to the highest value that would still map 170 * to that record. In practice this means that we add blockcount-1 to 171 * the startblock for all records, and if the record is for a data/attr 172 * fork mapping, we add blockcount-1 to the offset too. 173 */ 174 STATIC void 175 xfs_rmapbt_init_high_key_from_rec( 176 union xfs_btree_key *key, 177 const union xfs_btree_rec *rec) 178 { 179 uint64_t off; 180 int adj; 181 182 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 183 184 key->rmap.rm_startblock = rec->rmap.rm_startblock; 185 be32_add_cpu(&key->rmap.rm_startblock, adj); 186 key->rmap.rm_owner = rec->rmap.rm_owner; 187 key->rmap.rm_offset = rec->rmap.rm_offset; 188 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 189 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 190 return; 191 off = be64_to_cpu(key->rmap.rm_offset); 192 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 193 key->rmap.rm_offset = cpu_to_be64(off); 194 } 195 196 STATIC void 197 xfs_rmapbt_init_rec_from_cur( 198 struct xfs_btree_cur *cur, 199 union xfs_btree_rec *rec) 200 { 201 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 202 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 203 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 204 rec->rmap.rm_offset = cpu_to_be64( 205 xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 206 } 207 208 STATIC void 209 xfs_rmapbt_init_ptr_from_cur( 210 struct xfs_btree_cur *cur, 211 union xfs_btree_ptr *ptr) 212 { 213 struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; 214 215 ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); 216 217 ptr->s = agf->agf_roots[cur->bc_btnum]; 218 } 219 220 STATIC int64_t 221 xfs_rmapbt_key_diff( 222 struct xfs_btree_cur *cur, 223 const union xfs_btree_key *key) 224 { 225 struct xfs_rmap_irec *rec = &cur->bc_rec.r; 226 const struct xfs_rmap_key *kp = &key->rmap; 227 __u64 x, y; 228 int64_t d; 229 230 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 231 if (d) 232 return d; 233 234 x = be64_to_cpu(kp->rm_owner); 235 y = rec->rm_owner; 236 if (x > y) 237 return 1; 238 else if (y > x) 239 return -1; 240 241 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); 242 y = rec->rm_offset; 243 if (x > y) 244 return 1; 245 else if (y > x) 246 return -1; 247 return 0; 248 } 249 250 STATIC int64_t 251 xfs_rmapbt_diff_two_keys( 252 struct xfs_btree_cur *cur, 253 const union xfs_btree_key *k1, 254 const union xfs_btree_key *k2) 255 { 256 const struct xfs_rmap_key *kp1 = &k1->rmap; 257 const struct xfs_rmap_key *kp2 = &k2->rmap; 258 int64_t d; 259 __u64 x, y; 260 261 d = (int64_t)be32_to_cpu(kp1->rm_startblock) - 262 be32_to_cpu(kp2->rm_startblock); 263 if (d) 264 return d; 265 266 x = be64_to_cpu(kp1->rm_owner); 267 y = be64_to_cpu(kp2->rm_owner); 268 if (x > y) 269 return 1; 270 else if (y > x) 271 return -1; 272 273 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); 274 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); 275 if (x > y) 276 return 1; 277 else if (y > x) 278 return -1; 279 return 0; 280 } 281 282 static xfs_failaddr_t 283 xfs_rmapbt_verify( 284 struct xfs_buf *bp) 285 { 286 struct xfs_mount *mp = bp->b_mount; 287 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 288 struct xfs_perag *pag = bp->b_pag; 289 xfs_failaddr_t fa; 290 unsigned int level; 291 292 /* 293 * magic number and level verification 294 * 295 * During growfs operations, we can't verify the exact level or owner as 296 * the perag is not fully initialised and hence not attached to the 297 * buffer. In this case, check against the maximum tree depth. 298 * 299 * Similarly, during log recovery we will have a perag structure 300 * attached, but the agf information will not yet have been initialised 301 * from the on disk AGF. Again, we can only check against maximum limits 302 * in this case. 303 */ 304 if (!xfs_verify_magic(bp, block->bb_magic)) 305 return __this_address; 306 307 if (!xfs_has_rmapbt(mp)) 308 return __this_address; 309 fa = xfs_btree_sblock_v5hdr_verify(bp); 310 if (fa) 311 return fa; 312 313 level = be16_to_cpu(block->bb_level); 314 if (pag && pag->pagf_init) { 315 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) 316 return __this_address; 317 } else if (level >= mp->m_rmap_maxlevels) 318 return __this_address; 319 320 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); 321 } 322 323 static void 324 xfs_rmapbt_read_verify( 325 struct xfs_buf *bp) 326 { 327 xfs_failaddr_t fa; 328 329 if (!xfs_btree_sblock_verify_crc(bp)) 330 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 331 else { 332 fa = xfs_rmapbt_verify(bp); 333 if (fa) 334 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 335 } 336 337 if (bp->b_error) 338 trace_xfs_btree_corrupt(bp, _RET_IP_); 339 } 340 341 static void 342 xfs_rmapbt_write_verify( 343 struct xfs_buf *bp) 344 { 345 xfs_failaddr_t fa; 346 347 fa = xfs_rmapbt_verify(bp); 348 if (fa) { 349 trace_xfs_btree_corrupt(bp, _RET_IP_); 350 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 351 return; 352 } 353 xfs_btree_sblock_calc_crc(bp); 354 355 } 356 357 const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 358 .name = "xfs_rmapbt", 359 .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, 360 .verify_read = xfs_rmapbt_read_verify, 361 .verify_write = xfs_rmapbt_write_verify, 362 .verify_struct = xfs_rmapbt_verify, 363 }; 364 365 STATIC int 366 xfs_rmapbt_keys_inorder( 367 struct xfs_btree_cur *cur, 368 const union xfs_btree_key *k1, 369 const union xfs_btree_key *k2) 370 { 371 uint32_t x; 372 uint32_t y; 373 uint64_t a; 374 uint64_t b; 375 376 x = be32_to_cpu(k1->rmap.rm_startblock); 377 y = be32_to_cpu(k2->rmap.rm_startblock); 378 if (x < y) 379 return 1; 380 else if (x > y) 381 return 0; 382 a = be64_to_cpu(k1->rmap.rm_owner); 383 b = be64_to_cpu(k2->rmap.rm_owner); 384 if (a < b) 385 return 1; 386 else if (a > b) 387 return 0; 388 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); 389 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); 390 if (a <= b) 391 return 1; 392 return 0; 393 } 394 395 STATIC int 396 xfs_rmapbt_recs_inorder( 397 struct xfs_btree_cur *cur, 398 const union xfs_btree_rec *r1, 399 const union xfs_btree_rec *r2) 400 { 401 uint32_t x; 402 uint32_t y; 403 uint64_t a; 404 uint64_t b; 405 406 x = be32_to_cpu(r1->rmap.rm_startblock); 407 y = be32_to_cpu(r2->rmap.rm_startblock); 408 if (x < y) 409 return 1; 410 else if (x > y) 411 return 0; 412 a = be64_to_cpu(r1->rmap.rm_owner); 413 b = be64_to_cpu(r2->rmap.rm_owner); 414 if (a < b) 415 return 1; 416 else if (a > b) 417 return 0; 418 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); 419 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); 420 if (a <= b) 421 return 1; 422 return 0; 423 } 424 425 static const struct xfs_btree_ops xfs_rmapbt_ops = { 426 .rec_len = sizeof(struct xfs_rmap_rec), 427 .key_len = 2 * sizeof(struct xfs_rmap_key), 428 429 .dup_cursor = xfs_rmapbt_dup_cursor, 430 .set_root = xfs_rmapbt_set_root, 431 .alloc_block = xfs_rmapbt_alloc_block, 432 .free_block = xfs_rmapbt_free_block, 433 .get_minrecs = xfs_rmapbt_get_minrecs, 434 .get_maxrecs = xfs_rmapbt_get_maxrecs, 435 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 436 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 437 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 438 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 439 .key_diff = xfs_rmapbt_key_diff, 440 .buf_ops = &xfs_rmapbt_buf_ops, 441 .diff_two_keys = xfs_rmapbt_diff_two_keys, 442 .keys_inorder = xfs_rmapbt_keys_inorder, 443 .recs_inorder = xfs_rmapbt_recs_inorder, 444 }; 445 446 static struct xfs_btree_cur * 447 xfs_rmapbt_init_common( 448 struct xfs_mount *mp, 449 struct xfs_trans *tp, 450 struct xfs_perag *pag) 451 { 452 struct xfs_btree_cur *cur; 453 454 /* Overlapping btree; 2 keys per pointer. */ 455 cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, 456 mp->m_rmap_maxlevels); 457 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; 458 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); 459 cur->bc_ops = &xfs_rmapbt_ops; 460 461 /* take a reference for the cursor */ 462 atomic_inc(&pag->pag_ref); 463 cur->bc_ag.pag = pag; 464 465 return cur; 466 } 467 468 /* Create a new reverse mapping btree cursor. */ 469 struct xfs_btree_cur * 470 xfs_rmapbt_init_cursor( 471 struct xfs_mount *mp, 472 struct xfs_trans *tp, 473 struct xfs_buf *agbp, 474 struct xfs_perag *pag) 475 { 476 struct xfs_agf *agf = agbp->b_addr; 477 struct xfs_btree_cur *cur; 478 479 cur = xfs_rmapbt_init_common(mp, tp, pag); 480 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); 481 cur->bc_ag.agbp = agbp; 482 return cur; 483 } 484 485 /* Create a new reverse mapping btree cursor with a fake root for staging. */ 486 struct xfs_btree_cur * 487 xfs_rmapbt_stage_cursor( 488 struct xfs_mount *mp, 489 struct xbtree_afakeroot *afake, 490 struct xfs_perag *pag) 491 { 492 struct xfs_btree_cur *cur; 493 494 cur = xfs_rmapbt_init_common(mp, NULL, pag); 495 xfs_btree_stage_afakeroot(cur, afake); 496 return cur; 497 } 498 499 /* 500 * Install a new reverse mapping btree root. Caller is responsible for 501 * invalidating and freeing the old btree blocks. 502 */ 503 void 504 xfs_rmapbt_commit_staged_btree( 505 struct xfs_btree_cur *cur, 506 struct xfs_trans *tp, 507 struct xfs_buf *agbp) 508 { 509 struct xfs_agf *agf = agbp->b_addr; 510 struct xbtree_afakeroot *afake = cur->bc_ag.afake; 511 512 ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 513 514 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); 515 agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); 516 agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); 517 xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | 518 XFS_AGF_RMAP_BLOCKS); 519 xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); 520 } 521 522 /* 523 * Calculate number of records in an rmap btree block. 524 */ 525 int 526 xfs_rmapbt_maxrecs( 527 int blocklen, 528 int leaf) 529 { 530 blocklen -= XFS_RMAP_BLOCK_LEN; 531 532 if (leaf) 533 return blocklen / sizeof(struct xfs_rmap_rec); 534 return blocklen / 535 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); 536 } 537 538 /* Compute the maximum height of an rmap btree. */ 539 void 540 xfs_rmapbt_compute_maxlevels( 541 struct xfs_mount *mp) 542 { 543 /* 544 * On a non-reflink filesystem, the maximum number of rmap 545 * records is the number of blocks in the AG, hence the max 546 * rmapbt height is log_$maxrecs($agblocks). However, with 547 * reflink each AG block can have up to 2^32 (per the refcount 548 * record format) owners, which means that theoretically we 549 * could face up to 2^64 rmap records. 550 * 551 * That effectively means that the max rmapbt height must be 552 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG 553 * blocks to feed the rmapbt long before the rmapbt reaches 554 * maximum height. The reflink code uses ag_resv_critical to 555 * disallow reflinking when less than 10% of the per-AG metadata 556 * block reservation since the fallback is a regular file copy. 557 */ 558 if (xfs_has_reflink(mp)) 559 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; 560 else 561 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 562 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 563 } 564 565 /* Calculate the refcount btree size for some records. */ 566 xfs_extlen_t 567 xfs_rmapbt_calc_size( 568 struct xfs_mount *mp, 569 unsigned long long len) 570 { 571 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 572 } 573 574 /* 575 * Calculate the maximum refcount btree size. 576 */ 577 xfs_extlen_t 578 xfs_rmapbt_max_size( 579 struct xfs_mount *mp, 580 xfs_agblock_t agblocks) 581 { 582 /* Bail out if we're uninitialized, which can happen in mkfs. */ 583 if (mp->m_rmap_mxr[0] == 0) 584 return 0; 585 586 return xfs_rmapbt_calc_size(mp, agblocks); 587 } 588 589 /* 590 * Figure out how many blocks to reserve and how many are used by this btree. 591 */ 592 int 593 xfs_rmapbt_calc_reserves( 594 struct xfs_mount *mp, 595 struct xfs_trans *tp, 596 struct xfs_perag *pag, 597 xfs_extlen_t *ask, 598 xfs_extlen_t *used) 599 { 600 struct xfs_buf *agbp; 601 struct xfs_agf *agf; 602 xfs_agblock_t agblocks; 603 xfs_extlen_t tree_len; 604 int error; 605 606 if (!xfs_has_rmapbt(mp)) 607 return 0; 608 609 error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); 610 if (error) 611 return error; 612 613 agf = agbp->b_addr; 614 agblocks = be32_to_cpu(agf->agf_length); 615 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 616 xfs_trans_brelse(tp, agbp); 617 618 /* 619 * The log is permanently allocated, so the space it occupies will 620 * never be available for the kinds of things that would require btree 621 * expansion. We therefore can pretend the space isn't there. 622 */ 623 if (mp->m_sb.sb_logstart && 624 XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) 625 agblocks -= mp->m_sb.sb_logblocks; 626 627 /* Reserve 1% of the AG or enough for 1 block per record. */ 628 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 629 *used += tree_len; 630 631 return error; 632 } 633