1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2014 Red Hat, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_sb.h" 13 #include "xfs_mount.h" 14 #include "xfs_trans.h" 15 #include "xfs_alloc.h" 16 #include "xfs_btree.h" 17 #include "xfs_btree_staging.h" 18 #include "xfs_rmap.h" 19 #include "xfs_rmap_btree.h" 20 #include "xfs_trace.h" 21 #include "xfs_error.h" 22 #include "xfs_extent_busy.h" 23 #include "xfs_ag_resv.h" 24 25 /* 26 * Reverse map btree. 27 * 28 * This is a per-ag tree used to track the owner(s) of a given extent. With 29 * reflink it is possible for there to be multiple owners, which is a departure 30 * from classic XFS. Owner records for data extents are inserted when the 31 * extent is mapped and removed when an extent is unmapped. Owner records for 32 * all other block types (i.e. metadata) are inserted when an extent is 33 * allocated and removed when an extent is freed. There can only be one owner 34 * of a metadata extent, usually an inode or some other metadata structure like 35 * an AG btree. 36 * 37 * The rmap btree is part of the free space management, so blocks for the tree 38 * are sourced from the agfl. Hence we need transaction reservation support for 39 * this tree so that the freelist is always large enough. This also impacts on 40 * the minimum space we need to leave free in the AG. 41 * 42 * The tree is ordered by [ag block, owner, offset]. This is a large key size, 43 * but it is the only way to enforce unique keys when a block can be owned by 44 * multiple files at any offset. There's no need to order/search by extent 45 * size for online updating/management of the tree. It is intended that most 46 * reverse lookups will be to find the owner(s) of a particular block, or to 47 * try to recover tree and file data from corrupt primary metadata. 48 */ 49 50 static struct xfs_btree_cur * 51 xfs_rmapbt_dup_cursor( 52 struct xfs_btree_cur *cur) 53 { 54 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, 55 cur->bc_ag.agbp, cur->bc_ag.agno); 56 } 57 58 STATIC void 59 xfs_rmapbt_set_root( 60 struct xfs_btree_cur *cur, 61 union xfs_btree_ptr *ptr, 62 int inc) 63 { 64 struct xfs_buf *agbp = cur->bc_ag.agbp; 65 struct xfs_agf *agf = agbp->b_addr; 66 int btnum = cur->bc_btnum; 67 struct xfs_perag *pag = agbp->b_pag; 68 69 ASSERT(ptr->s != 0); 70 71 agf->agf_roots[btnum] = ptr->s; 72 be32_add_cpu(&agf->agf_levels[btnum], inc); 73 pag->pagf_levels[btnum] += inc; 74 75 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 76 } 77 78 STATIC int 79 xfs_rmapbt_alloc_block( 80 struct xfs_btree_cur *cur, 81 union xfs_btree_ptr *start, 82 union xfs_btree_ptr *new, 83 int *stat) 84 { 85 struct xfs_buf *agbp = cur->bc_ag.agbp; 86 struct xfs_agf *agf = agbp->b_addr; 87 int error; 88 xfs_agblock_t bno; 89 90 /* Allocate the new block from the freelist. If we can't, give up. */ 91 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, 92 &bno, 1); 93 if (error) 94 return error; 95 96 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, 97 bno, 1); 98 if (bno == NULLAGBLOCK) { 99 *stat = 0; 100 return 0; 101 } 102 103 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, 104 false); 105 106 new->s = cpu_to_be32(bno); 107 be32_add_cpu(&agf->agf_rmap_blocks, 1); 108 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 109 110 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); 111 112 *stat = 1; 113 return 0; 114 } 115 116 STATIC int 117 xfs_rmapbt_free_block( 118 struct xfs_btree_cur *cur, 119 struct xfs_buf *bp) 120 { 121 struct xfs_buf *agbp = cur->bc_ag.agbp; 122 struct xfs_agf *agf = agbp->b_addr; 123 struct xfs_perag *pag; 124 xfs_agblock_t bno; 125 int error; 126 127 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); 128 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, 129 bno, 1); 130 be32_add_cpu(&agf->agf_rmap_blocks, -1); 131 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 132 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 133 if (error) 134 return error; 135 136 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 137 XFS_EXTENT_BUSY_SKIP_DISCARD); 138 139 pag = cur->bc_ag.agbp->b_pag; 140 xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); 141 return 0; 142 } 143 144 STATIC int 145 xfs_rmapbt_get_minrecs( 146 struct xfs_btree_cur *cur, 147 int level) 148 { 149 return cur->bc_mp->m_rmap_mnr[level != 0]; 150 } 151 152 STATIC int 153 xfs_rmapbt_get_maxrecs( 154 struct xfs_btree_cur *cur, 155 int level) 156 { 157 return cur->bc_mp->m_rmap_mxr[level != 0]; 158 } 159 160 STATIC void 161 xfs_rmapbt_init_key_from_rec( 162 union xfs_btree_key *key, 163 union xfs_btree_rec *rec) 164 { 165 key->rmap.rm_startblock = rec->rmap.rm_startblock; 166 key->rmap.rm_owner = rec->rmap.rm_owner; 167 key->rmap.rm_offset = rec->rmap.rm_offset; 168 } 169 170 /* 171 * The high key for a reverse mapping record can be computed by shifting 172 * the startblock and offset to the highest value that would still map 173 * to that record. In practice this means that we add blockcount-1 to 174 * the startblock for all records, and if the record is for a data/attr 175 * fork mapping, we add blockcount-1 to the offset too. 176 */ 177 STATIC void 178 xfs_rmapbt_init_high_key_from_rec( 179 union xfs_btree_key *key, 180 union xfs_btree_rec *rec) 181 { 182 uint64_t off; 183 int adj; 184 185 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 186 187 key->rmap.rm_startblock = rec->rmap.rm_startblock; 188 be32_add_cpu(&key->rmap.rm_startblock, adj); 189 key->rmap.rm_owner = rec->rmap.rm_owner; 190 key->rmap.rm_offset = rec->rmap.rm_offset; 191 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 192 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 193 return; 194 off = be64_to_cpu(key->rmap.rm_offset); 195 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 196 key->rmap.rm_offset = cpu_to_be64(off); 197 } 198 199 STATIC void 200 xfs_rmapbt_init_rec_from_cur( 201 struct xfs_btree_cur *cur, 202 union xfs_btree_rec *rec) 203 { 204 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 205 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 206 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 207 rec->rmap.rm_offset = cpu_to_be64( 208 xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 209 } 210 211 STATIC void 212 xfs_rmapbt_init_ptr_from_cur( 213 struct xfs_btree_cur *cur, 214 union xfs_btree_ptr *ptr) 215 { 216 struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; 217 218 ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); 219 220 ptr->s = agf->agf_roots[cur->bc_btnum]; 221 } 222 223 STATIC int64_t 224 xfs_rmapbt_key_diff( 225 struct xfs_btree_cur *cur, 226 union xfs_btree_key *key) 227 { 228 struct xfs_rmap_irec *rec = &cur->bc_rec.r; 229 struct xfs_rmap_key *kp = &key->rmap; 230 __u64 x, y; 231 int64_t d; 232 233 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 234 if (d) 235 return d; 236 237 x = be64_to_cpu(kp->rm_owner); 238 y = rec->rm_owner; 239 if (x > y) 240 return 1; 241 else if (y > x) 242 return -1; 243 244 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); 245 y = rec->rm_offset; 246 if (x > y) 247 return 1; 248 else if (y > x) 249 return -1; 250 return 0; 251 } 252 253 STATIC int64_t 254 xfs_rmapbt_diff_two_keys( 255 struct xfs_btree_cur *cur, 256 union xfs_btree_key *k1, 257 union xfs_btree_key *k2) 258 { 259 struct xfs_rmap_key *kp1 = &k1->rmap; 260 struct xfs_rmap_key *kp2 = &k2->rmap; 261 int64_t d; 262 __u64 x, y; 263 264 d = (int64_t)be32_to_cpu(kp1->rm_startblock) - 265 be32_to_cpu(kp2->rm_startblock); 266 if (d) 267 return d; 268 269 x = be64_to_cpu(kp1->rm_owner); 270 y = be64_to_cpu(kp2->rm_owner); 271 if (x > y) 272 return 1; 273 else if (y > x) 274 return -1; 275 276 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); 277 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); 278 if (x > y) 279 return 1; 280 else if (y > x) 281 return -1; 282 return 0; 283 } 284 285 static xfs_failaddr_t 286 xfs_rmapbt_verify( 287 struct xfs_buf *bp) 288 { 289 struct xfs_mount *mp = bp->b_mount; 290 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 291 struct xfs_perag *pag = bp->b_pag; 292 xfs_failaddr_t fa; 293 unsigned int level; 294 295 /* 296 * magic number and level verification 297 * 298 * During growfs operations, we can't verify the exact level or owner as 299 * the perag is not fully initialised and hence not attached to the 300 * buffer. In this case, check against the maximum tree depth. 301 * 302 * Similarly, during log recovery we will have a perag structure 303 * attached, but the agf information will not yet have been initialised 304 * from the on disk AGF. Again, we can only check against maximum limits 305 * in this case. 306 */ 307 if (!xfs_verify_magic(bp, block->bb_magic)) 308 return __this_address; 309 310 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 311 return __this_address; 312 fa = xfs_btree_sblock_v5hdr_verify(bp); 313 if (fa) 314 return fa; 315 316 level = be16_to_cpu(block->bb_level); 317 if (pag && pag->pagf_init) { 318 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) 319 return __this_address; 320 } else if (level >= mp->m_rmap_maxlevels) 321 return __this_address; 322 323 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); 324 } 325 326 static void 327 xfs_rmapbt_read_verify( 328 struct xfs_buf *bp) 329 { 330 xfs_failaddr_t fa; 331 332 if (!xfs_btree_sblock_verify_crc(bp)) 333 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 334 else { 335 fa = xfs_rmapbt_verify(bp); 336 if (fa) 337 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 338 } 339 340 if (bp->b_error) 341 trace_xfs_btree_corrupt(bp, _RET_IP_); 342 } 343 344 static void 345 xfs_rmapbt_write_verify( 346 struct xfs_buf *bp) 347 { 348 xfs_failaddr_t fa; 349 350 fa = xfs_rmapbt_verify(bp); 351 if (fa) { 352 trace_xfs_btree_corrupt(bp, _RET_IP_); 353 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 354 return; 355 } 356 xfs_btree_sblock_calc_crc(bp); 357 358 } 359 360 const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 361 .name = "xfs_rmapbt", 362 .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, 363 .verify_read = xfs_rmapbt_read_verify, 364 .verify_write = xfs_rmapbt_write_verify, 365 .verify_struct = xfs_rmapbt_verify, 366 }; 367 368 STATIC int 369 xfs_rmapbt_keys_inorder( 370 struct xfs_btree_cur *cur, 371 union xfs_btree_key *k1, 372 union xfs_btree_key *k2) 373 { 374 uint32_t x; 375 uint32_t y; 376 uint64_t a; 377 uint64_t b; 378 379 x = be32_to_cpu(k1->rmap.rm_startblock); 380 y = be32_to_cpu(k2->rmap.rm_startblock); 381 if (x < y) 382 return 1; 383 else if (x > y) 384 return 0; 385 a = be64_to_cpu(k1->rmap.rm_owner); 386 b = be64_to_cpu(k2->rmap.rm_owner); 387 if (a < b) 388 return 1; 389 else if (a > b) 390 return 0; 391 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); 392 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); 393 if (a <= b) 394 return 1; 395 return 0; 396 } 397 398 STATIC int 399 xfs_rmapbt_recs_inorder( 400 struct xfs_btree_cur *cur, 401 union xfs_btree_rec *r1, 402 union xfs_btree_rec *r2) 403 { 404 uint32_t x; 405 uint32_t y; 406 uint64_t a; 407 uint64_t b; 408 409 x = be32_to_cpu(r1->rmap.rm_startblock); 410 y = be32_to_cpu(r2->rmap.rm_startblock); 411 if (x < y) 412 return 1; 413 else if (x > y) 414 return 0; 415 a = be64_to_cpu(r1->rmap.rm_owner); 416 b = be64_to_cpu(r2->rmap.rm_owner); 417 if (a < b) 418 return 1; 419 else if (a > b) 420 return 0; 421 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); 422 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); 423 if (a <= b) 424 return 1; 425 return 0; 426 } 427 428 static const struct xfs_btree_ops xfs_rmapbt_ops = { 429 .rec_len = sizeof(struct xfs_rmap_rec), 430 .key_len = 2 * sizeof(struct xfs_rmap_key), 431 432 .dup_cursor = xfs_rmapbt_dup_cursor, 433 .set_root = xfs_rmapbt_set_root, 434 .alloc_block = xfs_rmapbt_alloc_block, 435 .free_block = xfs_rmapbt_free_block, 436 .get_minrecs = xfs_rmapbt_get_minrecs, 437 .get_maxrecs = xfs_rmapbt_get_maxrecs, 438 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 439 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 440 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 441 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 442 .key_diff = xfs_rmapbt_key_diff, 443 .buf_ops = &xfs_rmapbt_buf_ops, 444 .diff_two_keys = xfs_rmapbt_diff_two_keys, 445 .keys_inorder = xfs_rmapbt_keys_inorder, 446 .recs_inorder = xfs_rmapbt_recs_inorder, 447 }; 448 449 static struct xfs_btree_cur * 450 xfs_rmapbt_init_common( 451 struct xfs_mount *mp, 452 struct xfs_trans *tp, 453 xfs_agnumber_t agno) 454 { 455 struct xfs_btree_cur *cur; 456 457 cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); 458 cur->bc_tp = tp; 459 cur->bc_mp = mp; 460 /* Overlapping btree; 2 keys per pointer. */ 461 cur->bc_btnum = XFS_BTNUM_RMAP; 462 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; 463 cur->bc_blocklog = mp->m_sb.sb_blocklog; 464 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); 465 cur->bc_ag.agno = agno; 466 cur->bc_ops = &xfs_rmapbt_ops; 467 468 return cur; 469 } 470 471 /* Create a new reverse mapping btree cursor. */ 472 struct xfs_btree_cur * 473 xfs_rmapbt_init_cursor( 474 struct xfs_mount *mp, 475 struct xfs_trans *tp, 476 struct xfs_buf *agbp, 477 xfs_agnumber_t agno) 478 { 479 struct xfs_agf *agf = agbp->b_addr; 480 struct xfs_btree_cur *cur; 481 482 cur = xfs_rmapbt_init_common(mp, tp, agno); 483 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); 484 cur->bc_ag.agbp = agbp; 485 return cur; 486 } 487 488 /* Create a new reverse mapping btree cursor with a fake root for staging. */ 489 struct xfs_btree_cur * 490 xfs_rmapbt_stage_cursor( 491 struct xfs_mount *mp, 492 struct xbtree_afakeroot *afake, 493 xfs_agnumber_t agno) 494 { 495 struct xfs_btree_cur *cur; 496 497 cur = xfs_rmapbt_init_common(mp, NULL, agno); 498 xfs_btree_stage_afakeroot(cur, afake); 499 return cur; 500 } 501 502 /* 503 * Install a new reverse mapping btree root. Caller is responsible for 504 * invalidating and freeing the old btree blocks. 505 */ 506 void 507 xfs_rmapbt_commit_staged_btree( 508 struct xfs_btree_cur *cur, 509 struct xfs_trans *tp, 510 struct xfs_buf *agbp) 511 { 512 struct xfs_agf *agf = agbp->b_addr; 513 struct xbtree_afakeroot *afake = cur->bc_ag.afake; 514 515 ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 516 517 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); 518 agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); 519 agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); 520 xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | 521 XFS_AGF_RMAP_BLOCKS); 522 xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); 523 } 524 525 /* 526 * Calculate number of records in an rmap btree block. 527 */ 528 int 529 xfs_rmapbt_maxrecs( 530 int blocklen, 531 int leaf) 532 { 533 blocklen -= XFS_RMAP_BLOCK_LEN; 534 535 if (leaf) 536 return blocklen / sizeof(struct xfs_rmap_rec); 537 return blocklen / 538 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); 539 } 540 541 /* Compute the maximum height of an rmap btree. */ 542 void 543 xfs_rmapbt_compute_maxlevels( 544 struct xfs_mount *mp) 545 { 546 /* 547 * On a non-reflink filesystem, the maximum number of rmap 548 * records is the number of blocks in the AG, hence the max 549 * rmapbt height is log_$maxrecs($agblocks). However, with 550 * reflink each AG block can have up to 2^32 (per the refcount 551 * record format) owners, which means that theoretically we 552 * could face up to 2^64 rmap records. 553 * 554 * That effectively means that the max rmapbt height must be 555 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG 556 * blocks to feed the rmapbt long before the rmapbt reaches 557 * maximum height. The reflink code uses ag_resv_critical to 558 * disallow reflinking when less than 10% of the per-AG metadata 559 * block reservation since the fallback is a regular file copy. 560 */ 561 if (xfs_sb_version_hasreflink(&mp->m_sb)) 562 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; 563 else 564 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 565 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 566 } 567 568 /* Calculate the refcount btree size for some records. */ 569 xfs_extlen_t 570 xfs_rmapbt_calc_size( 571 struct xfs_mount *mp, 572 unsigned long long len) 573 { 574 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 575 } 576 577 /* 578 * Calculate the maximum refcount btree size. 579 */ 580 xfs_extlen_t 581 xfs_rmapbt_max_size( 582 struct xfs_mount *mp, 583 xfs_agblock_t agblocks) 584 { 585 /* Bail out if we're uninitialized, which can happen in mkfs. */ 586 if (mp->m_rmap_mxr[0] == 0) 587 return 0; 588 589 return xfs_rmapbt_calc_size(mp, agblocks); 590 } 591 592 /* 593 * Figure out how many blocks to reserve and how many are used by this btree. 594 */ 595 int 596 xfs_rmapbt_calc_reserves( 597 struct xfs_mount *mp, 598 struct xfs_trans *tp, 599 xfs_agnumber_t agno, 600 xfs_extlen_t *ask, 601 xfs_extlen_t *used) 602 { 603 struct xfs_buf *agbp; 604 struct xfs_agf *agf; 605 xfs_agblock_t agblocks; 606 xfs_extlen_t tree_len; 607 int error; 608 609 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 610 return 0; 611 612 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); 613 if (error) 614 return error; 615 616 agf = agbp->b_addr; 617 agblocks = be32_to_cpu(agf->agf_length); 618 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 619 xfs_trans_brelse(tp, agbp); 620 621 /* 622 * The log is permanently allocated, so the space it occupies will 623 * never be available for the kinds of things that would require btree 624 * expansion. We therefore can pretend the space isn't there. 625 */ 626 if (mp->m_sb.sb_logstart && 627 XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) 628 agblocks -= mp->m_sb.sb_logblocks; 629 630 /* Reserve 1% of the AG or enough for 1 block per record. */ 631 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 632 *used += tree_len; 633 634 return error; 635 } 636