// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */

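/*
 * Illustration of the key ordering (values made up for this example):
 * with reflink, one AG block may appear in several records, and the
 * [agbno, owner, offset] ordering keeps them distinct and sorted:
 *
 *	[agbno 100, owner inode 128,       offset 0]
 *	[agbno 100, owner inode 129,       offset 52]
 *	[agbno 101, owner XFS_RMAP_OWN_AG, offset 0]
 *
 * Ordering by agbno alone could not tell the first two records apart.
 */
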
static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_private.a.agbp, cur->bc_private.a.agno);
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			inc)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
	int			btnum = cur->bc_btnum;
	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);

	ASSERT(ptr->s != 0);

	agf->agf_roots[btnum] = ptr->s;
	be32_add_cpu(&agf->agf_levels[btnum], inc);
	pag->pagf_levels[btnum] += inc;
	xfs_perag_put(pag);

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*start,
	union xfs_btree_ptr	*new,
	int			*stat)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
				       &bno, 1);
	if (error)
		return error;

	trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
			bno, 1);
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
			false);

	xfs_trans_agbtree_delta(cur->bc_tp, 1);
	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
	trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
			bno, 1);
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
			      XFS_EXTENT_BUSY_SKIP_DISCARD);
	xfs_trans_agbtree_delta(cur->bc_tp, -1);

	xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);

	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	uint64_t		off;
	int			adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

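/*
 * Worked example for the high key (illustrative numbers): for a data
 * fork record [startblock 100, blockcount 8, owner inode 131, offset 16],
 * adj is 7, so the high key is [startblock 107, owner 131, offset 23],
 * i.e. the last block and file offset still covered by the mapping.
 * For a bmbt block or a non-inode owner the offset field carries no
 * file position, so only the startblock is adjusted.
 */
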
STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);

	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_roots[cur->bc_btnum];
}

STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key)
{
	struct xfs_rmap_irec	*rec = &cur->bc_rec.r;
	struct xfs_rmap_key	*kp = &key->rmap;
	__u64			x, y;
	int64_t			d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
	y = rec->rm_offset;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	struct xfs_rmap_key	*kp1 = &k1->rmap;
	struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t			d;
	__u64			x, y;

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	x = be64_to_cpu(kp1->rm_owner);
	y = be64_to_cpu(kp2->rm_owner);
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

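/*
 * Note that both comparison helpers above compare only the physical
 * offset bits: XFS_RMAP_OFF() masks away the attr fork, bmbt and
 * unwritten flag bits before the comparison, so (for example) a written
 * and an unwritten mapping with identical [startblock, owner, offset]
 * would compare equal. The flags describe the state of a mapping, not
 * its position in the keyspace.
 */
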
static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum
	 * limits in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return __this_address;
	fa = xfs_btree_sblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && pag->pagf_init) {
		if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_sblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_sblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*r1,
	union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

static const struct xfs_btree_ops xfs_rmapbt_ops = {
	.rec_len		= sizeof(struct xfs_rmap_rec),
	.key_len		= 2 * sizeof(struct xfs_rmap_key),

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
};

/*
 * Allocate a new reverse mapping btree cursor.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agnumber_t		agno)
{
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	struct xfs_btree_cur	*cur;

	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
	cur->bc_tp = tp;
	cur->bc_mp = mp;
	/* Overlapping btree; 2 keys per pointer. */
	cur->bc_btnum = XFS_BTNUM_RMAP;
	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
	cur->bc_blocklog = mp->m_sb.sb_blocklog;
	cur->bc_ops = &xfs_rmapbt_ops;
	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);

	cur->bc_private.a.agbp = agbp;
	cur->bc_private.a.agno = agno;

	return cur;
}

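/*
 * Typical usage, as a rough sketch (error handling trimmed; see the
 * callers in xfs_rmap.c for the real thing): read the AGF, build the
 * cursor, do the btree work through the generic xfs_btree code, then
 * tear the cursor down:
 *
 *	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
 *	if (error)
 *		return error;
 *	cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
 *	error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
 *			&stat);
 *	xfs_btree_del_cursor(cur, error);
 */
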
/*
 * Calculate number of records in an rmap btree block.
 */
int
xfs_rmapbt_maxrecs(
	int			blocklen,
	int			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;

	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	/*
	 * On a non-reflink filesystem, the maximum number of rmap
	 * records is the number of blocks in the AG, hence the max
	 * rmapbt height is log_$maxrecs($agblocks). However, with
	 * reflink each AG block can have up to 2^32 (per the refcount
	 * record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records.
	 *
	 * That effectively means that the max rmapbt height must be
	 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
	 * blocks to feed the rmapbt long before the rmapbt reaches
	 * maximum height. The reflink code uses ag_resv_critical to
	 * disallow reflinking when less than 10% of the per-AG metadata
	 * block reservation is free, since the fallback is a regular
	 * file copy.
	 */
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
	else
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
}

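/*
 * Worked example with made-up geometry: if a leaf must hold at least
 * mnr[0] = 84 records and a node at least mnr[1] = 45 keys, then
 * sb_agblocks = 1000000 records need ceil(1000000/84) = 11905 leaf
 * blocks, then ceil(11905/45) = 265 node blocks, then 6, then 1; one
 * leaf level plus three node levels gives a maximum height of 4. This
 * is exactly the repeated division xfs_btree_compute_maxlevels() does.
 */
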
/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return 0;

	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
	if (error)
		return error;

	agf = XFS_BUF_TO_AGF(agbp);
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion. We therefore can pretend the space isn't there.
	 */
	if (mp->m_sb.sb_logstart &&
	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
		agblocks -= mp->m_sb.sb_logblocks;

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}
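
/*
 * Sketch of how the numbers above are consumed (the per-AG reservation
 * code in xfs_ag_resv.c is the caller): "ask" feeds the AG metadata
 * reservation so that roughly "ask - used" blocks are withheld from
 * ordinary allocation, while "used" accounts for blocks the existing
 * rmapbt already occupies.
 */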