// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code.  The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs.  Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code.  Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type.  When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Don't allow staging cursors to be duplicated because they're supposed to be
 * kept private to a single thread.
 */
STATIC struct xfs_btree_cur *
xfs_btree_fakeroot_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	ASSERT(0);
	return NULL;
}

/*
 * Don't allow block allocation for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 *
 * Bulk loading uses a separate callback to obtain new blocks from a
 * preallocated list, which prevents ENOSPC failures during loading.
 */
STATIC int
xfs_btree_fakeroot_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start_bno,
	union xfs_btree_ptr		*new_bno,
	int				*stat)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/*
 * Don't allow block freeing for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 */
STATIC int
xfs_btree_fakeroot_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/* Initialize a pointer to the root block from the fakeroot. */
STATIC void
xfs_btree_fakeroot_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xbtree_afakeroot	*afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	afake = cur->bc_ag.afake;
	ptr->s = cpu_to_be32(afake->af_root);
}

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
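
/*
 * For illustration only: a sketch of how a hypothetical AG btree type "foo"
 * might wire up the staging and commit steps described above.  Only the
 * xfs_btree_stage_afakeroot() and xfs_btree_commit_afakeroot() calls are
 * provided by this file; the xfs_foobt_* helpers and agf_foo_* fields are
 * assumptions made for the example.
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake,
 *		xfs_agnumber_t		agno)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_common(mp, NULL, agno);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		struct xfs_agf		*agf = agbp->b_addr;
 *		struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
 *
 *		// Log the new root location and height into the AG header,
 *		// then swap the staging cursor back to regular operation.
 *		agf->agf_foo_root = cpu_to_be32(afake->af_root);
 *		agf->agf_foo_level = cpu_to_be32(afake->af_levels);
 *		xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 *		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_foobt_ops);
 *	}
 */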

/* Update the btree root information for a per-AG fake root. */
STATIC void
xfs_btree_afakeroot_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	afake->af_root = be32_to_cpu(ptr->s);
	afake->af_levels += inc;
}

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 * The btree cursor's bc_ops will be overridden as needed to make the staging
 * functionality work.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	struct xfs_btree_ops		*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->set_root = xfs_btree_afakeroot_set_root;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ag.agbp = agbp;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor.  This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
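
/*
 * For illustration only: a sketch of setting up the fake root fields listed
 * above before staging a hypothetical inode-rooted btree cursor.  The
 * xfs_foobt_stage_cursor() helper is an assumption made for the example, and
 * the fork-size macro name may differ by kernel version; the xbtree_ifakeroot
 * fields and XFS_DINODE_FMT_BTREE are real.
 *
 *	struct xbtree_ifakeroot	ifake = { };
 *
 *	// A freshly allocated incore fork, the space available to the fork
 *	// in the on-disk inode, and the fork format.
 *	ifake.if_fork = kmem_zalloc(sizeof(struct xfs_ifork), KM_NOFS);
 *	ifake.if_fork_size = XFS_IFORK_SIZE(ip, whichfork);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_foobt_stage_cursor(mp, ip, &ifake, whichfork);
 */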

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.  The btree cursor's bc_ops will be overridden as needed to make the
 * staging functionality work.  If new_ops is not NULL, these new ops will be
 * passed out to the caller for further overriding.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake,
	struct xfs_btree_ops		**new_ops)
{
	struct xfs_btree_ops		*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;

	if (new_ops)
		*new_ops = nops;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block).  When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor.  Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree.  See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block().  Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor.  The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure.  For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree.  For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs.  The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division.  If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
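
/*
 * For illustration only: a hedged sketch of the six steps above for a
 * hypothetical AG btree type "foo".  The struct xfs_btree_bload fields and
 * the xfs_btree_bload_compute_geometry()/xfs_btree_bload() calls are the real
 * interface; the xfs_foobt_* helpers and the block reservation step are
 * assumptions standing in for caller-specific code.
 *
 *	struct xbtree_afakeroot	afake = { };
 *	struct xfs_btree_bload	bload = {
 *		.get_record	= xfs_foobt_get_record,
 *		.claim_block	= xfs_foobt_claim_block,
 *		.leaf_slack	= -1,	// let the loader pick ~75% fill
 *		.node_slack	= -1,
 *	};
 *	struct xfs_btree_cur	*cur;
 *	int			error;
 *
 *	// Steps 1-3: stage a cursor and compute the new btree's geometry.
 *	cur = xfs_foobt_stage_cursor(mp, &afake, agno);
 *	error = xfs_btree_bload_compute_geometry(cur, &bload, nr_records);
 *	if (error)
 *		goto out;
 *
 *	// Step 4: preallocate bload.nr_blocks blocks for ->claim_block().
 *	error = xfs_foobt_reserve_blocks(mp, bload.nr_blocks);
 *	if (error)
 *		goto out;
 *
 *	// Steps 5-6: load the records, then commit the new root.
 *	error = xfs_btree_bload(cur, &bload, priv);
 *	if (error)
 *		goto out;
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);
 */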

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 */
static void
xfs_btree_bload_drop_buf(
	struct list_head	*buffers_list,
	struct xfs_buf		**bpp)
{
	if (*bpp == NULL)
		return;

	xfs_buf_delwri_queue_here(*bpp, buffers_list);
	xfs_buf_relse(*bpp);
	*bpp = NULL;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, nr_this_block, priv);
		ifp->if_broot = kmem_zalloc(new_size, 0);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
				nr_this_block, cur->bc_ino.ip->i_ino,
				cur->bc_flags);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
	xfs_btree_bload_drop_buf(buffers_list, bpp);

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_record_fn	get_record,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j;
	int				ret;

	/* Fill the leaf block with records. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_rec	*block_rec;

		ret = get_record(cur, priv);
		if (ret)
			return ret;
		block_rec = xfs_btree_rec_addr(cur, j, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return 0;
}

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.  A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr.  On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree.  Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb.  The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level.  For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks needed to hold nr_this_level items if
	 * each block is filled with the desired number of records/keyptrs per
	 * block.  Because desired_npb could be minrecs, we use regular integer
	 * division (which rounds the block count down) so that in the next
	 * step the effective # of items per block will never be less than
	 * desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks.  Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
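
/*
 * Worked example of the geometry calculations above, with illustrative
 * numbers only: suppose a level must hold nr_this_level = 1000 items,
 * maxrecs = 100, minrecs = 50, and the slack value was left negative.
 * xfs_btree_bload_ensure_slack() picks slack = 100 - ((100 + 50) >> 1) = 25,
 * so desired_npb = max(50, 100 - 25) = 75.  Then blocks = 1000 / 75 = 13,
 * and npb = 1000 / 13 = 76 with blocks_with_extra = 12, so the twelve
 * leftmost blocks receive 77 items and the last one receives 76, giving a
 * roughly 77% fill instead of packing every block completely full.
 */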

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 * cursor.  This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks.  Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1.  Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root.  Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases.  blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;
	xfs_btree_bload_drop_buf(&buffers_list, &bp);

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;
		xfs_btree_bload_drop_buf(&buffers_list, &bp);
		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}