xref: /openbmc/linux/fs/jfs/jfs_xtree.c (revision 3cea11cd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Copyright (C) International Business Machines Corp., 2000-2005
4  */
5 /*
6  *	jfs_xtree.c: extent allocation descriptor B+-tree manager
7  */
8 
9 #include <linux/fs.h>
10 #include <linux/module.h>
11 #include <linux/quotaops.h>
12 #include <linux/seq_file.h>
13 #include "jfs_incore.h"
14 #include "jfs_filsys.h"
15 #include "jfs_metapage.h"
16 #include "jfs_dmap.h"
17 #include "jfs_dinode.h"
18 #include "jfs_superblock.h"
19 #include "jfs_debug.h"
20 
21 /*
22  * xtree local flag
23  */
24 #define XT_INSERT	0x00000001
25 
26 /*
27  *	xtree key/entry comparison: extent offset
28  *
29  * return:
30  *	-1: k < start of extent
31  *	 0: start_of_extent <= k <= end_of_extent
32  *	 1: k > end_of_extent
33  */
34 #define XT_CMP(CMP, K, X, OFFSET64)\
35 {\
36 	OFFSET64 = offsetXAD(X);\
37 	(CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
38 		((K) < OFFSET64) ? -1 : 0;\
39 }
40 
41 /* write a xad entry */
42 #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
43 {\
44 	(XAD)->flag = (FLAG);\
45 	XADoffset((XAD), (OFF));\
46 	XADlength((XAD), (LEN));\
47 	XADaddress((XAD), (ADDR));\
48 }
49 
50 #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
51 
52 /* get page buffer for specified block address */
53 /* ToDo: Replace this ugly macro with a function */
54 #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
55 do {									\
56 	BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot);	\
57 	if (!(RC)) {							\
58 		if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
59 		    (le16_to_cpu((P)->header.nextindex) >		\
60 		     le16_to_cpu((P)->header.maxentry)) ||		\
61 		    (le16_to_cpu((P)->header.maxentry) >		\
62 		     (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
63 			jfs_error((IP)->i_sb,				\
64 				  "XT_GETPAGE: xtree page corrupt\n");	\
65 			BT_PUTPAGE(MP);					\
66 			MP = NULL;					\
67 			RC = -EIO;					\
68 		}							\
69 	}								\
70 } while (0)
71 
72 /* for consistency */
73 #define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
74 
75 #define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
76 	BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
77 /* xtree entry parameter descriptor */
78 struct xtsplit {
79 	struct metapage *mp;
80 	s16 index;
81 	u8 flag;
82 	s64 off;
83 	s64 addr;
84 	int len;
85 	struct pxdlist *pxdlist;
86 };
87 
88 
89 /*
90  *	statistics
91  */
92 #ifdef CONFIG_JFS_STATISTICS
93 static struct {
94 	uint search;
95 	uint fastSearch;
96 	uint split;
97 } xtStat;
98 #endif
99 
100 
101 /*
102  * forward references
103  */
104 static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp,
105 		    struct btstack * btstack, int flag);
106 
107 static int xtSplitUp(tid_t tid,
108 		     struct inode *ip,
109 		     struct xtsplit * split, struct btstack * btstack);
110 
111 static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
112 		       struct metapage ** rmpp, s64 * rbnp);
113 
114 static int xtSplitRoot(tid_t tid, struct inode *ip,
115 		       struct xtsplit * split, struct metapage ** rmpp);
116 
117 #ifdef _STILL_TO_PORT
118 static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
119 		      xtpage_t * fp, struct btstack * btstack);
120 
121 static int xtSearchNode(struct inode *ip,
122 			xad_t * xad,
123 			int *cmpp, struct btstack * btstack, int flag);
124 
125 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
126 #endif				/*  _STILL_TO_PORT */
127 
128 /*
129  *	xtLookup()
130  *
131  * function: map a single page into a physical extent;
132  */
133 int xtLookup(struct inode *ip, s64 lstart,
134 	     s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
135 {
136 	int rc = 0;
137 	struct btstack btstack;
138 	int cmp;
139 	s64 bn;
140 	struct metapage *mp;
141 	xtpage_t *p;
142 	int index;
143 	xad_t *xad;
144 	s64 next, size, xoff, xend;
145 	int xlen;
146 	s64 xaddr;
147 
148 	*paddr = 0;
149 	*plen = llen;
150 
151 	if (!no_check) {
152 		/* is lookup offset beyond eof ? */
153 		size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
154 		    JFS_SBI(ip->i_sb)->l2bsize;
155 		if (lstart >= size)
156 			return 0;
157 	}
158 
159 	/*
160 	 * search for the xad entry covering the logical extent
161 	 */
162 //search:
163 	if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) {
164 		jfs_err("xtLookup: xtSearch returned %d", rc);
165 		return rc;
166 	}
167 
168 	/*
169 	 *	compute the physical extent covering logical extent
170 	 *
171 	 * N.B. search may have failed (e.g., hole in sparse file),
172 	 * and returned the index of the next entry.
173 	 */
174 	/* retrieve search result */
175 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
176 
177 	/* is xad found covering start of logical extent ?
178 	 * lstart is a page start address,
179 	 * i.e., lstart cannot start in a hole;
180 	 */
181 	if (cmp) {
182 		if (next)
183 			*plen = min(next - lstart, llen);
184 		goto out;
185 	}
186 
187 	/*
188 	 * lxd covered by xad
189 	 */
190 	xad = &p->xad[index];
191 	xoff = offsetXAD(xad);
192 	xlen = lengthXAD(xad);
193 	xend = xoff + xlen;
194 	xaddr = addressXAD(xad);
195 
196 	/* initialize new pxd */
197 	*pflag = xad->flag;
198 	*paddr = xaddr + (lstart - xoff);
199 	/* a page must be fully covered by an xad */
200 	*plen = min(xend - lstart, llen);
201 
202       out:
203 	XT_PUTPAGE(mp);
204 
205 	return rc;
206 }
207 
208 /*
209  *	xtSearch()
210  *
211  * function:	search for the xad entry covering specified offset.
212  *
213  * parameters:
214  *	ip	- file object;
215  *	xoff	- extent offset;
216  *	nextp	- address of next extent (if any) for search miss
217  *	cmpp	- comparison result:
218  *	btstack - traverse stack;
219  *	flag	- search process flag (XT_INSERT);
220  *
221  * returns:
222  *	btstack contains (bn, index) of search path traversed to the entry.
223  *	*cmpp is set to result of comparison with the entry returned.
224  *	the page containing the entry is pinned at exit.
225  */
226 static int xtSearch(struct inode *ip, s64 xoff,	s64 *nextp,
227 		    int *cmpp, struct btstack * btstack, int flag)
228 {
229 	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
230 	int rc = 0;
231 	int cmp = 1;		/* init for empty page */
232 	s64 bn;			/* block number */
233 	struct metapage *mp;	/* page buffer */
234 	xtpage_t *p;		/* page */
235 	xad_t *xad;
236 	int base, index, lim, btindex;
237 	struct btframe *btsp;
238 	int nsplit = 0;		/* number of pages to split */
239 	s64 t64;
240 	s64 next = 0;
241 
242 	INCREMENT(xtStat.search);
243 
244 	BT_CLR(btstack);
245 
246 	btstack->nsplit = 0;
247 
248 	/*
249 	 *	search down tree from root:
250 	 *
251 	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
252 	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
253 	 *
254 	 * if entry with search key K is not found
255 	 * internal page search find the entry with largest key Ki
256 	 * less than K which point to the child page to search;
257 	 * leaf page search find the entry with smallest key Kj
258 	 * greater than K so that the returned index is the position of
259 	 * the entry to be shifted right for insertion of new entry.
260 	 * for empty tree, search key is greater than any key of the tree.
261 	 *
262 	 * by convention, root bn = 0.
263 	 */
264 	for (bn = 0;;) {
265 		/* get/pin the page to search */
266 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
267 		if (rc)
268 			return rc;
269 
270 		/* try sequential access heuristics with the previous
271 		 * access entry in target leaf page:
272 		 * once search narrowed down into the target leaf,
273 		 * key must either match an entry in the leaf or
274 		 * key entry does not exist in the tree;
275 		 */
276 //fastSearch:
277 		if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
278 		    (p->header.flag & BT_LEAF) &&
279 		    (index = jfs_ip->btindex) <
280 		    le16_to_cpu(p->header.nextindex)) {
281 			xad = &p->xad[index];
282 			t64 = offsetXAD(xad);
283 			if (xoff < t64 + lengthXAD(xad)) {
284 				if (xoff >= t64) {
285 					*cmpp = 0;
286 					goto out;
287 				}
288 
289 				/* stop sequential access heuristics */
290 				goto binarySearch;
291 			} else {	/* (t64 + lengthXAD(xad)) <= xoff */
292 
293 				/* try next sequential entry */
294 				index++;
295 				if (index <
296 				    le16_to_cpu(p->header.nextindex)) {
297 					xad++;
298 					t64 = offsetXAD(xad);
299 					if (xoff < t64 + lengthXAD(xad)) {
300 						if (xoff >= t64) {
301 							*cmpp = 0;
302 							goto out;
303 						}
304 
305 						/* miss: key falls between
306 						 * previous and this entry
307 						 */
308 						*cmpp = 1;
309 						next = t64;
310 						goto out;
311 					}
312 
313 					/* (xoff >= t64 + lengthXAD(xad));
314 					 * matching entry may be further out:
315 					 * stop heuristic search
316 					 */
317 					/* stop sequential access heuristics */
318 					goto binarySearch;
319 				}
320 
321 				/* (index == p->header.nextindex);
322 				 * miss: key entry does not exist in
323 				 * the target leaf/tree
324 				 */
325 				*cmpp = 1;
326 				goto out;
327 			}
328 
329 			/*
330 			 * if hit, return index of the entry found, and
331 			 * if miss, where new entry with search key is
332 			 * to be inserted;
333 			 */
334 		      out:
335 			/* compute number of pages to split */
336 			if (flag & XT_INSERT) {
337 				if (p->header.nextindex ==	/* little-endian */
338 				    p->header.maxentry)
339 					nsplit++;
340 				else
341 					nsplit = 0;
342 				btstack->nsplit = nsplit;
343 			}
344 
345 			/* save search result */
346 			btsp = btstack->top;
347 			btsp->bn = bn;
348 			btsp->index = index;
349 			btsp->mp = mp;
350 
351 			/* update sequential access heuristics */
352 			jfs_ip->btindex = index;
353 
354 			if (nextp)
355 				*nextp = next;
356 
357 			INCREMENT(xtStat.fastSearch);
358 			return 0;
359 		}
360 
361 		/* well, ... full search now */
362 	      binarySearch:
363 		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
364 
365 		/*
366 		 * binary search with search key K on the current page
367 		 */
368 		for (base = XTENTRYSTART; lim; lim >>= 1) {
369 			index = base + (lim >> 1);
370 
371 			XT_CMP(cmp, xoff, &p->xad[index], t64);
372 			if (cmp == 0) {
373 				/*
374 				 *	search hit
375 				 */
376 				/* search hit - leaf page:
377 				 * return the entry found
378 				 */
379 				if (p->header.flag & BT_LEAF) {
380 					*cmpp = cmp;
381 
382 					/* compute number of pages to split */
383 					if (flag & XT_INSERT) {
384 						if (p->header.nextindex ==
385 						    p->header.maxentry)
386 							nsplit++;
387 						else
388 							nsplit = 0;
389 						btstack->nsplit = nsplit;
390 					}
391 
392 					/* save search result */
393 					btsp = btstack->top;
394 					btsp->bn = bn;
395 					btsp->index = index;
396 					btsp->mp = mp;
397 
398 					/* init sequential access heuristics */
399 					btindex = jfs_ip->btindex;
400 					if (index == btindex ||
401 					    index == btindex + 1)
402 						jfs_ip->btorder = BT_SEQUENTIAL;
403 					else
404 						jfs_ip->btorder = BT_RANDOM;
405 					jfs_ip->btindex = index;
406 
407 					return 0;
408 				}
409 				/* search hit - internal page:
410 				 * descend/search its child page
411 				 */
412 				if (index < le16_to_cpu(p->header.nextindex)-1)
413 					next = offsetXAD(&p->xad[index + 1]);
414 				goto next;
415 			}
416 
417 			if (cmp > 0) {
418 				base = index + 1;
419 				--lim;
420 			}
421 		}
422 
423 		/*
424 		 *	search miss
425 		 *
426 		 * base is the smallest index with key (Kj) greater than
427 		 * search key (K) and may be zero or maxentry index.
428 		 */
429 		if (base < le16_to_cpu(p->header.nextindex))
430 			next = offsetXAD(&p->xad[base]);
431 		/*
432 		 * search miss - leaf page:
433 		 *
434 		 * return location of entry (base) where new entry with
435 		 * search key K is to be inserted.
436 		 */
437 		if (p->header.flag & BT_LEAF) {
438 			*cmpp = cmp;
439 
440 			/* compute number of pages to split */
441 			if (flag & XT_INSERT) {
442 				if (p->header.nextindex ==
443 				    p->header.maxentry)
444 					nsplit++;
445 				else
446 					nsplit = 0;
447 				btstack->nsplit = nsplit;
448 			}
449 
450 			/* save search result */
451 			btsp = btstack->top;
452 			btsp->bn = bn;
453 			btsp->index = base;
454 			btsp->mp = mp;
455 
456 			/* init sequential access heuristics */
457 			btindex = jfs_ip->btindex;
458 			if (base == btindex || base == btindex + 1)
459 				jfs_ip->btorder = BT_SEQUENTIAL;
460 			else
461 				jfs_ip->btorder = BT_RANDOM;
462 			jfs_ip->btindex = base;
463 
464 			if (nextp)
465 				*nextp = next;
466 
467 			return 0;
468 		}
469 
470 		/*
471 		 * search miss - non-leaf page:
472 		 *
473 		 * if base is non-zero, decrement base by one to get the parent
474 		 * entry of the child page to search.
475 		 */
476 		index = base ? base - 1 : base;
477 
478 		/*
479 		 * go down to child page
480 		 */
481 	      next:
482 		/* update number of pages to split */
483 		if (p->header.nextindex == p->header.maxentry)
484 			nsplit++;
485 		else
486 			nsplit = 0;
487 
488 		/* push (bn, index) of the parent page/entry */
489 		if (BT_STACK_FULL(btstack)) {
490 			jfs_error(ip->i_sb, "stack overrun!\n");
491 			XT_PUTPAGE(mp);
492 			return -EIO;
493 		}
494 		BT_PUSH(btstack, bn, index);
495 
496 		/* get the child page block number */
497 		bn = addressXAD(&p->xad[index]);
498 
499 		/* unpin the parent page */
500 		XT_PUTPAGE(mp);
501 	}
502 }
503 
504 /*
505  *	xtInsert()
506  *
507  * function:
508  *
509  * parameter:
510  *	tid	- transaction id;
511  *	ip	- file object;
512  *	xflag	- extent flag (XAD_NOTRECORDED):
513  *	xoff	- extent offset;
514  *	xlen	- extent length;
515  *	xaddrp	- extent address pointer (in/out):
516  *		if (*xaddrp)
517  *			caller allocated data extent at *xaddrp;
518  *		else
519  *			allocate data extent and return its xaddr;
520  *	flag	-
521  *
522  * return:
523  */
524 int xtInsert(tid_t tid,		/* transaction id */
525 	     struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
526 	     int flag)
527 {
528 	int rc = 0;
529 	s64 xaddr, hint;
530 	struct metapage *mp;	/* meta-page buffer */
531 	xtpage_t *p;		/* base B+-tree index page */
532 	s64 bn;
533 	int index, nextindex;
534 	struct btstack btstack;	/* traverse stack */
535 	struct xtsplit split;	/* split information */
536 	xad_t *xad;
537 	int cmp;
538 	s64 next;
539 	struct tlock *tlck;
540 	struct xtlock *xtlck;
541 
542 	jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
543 
544 	/*
545 	 *	search for the entry location at which to insert:
546 	 *
547 	 * xtFastSearch() and xtSearch() both returns (leaf page
548 	 * pinned, index at which to insert).
549 	 * n.b. xtSearch() may return index of maxentry of
550 	 * the full page.
551 	 */
552 	if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
553 		return rc;
554 
555 	/* retrieve search result */
556 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
557 
558 	/* This test must follow XT_GETSEARCH since mp must be valid if
559 	 * we branch to out: */
560 	if ((cmp == 0) || (next && (xlen > next - xoff))) {
561 		rc = -EEXIST;
562 		goto out;
563 	}
564 
565 	/*
566 	 * allocate data extent requested
567 	 *
568 	 * allocation hint: last xad
569 	 */
570 	if ((xaddr = *xaddrp) == 0) {
571 		if (index > XTENTRYSTART) {
572 			xad = &p->xad[index - 1];
573 			hint = addressXAD(xad) + lengthXAD(xad) - 1;
574 		} else
575 			hint = 0;
576 		if ((rc = dquot_alloc_block(ip, xlen)))
577 			goto out;
578 		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
579 			dquot_free_block(ip, xlen);
580 			goto out;
581 		}
582 	}
583 
584 	/*
585 	 *	insert entry for new extent
586 	 */
587 	xflag |= XAD_NEW;
588 
589 	/*
590 	 *	if the leaf page is full, split the page and
591 	 *	propagate up the router entry for the new page from split
592 	 *
593 	 * The xtSplitUp() will insert the entry and unpin the leaf page.
594 	 */
595 	nextindex = le16_to_cpu(p->header.nextindex);
596 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
597 		split.mp = mp;
598 		split.index = index;
599 		split.flag = xflag;
600 		split.off = xoff;
601 		split.len = xlen;
602 		split.addr = xaddr;
603 		split.pxdlist = NULL;
604 		if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
605 			/* undo data extent allocation */
606 			if (*xaddrp == 0) {
607 				dbFree(ip, xaddr, (s64) xlen);
608 				dquot_free_block(ip, xlen);
609 			}
610 			return rc;
611 		}
612 
613 		*xaddrp = xaddr;
614 		return 0;
615 	}
616 
617 	/*
618 	 *	insert the new entry into the leaf page
619 	 */
620 	/*
621 	 * acquire a transaction lock on the leaf page;
622 	 *
623 	 * action: xad insertion/extension;
624 	 */
625 	BT_MARK_DIRTY(mp, ip);
626 
627 	/* if insert into middle, shift right remaining entries. */
628 	if (index < nextindex)
629 		memmove(&p->xad[index + 1], &p->xad[index],
630 			(nextindex - index) * sizeof(xad_t));
631 
632 	/* insert the new entry: mark the entry NEW */
633 	xad = &p->xad[index];
634 	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
635 
636 	/* advance next available entry index */
637 	le16_add_cpu(&p->header.nextindex, 1);
638 
639 	/* Don't log it if there are no links to the file */
640 	if (!test_cflag(COMMIT_Nolink, ip)) {
641 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
642 		xtlck = (struct xtlock *) & tlck->lock;
643 		xtlck->lwm.offset =
644 		    (xtlck->lwm.offset) ? min(index,
645 					      (int)xtlck->lwm.offset) : index;
646 		xtlck->lwm.length =
647 		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
648 	}
649 
650 	*xaddrp = xaddr;
651 
652       out:
653 	/* unpin the leaf page */
654 	XT_PUTPAGE(mp);
655 
656 	return rc;
657 }
658 
659 
660 /*
661  *	xtSplitUp()
662  *
663  * function:
664  *	split full pages as propagating insertion up the tree
665  *
666  * parameter:
667  *	tid	- transaction id;
668  *	ip	- file object;
669  *	split	- entry parameter descriptor;
670  *	btstack - traverse stack from xtSearch()
671  *
672  * return:
673  */
674 static int
675 xtSplitUp(tid_t tid,
676 	  struct inode *ip, struct xtsplit * split, struct btstack * btstack)
677 {
678 	int rc = 0;
679 	struct metapage *smp;
680 	xtpage_t *sp;		/* split page */
681 	struct metapage *rmp;
682 	s64 rbn;		/* new right page block number */
683 	struct metapage *rcmp;
684 	xtpage_t *rcp;		/* right child page */
685 	s64 rcbn;		/* right child page block number */
686 	int skip;		/* index of entry of insertion */
687 	int nextindex;		/* next available entry index of p */
688 	struct btframe *parent;	/* parent page entry on traverse stack */
689 	xad_t *xad;
690 	s64 xaddr;
691 	int xlen;
692 	int nsplit;		/* number of pages split */
693 	struct pxdlist pxdlist;
694 	pxd_t *pxd;
695 	struct tlock *tlck;
696 	struct xtlock *xtlck;
697 
698 	smp = split->mp;
699 	sp = XT_PAGE(ip, smp);
700 
701 	/* is inode xtree root extension/inline EA area free ? */
702 	if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
703 	    (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
704 	    (JFS_IP(ip)->mode2 & INLINEEA)) {
705 		sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
706 		JFS_IP(ip)->mode2 &= ~INLINEEA;
707 
708 		BT_MARK_DIRTY(smp, ip);
709 		/*
710 		 * acquire a transaction lock on the leaf page;
711 		 *
712 		 * action: xad insertion/extension;
713 		 */
714 
715 		/* if insert into middle, shift right remaining entries. */
716 		skip = split->index;
717 		nextindex = le16_to_cpu(sp->header.nextindex);
718 		if (skip < nextindex)
719 			memmove(&sp->xad[skip + 1], &sp->xad[skip],
720 				(nextindex - skip) * sizeof(xad_t));
721 
722 		/* insert the new entry: mark the entry NEW */
723 		xad = &sp->xad[skip];
724 		XT_PUTENTRY(xad, split->flag, split->off, split->len,
725 			    split->addr);
726 
727 		/* advance next available entry index */
728 		le16_add_cpu(&sp->header.nextindex, 1);
729 
730 		/* Don't log it if there are no links to the file */
731 		if (!test_cflag(COMMIT_Nolink, ip)) {
732 			tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
733 			xtlck = (struct xtlock *) & tlck->lock;
734 			xtlck->lwm.offset = (xtlck->lwm.offset) ?
735 			    min(skip, (int)xtlck->lwm.offset) : skip;
736 			xtlck->lwm.length =
737 			    le16_to_cpu(sp->header.nextindex) -
738 			    xtlck->lwm.offset;
739 		}
740 
741 		return 0;
742 	}
743 
744 	/*
745 	 * allocate new index blocks to cover index page split(s)
746 	 *
747 	 * allocation hint: ?
748 	 */
749 	if (split->pxdlist == NULL) {
750 		nsplit = btstack->nsplit;
751 		split->pxdlist = &pxdlist;
752 		pxdlist.maxnpxd = pxdlist.npxd = 0;
753 		pxd = &pxdlist.pxd[0];
754 		xlen = JFS_SBI(ip->i_sb)->nbperpage;
755 		for (; nsplit > 0; nsplit--, pxd++) {
756 			if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
757 			    == 0) {
758 				PXDaddress(pxd, xaddr);
759 				PXDlength(pxd, xlen);
760 
761 				pxdlist.maxnpxd++;
762 
763 				continue;
764 			}
765 
766 			/* undo allocation */
767 
768 			XT_PUTPAGE(smp);
769 			return rc;
770 		}
771 	}
772 
773 	/*
774 	 * Split leaf page <sp> into <sp> and a new right page <rp>.
775 	 *
776 	 * The split routines insert the new entry into the leaf page,
777 	 * and acquire txLock as appropriate.
778 	 * return <rp> pinned and its block number <rpbn>.
779 	 */
780 	rc = (sp->header.flag & BT_ROOT) ?
781 	    xtSplitRoot(tid, ip, split, &rmp) :
782 	    xtSplitPage(tid, ip, split, &rmp, &rbn);
783 
784 	XT_PUTPAGE(smp);
785 
786 	if (rc)
787 		return -EIO;
788 	/*
789 	 * propagate up the router entry for the leaf page just split
790 	 *
791 	 * insert a router entry for the new page into the parent page,
792 	 * propagate the insert/split up the tree by walking back the stack
793 	 * of (bn of parent page, index of child page entry in parent page)
794 	 * that were traversed during the search for the page that split.
795 	 *
796 	 * the propagation of insert/split up the tree stops if the root
797 	 * splits or the page inserted into doesn't have to split to hold
798 	 * the new entry.
799 	 *
800 	 * the parent entry for the split page remains the same, and
801 	 * a new entry is inserted at its right with the first key and
802 	 * block number of the new right page.
803 	 *
804 	 * There are a maximum of 3 pages pinned at any time:
805 	 * right child, left parent and right parent (when the parent splits)
806 	 * to keep the child page pinned while working on the parent.
807 	 * make sure that all pins are released at exit.
808 	 */
809 	while ((parent = BT_POP(btstack)) != NULL) {
810 		/* parent page specified by stack frame <parent> */
811 
812 		/* keep current child pages <rcp> pinned */
813 		rcmp = rmp;
814 		rcbn = rbn;
815 		rcp = XT_PAGE(ip, rcmp);
816 
817 		/*
818 		 * insert router entry in parent for new right child page <rp>
819 		 */
820 		/* get/pin the parent page <sp> */
821 		XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
822 		if (rc) {
823 			XT_PUTPAGE(rcmp);
824 			return rc;
825 		}
826 
827 		/*
828 		 * The new key entry goes ONE AFTER the index of parent entry,
829 		 * because the split was to the right.
830 		 */
831 		skip = parent->index + 1;
832 
833 		/*
834 		 * split or shift right remaining entries of the parent page
835 		 */
836 		nextindex = le16_to_cpu(sp->header.nextindex);
837 		/*
838 		 * parent page is full - split the parent page
839 		 */
840 		if (nextindex == le16_to_cpu(sp->header.maxentry)) {
841 			/* init for parent page split */
842 			split->mp = smp;
843 			split->index = skip;	/* index at insert */
844 			split->flag = XAD_NEW;
845 			split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
846 			split->len = JFS_SBI(ip->i_sb)->nbperpage;
847 			split->addr = rcbn;
848 
849 			/* unpin previous right child page */
850 			XT_PUTPAGE(rcmp);
851 
852 			/* The split routines insert the new entry,
853 			 * and acquire txLock as appropriate.
854 			 * return <rp> pinned and its block number <rpbn>.
855 			 */
856 			rc = (sp->header.flag & BT_ROOT) ?
857 			    xtSplitRoot(tid, ip, split, &rmp) :
858 			    xtSplitPage(tid, ip, split, &rmp, &rbn);
859 			if (rc) {
860 				XT_PUTPAGE(smp);
861 				return rc;
862 			}
863 
864 			XT_PUTPAGE(smp);
865 			/* keep new child page <rp> pinned */
866 		}
867 		/*
868 		 * parent page is not full - insert in parent page
869 		 */
870 		else {
871 			/*
872 			 * insert router entry in parent for the right child
873 			 * page from the first entry of the right child page:
874 			 */
875 			/*
876 			 * acquire a transaction lock on the parent page;
877 			 *
878 			 * action: router xad insertion;
879 			 */
880 			BT_MARK_DIRTY(smp, ip);
881 
882 			/*
883 			 * if insert into middle, shift right remaining entries
884 			 */
885 			if (skip < nextindex)
886 				memmove(&sp->xad[skip + 1], &sp->xad[skip],
887 					(nextindex -
888 					 skip) << L2XTSLOTSIZE);
889 
890 			/* insert the router entry */
891 			xad = &sp->xad[skip];
892 			XT_PUTENTRY(xad, XAD_NEW,
893 				    offsetXAD(&rcp->xad[XTENTRYSTART]),
894 				    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
895 
896 			/* advance next available entry index. */
897 			le16_add_cpu(&sp->header.nextindex, 1);
898 
899 			/* Don't log it if there are no links to the file */
900 			if (!test_cflag(COMMIT_Nolink, ip)) {
901 				tlck = txLock(tid, ip, smp,
902 					      tlckXTREE | tlckGROW);
903 				xtlck = (struct xtlock *) & tlck->lock;
904 				xtlck->lwm.offset = (xtlck->lwm.offset) ?
905 				    min(skip, (int)xtlck->lwm.offset) : skip;
906 				xtlck->lwm.length =
907 				    le16_to_cpu(sp->header.nextindex) -
908 				    xtlck->lwm.offset;
909 			}
910 
911 			/* unpin parent page */
912 			XT_PUTPAGE(smp);
913 
914 			/* exit propagate up */
915 			break;
916 		}
917 	}
918 
919 	/* unpin current right page */
920 	XT_PUTPAGE(rmp);
921 
922 	return 0;
923 }
924 
925 
926 /*
927  *	xtSplitPage()
928  *
929  * function:
930  *	split a full non-root page into
931  *	original/split/left page and new right page
932  *	i.e., the original/split page remains as left page.
933  *
934  * parameter:
935  *	int		tid,
936  *	struct inode	*ip,
937  *	struct xtsplit	*split,
938  *	struct metapage	**rmpp,
939  *	u64		*rbnp,
940  *
941  * return:
942  *	Pointer to page in which to insert or NULL on error.
943  */
944 static int
945 xtSplitPage(tid_t tid, struct inode *ip,
946 	    struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
947 {
948 	int rc = 0;
949 	struct metapage *smp;
950 	xtpage_t *sp;
951 	struct metapage *rmp;
952 	xtpage_t *rp;		/* new right page allocated */
953 	s64 rbn;		/* new right page block number */
954 	struct metapage *mp;
955 	xtpage_t *p;
956 	s64 nextbn;
957 	int skip, maxentry, middle, righthalf, n;
958 	xad_t *xad;
959 	struct pxdlist *pxdlist;
960 	pxd_t *pxd;
961 	struct tlock *tlck;
962 	struct xtlock *sxtlck = NULL, *rxtlck = NULL;
963 	int quota_allocation = 0;
964 
965 	smp = split->mp;
966 	sp = XT_PAGE(ip, smp);
967 
968 	INCREMENT(xtStat.split);
969 
970 	pxdlist = split->pxdlist;
971 	pxd = &pxdlist->pxd[pxdlist->npxd];
972 	pxdlist->npxd++;
973 	rbn = addressPXD(pxd);
974 
975 	/* Allocate blocks to quota. */
976 	rc = dquot_alloc_block(ip, lengthPXD(pxd));
977 	if (rc)
978 		goto clean_up;
979 
980 	quota_allocation += lengthPXD(pxd);
981 
982 	/*
983 	 * allocate the new right page for the split
984 	 */
985 	rmp = get_metapage(ip, rbn, PSIZE, 1);
986 	if (rmp == NULL) {
987 		rc = -EIO;
988 		goto clean_up;
989 	}
990 
991 	jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
992 
993 	BT_MARK_DIRTY(rmp, ip);
994 	/*
995 	 * action: new page;
996 	 */
997 
998 	rp = (xtpage_t *) rmp->data;
999 	rp->header.self = *pxd;
1000 	rp->header.flag = sp->header.flag & BT_TYPE;
1001 	rp->header.maxentry = sp->header.maxentry;	/* little-endian */
1002 	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1003 
1004 	BT_MARK_DIRTY(smp, ip);
1005 	/* Don't log it if there are no links to the file */
1006 	if (!test_cflag(COMMIT_Nolink, ip)) {
1007 		/*
1008 		 * acquire a transaction lock on the new right page;
1009 		 */
1010 		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1011 		rxtlck = (struct xtlock *) & tlck->lock;
1012 		rxtlck->lwm.offset = XTENTRYSTART;
1013 		/*
1014 		 * acquire a transaction lock on the split page
1015 		 */
1016 		tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
1017 		sxtlck = (struct xtlock *) & tlck->lock;
1018 	}
1019 
1020 	/*
1021 	 * initialize/update sibling pointers of <sp> and <rp>
1022 	 */
1023 	nextbn = le64_to_cpu(sp->header.next);
1024 	rp->header.next = cpu_to_le64(nextbn);
1025 	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1026 	sp->header.next = cpu_to_le64(rbn);
1027 
1028 	skip = split->index;
1029 
1030 	/*
1031 	 *	sequential append at tail (after last entry of last page)
1032 	 *
1033 	 * if splitting the last page on a level because of appending
1034 	 * a entry to it (skip is maxentry), it's likely that the access is
1035 	 * sequential. adding an empty page on the side of the level is less
1036 	 * work and can push the fill factor much higher than normal.
1037 	 * if we're wrong it's no big deal -  we will do the split the right
1038 	 * way next time.
1039 	 * (it may look like it's equally easy to do a similar hack for
1040 	 * reverse sorted data, that is, split the tree left, but it's not.
1041 	 * Be my guest.)
1042 	 */
1043 	if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
1044 		/*
1045 		 * acquire a transaction lock on the new/right page;
1046 		 *
1047 		 * action: xad insertion;
1048 		 */
1049 		/* insert entry at the first entry of the new right page */
1050 		xad = &rp->xad[XTENTRYSTART];
1051 		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1052 			    split->addr);
1053 
1054 		rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1055 
1056 		if (!test_cflag(COMMIT_Nolink, ip)) {
1057 			/* rxtlck->lwm.offset = XTENTRYSTART; */
1058 			rxtlck->lwm.length = 1;
1059 		}
1060 
1061 		*rmpp = rmp;
1062 		*rbnp = rbn;
1063 
1064 		jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
1065 		return 0;
1066 	}
1067 
1068 	/*
1069 	 *	non-sequential insert (at possibly middle page)
1070 	 */
1071 
1072 	/*
1073 	 * update previous pointer of old next/right page of <sp>
1074 	 */
1075 	if (nextbn != 0) {
1076 		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1077 		if (rc) {
1078 			XT_PUTPAGE(rmp);
1079 			goto clean_up;
1080 		}
1081 
1082 		BT_MARK_DIRTY(mp, ip);
1083 		/*
1084 		 * acquire a transaction lock on the next page;
1085 		 *
1086 		 * action:sibling pointer update;
1087 		 */
1088 		if (!test_cflag(COMMIT_Nolink, ip))
1089 			tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
1090 
1091 		p->header.prev = cpu_to_le64(rbn);
1092 
1093 		/* sibling page may have been updated previously, or
1094 		 * it may be updated later;
1095 		 */
1096 
1097 		XT_PUTPAGE(mp);
1098 	}
1099 
1100 	/*
1101 	 * split the data between the split and new/right pages
1102 	 */
1103 	maxentry = le16_to_cpu(sp->header.maxentry);
1104 	middle = maxentry >> 1;
1105 	righthalf = maxentry - middle;
1106 
1107 	/*
1108 	 * skip index in old split/left page - insert into left page:
1109 	 */
1110 	if (skip <= middle) {
1111 		/* move right half of split page to the new right page */
1112 		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1113 			righthalf << L2XTSLOTSIZE);
1114 
1115 		/* shift right tail of left half to make room for new entry */
1116 		if (skip < middle)
1117 			memmove(&sp->xad[skip + 1], &sp->xad[skip],
1118 				(middle - skip) << L2XTSLOTSIZE);
1119 
1120 		/* insert new entry */
1121 		xad = &sp->xad[skip];
1122 		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1123 			    split->addr);
1124 
1125 		/* update page header */
1126 		sp->header.nextindex = cpu_to_le16(middle + 1);
1127 		if (!test_cflag(COMMIT_Nolink, ip)) {
1128 			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1129 			    min(skip, (int)sxtlck->lwm.offset) : skip;
1130 		}
1131 
1132 		rp->header.nextindex =
1133 		    cpu_to_le16(XTENTRYSTART + righthalf);
1134 	}
1135 	/*
1136 	 * skip index in new right page - insert into right page:
1137 	 */
1138 	else {
1139 		/* move left head of right half to right page */
1140 		n = skip - middle;
1141 		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1142 			n << L2XTSLOTSIZE);
1143 
1144 		/* insert new entry */
1145 		n += XTENTRYSTART;
1146 		xad = &rp->xad[n];
1147 		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1148 			    split->addr);
1149 
1150 		/* move right tail of right half to right page */
1151 		if (skip < maxentry)
1152 			memmove(&rp->xad[n + 1], &sp->xad[skip],
1153 				(maxentry - skip) << L2XTSLOTSIZE);
1154 
1155 		/* update page header */
1156 		sp->header.nextindex = cpu_to_le16(middle);
1157 		if (!test_cflag(COMMIT_Nolink, ip)) {
1158 			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1159 			    min(middle, (int)sxtlck->lwm.offset) : middle;
1160 		}
1161 
1162 		rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
1163 						   righthalf + 1);
1164 	}
1165 
1166 	if (!test_cflag(COMMIT_Nolink, ip)) {
1167 		sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
1168 		    sxtlck->lwm.offset;
1169 
1170 		/* rxtlck->lwm.offset = XTENTRYSTART; */
1171 		rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1172 		    XTENTRYSTART;
1173 	}
1174 
1175 	*rmpp = rmp;
1176 	*rbnp = rbn;
1177 
1178 	jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
1179 	return rc;
1180 
1181       clean_up:
1182 
1183 	/* Rollback quota allocation. */
1184 	if (quota_allocation)
1185 		dquot_free_block(ip, quota_allocation);
1186 
1187 	return (rc);
1188 }
1189 
1190 
1191 /*
1192  *	xtSplitRoot()
1193  *
1194  * function:
1195  *	split the full root page into original/root/split page and new
1196  *	right page
1197  *	i.e., root remains fixed in tree anchor (inode) and the root is
1198  *	copied to a single new right child page since root page <<
1199  *	non-root page, and the split root page contains a single entry
1200  *	for the new right child page.
1201  *
1202  * parameter:
1203  *	int		tid,
1204  *	struct inode	*ip,
1205  *	struct xtsplit	*split,
1206  *	struct metapage	**rmpp)
1207  *
1208  * return:
1209  *	Pointer to page in which to insert or NULL on error.
1210  */
1211 static int
1212 xtSplitRoot(tid_t tid,
1213 	    struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
1214 {
1215 	xtpage_t *sp;
1216 	struct metapage *rmp;
1217 	xtpage_t *rp;
1218 	s64 rbn;
1219 	int skip, nextindex;
1220 	xad_t *xad;
1221 	pxd_t *pxd;
1222 	struct pxdlist *pxdlist;
1223 	struct tlock *tlck;
1224 	struct xtlock *xtlck;
1225 	int rc;
1226 
1227 	sp = &JFS_IP(ip)->i_xtroot;
1228 
1229 	INCREMENT(xtStat.split);
1230 
1231 	/*
1232 	 *	allocate a single (right) child page
1233 	 */
1234 	pxdlist = split->pxdlist;
1235 	pxd = &pxdlist->pxd[pxdlist->npxd];
1236 	pxdlist->npxd++;
1237 	rbn = addressPXD(pxd);
1238 	rmp = get_metapage(ip, rbn, PSIZE, 1);
1239 	if (rmp == NULL)
1240 		return -EIO;
1241 
1242 	/* Allocate blocks to quota. */
1243 	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1244 	if (rc) {
1245 		release_metapage(rmp);
1246 		return rc;
1247 	}
1248 
1249 	jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
1250 
1251 	/*
1252 	 * acquire a transaction lock on the new right page;
1253 	 *
1254 	 * action: new page;
1255 	 */
1256 	BT_MARK_DIRTY(rmp, ip);
1257 
1258 	rp = (xtpage_t *) rmp->data;
1259 	rp->header.flag =
1260 	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1261 	rp->header.self = *pxd;
1262 	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1263 	rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
1264 
1265 	/* initialize sibling pointers */
1266 	rp->header.next = 0;
1267 	rp->header.prev = 0;
1268 
1269 	/*
1270 	 * copy the in-line root page into new right page extent
1271 	 */
1272 	nextindex = le16_to_cpu(sp->header.maxentry);
1273 	memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
1274 		(nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
1275 
1276 	/*
1277 	 * insert the new entry into the new right/child page
1278 	 * (skip index in the new right page will not change)
1279 	 */
1280 	skip = split->index;
1281 	/* if insert into middle, shift right remaining entries */
1282 	if (skip != nextindex)
1283 		memmove(&rp->xad[skip + 1], &rp->xad[skip],
1284 			(nextindex - skip) * sizeof(xad_t));
1285 
1286 	xad = &rp->xad[skip];
1287 	XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
1288 
1289 	/* update page header */
1290 	rp->header.nextindex = cpu_to_le16(nextindex + 1);
1291 
1292 	if (!test_cflag(COMMIT_Nolink, ip)) {
1293 		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1294 		xtlck = (struct xtlock *) & tlck->lock;
1295 		xtlck->lwm.offset = XTENTRYSTART;
1296 		xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1297 		    XTENTRYSTART;
1298 	}
1299 
1300 	/*
1301 	 *	reset the root
1302 	 *
1303 	 * init root with the single entry for the new right page
1304 	 * set the 1st entry offset to 0, which force the left-most key
1305 	 * at any level of the tree to be less than any search key.
1306 	 */
1307 	/*
1308 	 * acquire a transaction lock on the root page (in-memory inode);
1309 	 *
1310 	 * action: root split;
1311 	 */
1312 	BT_MARK_DIRTY(split->mp, ip);
1313 
1314 	xad = &sp->xad[XTENTRYSTART];
1315 	XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
1316 
1317 	/* update page header of root */
1318 	sp->header.flag &= ~BT_LEAF;
1319 	sp->header.flag |= BT_INTERNAL;
1320 
1321 	sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1322 
1323 	if (!test_cflag(COMMIT_Nolink, ip)) {
1324 		tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
1325 		xtlck = (struct xtlock *) & tlck->lock;
1326 		xtlck->lwm.offset = XTENTRYSTART;
1327 		xtlck->lwm.length = 1;
1328 	}
1329 
1330 	*rmpp = rmp;
1331 
1332 	jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
1333 	return 0;
1334 }
1335 
1336 
1337 /*
1338  *	xtExtend()
1339  *
1340  * function: extend in-place;
1341  *
1342  * note: existing extent may or may not have been committed.
1343  * caller is responsible for pager buffer cache update, and
1344  * working block allocation map update;
1345  * update pmap: alloc whole extended extent;
1346  */
1347 int xtExtend(tid_t tid,		/* transaction id */
1348 	     struct inode *ip, s64 xoff,	/* delta extent offset */
1349 	     s32 xlen,		/* delta extent length */
1350 	     int flag)
1351 {
1352 	int rc = 0;
1353 	int cmp;
1354 	struct metapage *mp;	/* meta-page buffer */
1355 	xtpage_t *p;		/* base B+-tree index page */
1356 	s64 bn;
1357 	int index, nextindex, len;
1358 	struct btstack btstack;	/* traverse stack */
1359 	struct xtsplit split;	/* split information */
1360 	xad_t *xad;
1361 	s64 xaddr;
1362 	struct tlock *tlck;
1363 	struct xtlock *xtlck = NULL;
1364 
1365 	jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
1366 
1367 	/* there must exist extent to be extended */
1368 	if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT)))
1369 		return rc;
1370 
1371 	/* retrieve search result */
1372 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1373 
1374 	if (cmp != 0) {
1375 		XT_PUTPAGE(mp);
1376 		jfs_error(ip->i_sb, "xtSearch did not find extent\n");
1377 		return -EIO;
1378 	}
1379 
1380 	/* extension must be contiguous */
1381 	xad = &p->xad[index];
1382 	if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
1383 		XT_PUTPAGE(mp);
1384 		jfs_error(ip->i_sb, "extension is not contiguous\n");
1385 		return -EIO;
1386 	}
1387 
1388 	/*
1389 	 * acquire a transaction lock on the leaf page;
1390 	 *
1391 	 * action: xad insertion/extension;
1392 	 */
1393 	BT_MARK_DIRTY(mp, ip);
1394 	if (!test_cflag(COMMIT_Nolink, ip)) {
1395 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1396 		xtlck = (struct xtlock *) & tlck->lock;
1397 	}
1398 
1399 	/* extend will overflow extent ? */
1400 	xlen = lengthXAD(xad) + xlen;
1401 	if ((len = xlen - MAXXLEN) <= 0)
1402 		goto extendOld;
1403 
1404 	/*
1405 	 *	extent overflow: insert entry for new extent
1406 	 */
1407 //insertNew:
1408 	xoff = offsetXAD(xad) + MAXXLEN;
1409 	xaddr = addressXAD(xad) + MAXXLEN;
1410 	nextindex = le16_to_cpu(p->header.nextindex);
1411 
1412 	/*
1413 	 *	if the leaf page is full, insert the new entry and
1414 	 *	propagate up the router entry for the new page from split
1415 	 *
1416 	 * The xtSplitUp() will insert the entry and unpin the leaf page.
1417 	 */
1418 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
1419 		/* xtSpliUp() unpins leaf pages */
1420 		split.mp = mp;
1421 		split.index = index + 1;
1422 		split.flag = XAD_NEW;
1423 		split.off = xoff;	/* split offset */
1424 		split.len = len;
1425 		split.addr = xaddr;
1426 		split.pxdlist = NULL;
1427 		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1428 			return rc;
1429 
1430 		/* get back old page */
1431 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1432 		if (rc)
1433 			return rc;
1434 		/*
1435 		 * if leaf root has been split, original root has been
1436 		 * copied to new child page, i.e., original entry now
1437 		 * resides on the new child page;
1438 		 */
1439 		if (p->header.flag & BT_INTERNAL) {
1440 			ASSERT(p->header.nextindex ==
1441 			       cpu_to_le16(XTENTRYSTART + 1));
1442 			xad = &p->xad[XTENTRYSTART];
1443 			bn = addressXAD(xad);
1444 			XT_PUTPAGE(mp);
1445 
1446 			/* get new child page */
1447 			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1448 			if (rc)
1449 				return rc;
1450 
1451 			BT_MARK_DIRTY(mp, ip);
1452 			if (!test_cflag(COMMIT_Nolink, ip)) {
1453 				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1454 				xtlck = (struct xtlock *) & tlck->lock;
1455 			}
1456 		}
1457 	}
1458 	/*
1459 	 *	insert the new entry into the leaf page
1460 	 */
1461 	else {
1462 		/* insert the new entry: mark the entry NEW */
1463 		xad = &p->xad[index + 1];
1464 		XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
1465 
1466 		/* advance next available entry index */
1467 		le16_add_cpu(&p->header.nextindex, 1);
1468 	}
1469 
1470 	/* get back old entry */
1471 	xad = &p->xad[index];
1472 	xlen = MAXXLEN;
1473 
1474 	/*
1475 	 * extend old extent
1476 	 */
1477       extendOld:
1478 	XADlength(xad, xlen);
1479 	if (!(xad->flag & XAD_NEW))
1480 		xad->flag |= XAD_EXTENDED;
1481 
1482 	if (!test_cflag(COMMIT_Nolink, ip)) {
1483 		xtlck->lwm.offset =
1484 		    (xtlck->lwm.offset) ? min(index,
1485 					      (int)xtlck->lwm.offset) : index;
1486 		xtlck->lwm.length =
1487 		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
1488 	}
1489 
1490 	/* unpin the leaf page */
1491 	XT_PUTPAGE(mp);
1492 
1493 	return rc;
1494 }
1495 
1496 #ifdef _NOTYET
1497 /*
1498  *	xtTailgate()
1499  *
1500  * function: split existing 'tail' extent
1501  *	(split offset >= start offset of tail extent), and
1502  *	relocate and extend the split tail half;
1503  *
1504  * note: existing extent may or may not have been committed.
1505  * caller is responsible for pager buffer cache update, and
1506  * working block allocation map update;
1507  * update pmap: free old split tail extent, alloc new extent;
1508  */
1509 int xtTailgate(tid_t tid,		/* transaction id */
1510 	       struct inode *ip, s64 xoff,	/* split/new extent offset */
1511 	       s32 xlen,	/* new extent length */
1512 	       s64 xaddr,	/* new extent address */
1513 	       int flag)
1514 {
1515 	int rc = 0;
1516 	int cmp;
1517 	struct metapage *mp;	/* meta-page buffer */
1518 	xtpage_t *p;		/* base B+-tree index page */
1519 	s64 bn;
1520 	int index, nextindex, llen, rlen;
1521 	struct btstack btstack;	/* traverse stack */
1522 	struct xtsplit split;	/* split information */
1523 	xad_t *xad;
1524 	struct tlock *tlck;
1525 	struct xtlock *xtlck = 0;
1526 	struct tlock *mtlck;
1527 	struct maplock *pxdlock;
1528 
1529 /*
1530 printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1531 	(ulong)xoff, xlen, (ulong)xaddr);
1532 */
1533 
1534 	/* there must exist extent to be tailgated */
1535 	if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
1536 		return rc;
1537 
1538 	/* retrieve search result */
1539 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1540 
1541 	if (cmp != 0) {
1542 		XT_PUTPAGE(mp);
1543 		jfs_error(ip->i_sb, "couldn't find extent\n");
1544 		return -EIO;
1545 	}
1546 
1547 	/* entry found must be last entry */
1548 	nextindex = le16_to_cpu(p->header.nextindex);
1549 	if (index != nextindex - 1) {
1550 		XT_PUTPAGE(mp);
1551 		jfs_error(ip->i_sb, "the entry found is not the last entry\n");
1552 		return -EIO;
1553 	}
1554 
1555 	BT_MARK_DIRTY(mp, ip);
1556 	/*
1557 	 * acquire tlock of the leaf page containing original entry
1558 	 */
1559 	if (!test_cflag(COMMIT_Nolink, ip)) {
1560 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1561 		xtlck = (struct xtlock *) & tlck->lock;
1562 	}
1563 
1564 	/* completely replace extent ? */
1565 	xad = &p->xad[index];
1566 /*
1567 printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1568 	(ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1569 */
1570 	if ((llen = xoff - offsetXAD(xad)) == 0)
1571 		goto updateOld;
1572 
1573 	/*
1574 	 *	partially replace extent: insert entry for new extent
1575 	 */
1576 //insertNew:
1577 	/*
1578 	 *	if the leaf page is full, insert the new entry and
1579 	 *	propagate up the router entry for the new page from split
1580 	 *
1581 	 * The xtSplitUp() will insert the entry and unpin the leaf page.
1582 	 */
1583 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
1584 		/* xtSpliUp() unpins leaf pages */
1585 		split.mp = mp;
1586 		split.index = index + 1;
1587 		split.flag = XAD_NEW;
1588 		split.off = xoff;	/* split offset */
1589 		split.len = xlen;
1590 		split.addr = xaddr;
1591 		split.pxdlist = NULL;
1592 		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1593 			return rc;
1594 
1595 		/* get back old page */
1596 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1597 		if (rc)
1598 			return rc;
1599 		/*
1600 		 * if leaf root has been split, original root has been
1601 		 * copied to new child page, i.e., original entry now
1602 		 * resides on the new child page;
1603 		 */
1604 		if (p->header.flag & BT_INTERNAL) {
1605 			ASSERT(p->header.nextindex ==
1606 			       cpu_to_le16(XTENTRYSTART + 1));
1607 			xad = &p->xad[XTENTRYSTART];
1608 			bn = addressXAD(xad);
1609 			XT_PUTPAGE(mp);
1610 
1611 			/* get new child page */
1612 			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1613 			if (rc)
1614 				return rc;
1615 
1616 			BT_MARK_DIRTY(mp, ip);
1617 			if (!test_cflag(COMMIT_Nolink, ip)) {
1618 				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1619 				xtlck = (struct xtlock *) & tlck->lock;
1620 			}
1621 		}
1622 	}
1623 	/*
1624 	 *	insert the new entry into the leaf page
1625 	 */
1626 	else {
1627 		/* insert the new entry: mark the entry NEW */
1628 		xad = &p->xad[index + 1];
1629 		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1630 
1631 		/* advance next available entry index */
1632 		le16_add_cpu(&p->header.nextindex, 1);
1633 	}
1634 
1635 	/* get back old XAD */
1636 	xad = &p->xad[index];
1637 
1638 	/*
1639 	 * truncate/relocate old extent at split offset
1640 	 */
1641       updateOld:
1642 	/* update dmap for old/committed/truncated extent */
1643 	rlen = lengthXAD(xad) - llen;
1644 	if (!(xad->flag & XAD_NEW)) {
1645 		/* free from PWMAP at commit */
1646 		if (!test_cflag(COMMIT_Nolink, ip)) {
1647 			mtlck = txMaplock(tid, ip, tlckMAP);
1648 			pxdlock = (struct maplock *) & mtlck->lock;
1649 			pxdlock->flag = mlckFREEPXD;
1650 			PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
1651 			PXDlength(&pxdlock->pxd, rlen);
1652 			pxdlock->index = 1;
1653 		}
1654 	} else
1655 		/* free from WMAP */
1656 		dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
1657 
1658 	if (llen)
1659 		/* truncate */
1660 		XADlength(xad, llen);
1661 	else
1662 		/* replace */
1663 		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1664 
1665 	if (!test_cflag(COMMIT_Nolink, ip)) {
1666 		xtlck->lwm.offset = (xtlck->lwm.offset) ?
1667 		    min(index, (int)xtlck->lwm.offset) : index;
1668 		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
1669 		    xtlck->lwm.offset;
1670 	}
1671 
1672 	/* unpin the leaf page */
1673 	XT_PUTPAGE(mp);
1674 
1675 	return rc;
1676 }
1677 #endif /* _NOTYET */
1678 
1679 /*
1680  *	xtUpdate()
1681  *
1682  * function: update XAD;
1683  *
1684  *	update extent for allocated_but_not_recorded or
1685  *	compressed extent;
1686  *
1687  * parameter:
1688  *	nxad	- new XAD;
1689  *		logical extent of the specified XAD must be completely
1690  *		contained by an existing XAD;
1691  */
1692 int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1693 {				/* new XAD */
1694 	int rc = 0;
1695 	int cmp;
1696 	struct metapage *mp;	/* meta-page buffer */
1697 	xtpage_t *p;		/* base B+-tree index page */
1698 	s64 bn;
1699 	int index0, index, newindex, nextindex;
1700 	struct btstack btstack;	/* traverse stack */
1701 	struct xtsplit split;	/* split information */
1702 	xad_t *xad, *lxad, *rxad;
1703 	int xflag;
1704 	s64 nxoff, xoff;
1705 	int nxlen, xlen, lxlen, rxlen;
1706 	s64 nxaddr, xaddr;
1707 	struct tlock *tlck;
1708 	struct xtlock *xtlck = NULL;
1709 	int newpage = 0;
1710 
1711 	/* there must exist extent to be tailgated */
1712 	nxoff = offsetXAD(nxad);
1713 	nxlen = lengthXAD(nxad);
1714 	nxaddr = addressXAD(nxad);
1715 
1716 	if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
1717 		return rc;
1718 
1719 	/* retrieve search result */
1720 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
1721 
1722 	if (cmp != 0) {
1723 		XT_PUTPAGE(mp);
1724 		jfs_error(ip->i_sb, "Could not find extent\n");
1725 		return -EIO;
1726 	}
1727 
1728 	BT_MARK_DIRTY(mp, ip);
1729 	/*
1730 	 * acquire tlock of the leaf page containing original entry
1731 	 */
1732 	if (!test_cflag(COMMIT_Nolink, ip)) {
1733 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1734 		xtlck = (struct xtlock *) & tlck->lock;
1735 	}
1736 
1737 	xad = &p->xad[index0];
1738 	xflag = xad->flag;
1739 	xoff = offsetXAD(xad);
1740 	xlen = lengthXAD(xad);
1741 	xaddr = addressXAD(xad);
1742 
1743 	/* nXAD must be completely contained within XAD */
1744 	if ((xoff > nxoff) ||
1745 	    (nxoff + nxlen > xoff + xlen)) {
1746 		XT_PUTPAGE(mp);
1747 		jfs_error(ip->i_sb,
1748 			  "nXAD in not completely contained within XAD\n");
1749 		return -EIO;
1750 	}
1751 
1752 	index = index0;
1753 	newindex = index + 1;
1754 	nextindex = le16_to_cpu(p->header.nextindex);
1755 
1756 #ifdef  _JFS_WIP_NOCOALESCE
1757 	if (xoff < nxoff)
1758 		goto updateRight;
1759 
1760 	/*
1761 	 * replace XAD with nXAD
1762 	 */
1763       replace:			/* (nxoff == xoff) */
1764 	if (nxlen == xlen) {
1765 		/* replace XAD with nXAD:recorded */
1766 		*xad = *nxad;
1767 		xad->flag = xflag & ~XAD_NOTRECORDED;
1768 
1769 		goto out;
1770 	} else			/* (nxlen < xlen) */
1771 		goto updateLeft;
1772 #endif				/* _JFS_WIP_NOCOALESCE */
1773 
1774 /* #ifdef _JFS_WIP_COALESCE */
1775 	if (xoff < nxoff)
1776 		goto coalesceRight;
1777 
1778 	/*
1779 	 * coalesce with left XAD
1780 	 */
1781 //coalesceLeft: /* (xoff == nxoff) */
1782 	/* is XAD first entry of page ? */
1783 	if (index == XTENTRYSTART)
1784 		goto replace;
1785 
1786 	/* is nXAD logically and physically contiguous with lXAD ? */
1787 	lxad = &p->xad[index - 1];
1788 	lxlen = lengthXAD(lxad);
1789 	if (!(lxad->flag & XAD_NOTRECORDED) &&
1790 	    (nxoff == offsetXAD(lxad) + lxlen) &&
1791 	    (nxaddr == addressXAD(lxad) + lxlen) &&
1792 	    (lxlen + nxlen < MAXXLEN)) {
1793 		/* extend right lXAD */
1794 		index0 = index - 1;
1795 		XADlength(lxad, lxlen + nxlen);
1796 
1797 		/* If we just merged two extents together, need to make sure the
1798 		 * right extent gets logged.  If the left one is marked XAD_NEW,
1799 		 * then we know it will be logged.  Otherwise, mark as
1800 		 * XAD_EXTENDED
1801 		 */
1802 		if (!(lxad->flag & XAD_NEW))
1803 			lxad->flag |= XAD_EXTENDED;
1804 
1805 		if (xlen > nxlen) {
1806 			/* truncate XAD */
1807 			XADoffset(xad, xoff + nxlen);
1808 			XADlength(xad, xlen - nxlen);
1809 			XADaddress(xad, xaddr + nxlen);
1810 			goto out;
1811 		} else {	/* (xlen == nxlen) */
1812 
1813 			/* remove XAD */
1814 			if (index < nextindex - 1)
1815 				memmove(&p->xad[index], &p->xad[index + 1],
1816 					(nextindex - index -
1817 					 1) << L2XTSLOTSIZE);
1818 
1819 			p->header.nextindex =
1820 			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
1821 					1);
1822 
1823 			index = index0;
1824 			newindex = index + 1;
1825 			nextindex = le16_to_cpu(p->header.nextindex);
1826 			xoff = nxoff = offsetXAD(lxad);
1827 			xlen = nxlen = lxlen + nxlen;
1828 			xaddr = nxaddr = addressXAD(lxad);
1829 			goto coalesceRight;
1830 		}
1831 	}
1832 
1833 	/*
1834 	 * replace XAD with nXAD
1835 	 */
1836       replace:			/* (nxoff == xoff) */
1837 	if (nxlen == xlen) {
1838 		/* replace XAD with nXAD:recorded */
1839 		*xad = *nxad;
1840 		xad->flag = xflag & ~XAD_NOTRECORDED;
1841 
1842 		goto coalesceRight;
1843 	} else			/* (nxlen < xlen) */
1844 		goto updateLeft;
1845 
1846 	/*
1847 	 * coalesce with right XAD
1848 	 */
1849       coalesceRight:		/* (xoff <= nxoff) */
1850 	/* is XAD last entry of page ? */
1851 	if (newindex == nextindex) {
1852 		if (xoff == nxoff)
1853 			goto out;
1854 		goto updateRight;
1855 	}
1856 
1857 	/* is nXAD logically and physically contiguous with rXAD ? */
1858 	rxad = &p->xad[index + 1];
1859 	rxlen = lengthXAD(rxad);
1860 	if (!(rxad->flag & XAD_NOTRECORDED) &&
1861 	    (nxoff + nxlen == offsetXAD(rxad)) &&
1862 	    (nxaddr + nxlen == addressXAD(rxad)) &&
1863 	    (rxlen + nxlen < MAXXLEN)) {
1864 		/* extend left rXAD */
1865 		XADoffset(rxad, nxoff);
1866 		XADlength(rxad, rxlen + nxlen);
1867 		XADaddress(rxad, nxaddr);
1868 
1869 		/* If we just merged two extents together, need to make sure
1870 		 * the left extent gets logged.  If the right one is marked
1871 		 * XAD_NEW, then we know it will be logged.  Otherwise, mark as
1872 		 * XAD_EXTENDED
1873 		 */
1874 		if (!(rxad->flag & XAD_NEW))
1875 			rxad->flag |= XAD_EXTENDED;
1876 
1877 		if (xlen > nxlen)
1878 			/* truncate XAD */
1879 			XADlength(xad, xlen - nxlen);
1880 		else {		/* (xlen == nxlen) */
1881 
1882 			/* remove XAD */
1883 			memmove(&p->xad[index], &p->xad[index + 1],
1884 				(nextindex - index - 1) << L2XTSLOTSIZE);
1885 
1886 			p->header.nextindex =
1887 			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
1888 					1);
1889 		}
1890 
1891 		goto out;
1892 	} else if (xoff == nxoff)
1893 		goto out;
1894 
1895 	if (xoff >= nxoff) {
1896 		XT_PUTPAGE(mp);
1897 		jfs_error(ip->i_sb, "xoff >= nxoff\n");
1898 		return -EIO;
1899 	}
1900 /* #endif _JFS_WIP_COALESCE */
1901 
1902 	/*
1903 	 * split XAD into (lXAD, nXAD):
1904 	 *
1905 	 *          |---nXAD--->
1906 	 * --|----------XAD----------|--
1907 	 *   |-lXAD-|
1908 	 */
1909       updateRight:		/* (xoff < nxoff) */
1910 	/* truncate old XAD as lXAD:not_recorded */
1911 	xad = &p->xad[index];
1912 	XADlength(xad, nxoff - xoff);
1913 
1914 	/* insert nXAD:recorded */
1915 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
1916 
1917 		/* xtSpliUp() unpins leaf pages */
1918 		split.mp = mp;
1919 		split.index = newindex;
1920 		split.flag = xflag & ~XAD_NOTRECORDED;
1921 		split.off = nxoff;
1922 		split.len = nxlen;
1923 		split.addr = nxaddr;
1924 		split.pxdlist = NULL;
1925 		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1926 			return rc;
1927 
1928 		/* get back old page */
1929 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1930 		if (rc)
1931 			return rc;
1932 		/*
1933 		 * if leaf root has been split, original root has been
1934 		 * copied to new child page, i.e., original entry now
1935 		 * resides on the new child page;
1936 		 */
1937 		if (p->header.flag & BT_INTERNAL) {
1938 			ASSERT(p->header.nextindex ==
1939 			       cpu_to_le16(XTENTRYSTART + 1));
1940 			xad = &p->xad[XTENTRYSTART];
1941 			bn = addressXAD(xad);
1942 			XT_PUTPAGE(mp);
1943 
1944 			/* get new child page */
1945 			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1946 			if (rc)
1947 				return rc;
1948 
1949 			BT_MARK_DIRTY(mp, ip);
1950 			if (!test_cflag(COMMIT_Nolink, ip)) {
1951 				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1952 				xtlck = (struct xtlock *) & tlck->lock;
1953 			}
1954 		} else {
1955 			/* is nXAD on new page ? */
1956 			if (newindex >
1957 			    (le16_to_cpu(p->header.maxentry) >> 1)) {
1958 				newindex =
1959 				    newindex -
1960 				    le16_to_cpu(p->header.nextindex) +
1961 				    XTENTRYSTART;
1962 				newpage = 1;
1963 			}
1964 		}
1965 	} else {
1966 		/* if insert into middle, shift right remaining entries */
1967 		if (newindex < nextindex)
1968 			memmove(&p->xad[newindex + 1], &p->xad[newindex],
1969 				(nextindex - newindex) << L2XTSLOTSIZE);
1970 
1971 		/* insert the entry */
1972 		xad = &p->xad[newindex];
1973 		*xad = *nxad;
1974 		xad->flag = xflag & ~XAD_NOTRECORDED;
1975 
1976 		/* advance next available entry index. */
1977 		p->header.nextindex =
1978 		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1979 	}
1980 
1981 	/*
1982 	 * does nXAD force 3-way split ?
1983 	 *
1984 	 *          |---nXAD--->|
1985 	 * --|----------XAD-------------|--
1986 	 *   |-lXAD-|           |-rXAD -|
1987 	 */
1988 	if (nxoff + nxlen == xoff + xlen)
1989 		goto out;
1990 
1991 	/* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
1992 	if (newpage) {
1993 		/* close out old page */
1994 		if (!test_cflag(COMMIT_Nolink, ip)) {
1995 			xtlck->lwm.offset = (xtlck->lwm.offset) ?
1996 			    min(index0, (int)xtlck->lwm.offset) : index0;
1997 			xtlck->lwm.length =
1998 			    le16_to_cpu(p->header.nextindex) -
1999 			    xtlck->lwm.offset;
2000 		}
2001 
2002 		bn = le64_to_cpu(p->header.next);
2003 		XT_PUTPAGE(mp);
2004 
2005 		/* get new right page */
2006 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2007 		if (rc)
2008 			return rc;
2009 
2010 		BT_MARK_DIRTY(mp, ip);
2011 		if (!test_cflag(COMMIT_Nolink, ip)) {
2012 			tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2013 			xtlck = (struct xtlock *) & tlck->lock;
2014 		}
2015 
2016 		index0 = index = newindex;
2017 	} else
2018 		index++;
2019 
2020 	newindex = index + 1;
2021 	nextindex = le16_to_cpu(p->header.nextindex);
2022 	xlen = xlen - (nxoff - xoff);
2023 	xoff = nxoff;
2024 	xaddr = nxaddr;
2025 
2026 	/* recompute split pages */
2027 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
2028 		XT_PUTPAGE(mp);
2029 
2030 		if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
2031 			return rc;
2032 
2033 		/* retrieve search result */
2034 		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
2035 
2036 		if (cmp != 0) {
2037 			XT_PUTPAGE(mp);
2038 			jfs_error(ip->i_sb, "xtSearch failed\n");
2039 			return -EIO;
2040 		}
2041 
2042 		if (index0 != index) {
2043 			XT_PUTPAGE(mp);
2044 			jfs_error(ip->i_sb, "unexpected value of index\n");
2045 			return -EIO;
2046 		}
2047 	}
2048 
2049 	/*
2050 	 * split XAD into (nXAD, rXAD)
2051 	 *
2052 	 *          ---nXAD---|
2053 	 * --|----------XAD----------|--
2054 	 *                    |-rXAD-|
2055 	 */
2056       updateLeft:		/* (nxoff == xoff) && (nxlen < xlen) */
2057 	/* update old XAD with nXAD:recorded */
2058 	xad = &p->xad[index];
2059 	*xad = *nxad;
2060 	xad->flag = xflag & ~XAD_NOTRECORDED;
2061 
2062 	/* insert rXAD:not_recorded */
2063 	xoff = xoff + nxlen;
2064 	xlen = xlen - nxlen;
2065 	xaddr = xaddr + nxlen;
2066 	if (nextindex == le16_to_cpu(p->header.maxentry)) {
2067 /*
2068 printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2069 */
2070 		/* xtSpliUp() unpins leaf pages */
2071 		split.mp = mp;
2072 		split.index = newindex;
2073 		split.flag = xflag;
2074 		split.off = xoff;
2075 		split.len = xlen;
2076 		split.addr = xaddr;
2077 		split.pxdlist = NULL;
2078 		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2079 			return rc;
2080 
2081 		/* get back old page */
2082 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2083 		if (rc)
2084 			return rc;
2085 
2086 		/*
2087 		 * if leaf root has been split, original root has been
2088 		 * copied to new child page, i.e., original entry now
2089 		 * resides on the new child page;
2090 		 */
2091 		if (p->header.flag & BT_INTERNAL) {
2092 			ASSERT(p->header.nextindex ==
2093 			       cpu_to_le16(XTENTRYSTART + 1));
2094 			xad = &p->xad[XTENTRYSTART];
2095 			bn = addressXAD(xad);
2096 			XT_PUTPAGE(mp);
2097 
2098 			/* get new child page */
2099 			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2100 			if (rc)
2101 				return rc;
2102 
2103 			BT_MARK_DIRTY(mp, ip);
2104 			if (!test_cflag(COMMIT_Nolink, ip)) {
2105 				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
2106 				xtlck = (struct xtlock *) & tlck->lock;
2107 			}
2108 		}
2109 	} else {
2110 		/* if insert into middle, shift right remaining entries */
2111 		if (newindex < nextindex)
2112 			memmove(&p->xad[newindex + 1], &p->xad[newindex],
2113 				(nextindex - newindex) << L2XTSLOTSIZE);
2114 
2115 		/* insert the entry */
2116 		xad = &p->xad[newindex];
2117 		XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2118 
2119 		/* advance next available entry index. */
2120 		p->header.nextindex =
2121 		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2122 	}
2123 
2124       out:
2125 	if (!test_cflag(COMMIT_Nolink, ip)) {
2126 		xtlck->lwm.offset = (xtlck->lwm.offset) ?
2127 		    min(index0, (int)xtlck->lwm.offset) : index0;
2128 		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2129 		    xtlck->lwm.offset;
2130 	}
2131 
2132 	/* unpin the leaf page */
2133 	XT_PUTPAGE(mp);
2134 
2135 	return rc;
2136 }
2137 
2138 
2139 /*
2140  *	xtAppend()
2141  *
2142  * function: grow in append mode from contiguous region specified ;
2143  *
2144  * parameter:
2145  *	tid		- transaction id;
2146  *	ip		- file object;
2147  *	xflag		- extent flag:
2148  *	xoff		- extent offset;
2149  *	maxblocks	- max extent length;
2150  *	xlen		- extent length (in/out);
2151  *	xaddrp		- extent address pointer (in/out):
2152  *	flag		-
2153  *
2154  * return:
2155  */
2156 int xtAppend(tid_t tid,		/* transaction id */
2157 	     struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
2158 	     s32 * xlenp,	/* (in/out) */
2159 	     s64 * xaddrp,	/* (in/out) */
2160 	     int flag)
2161 {
2162 	int rc = 0;
2163 	struct metapage *mp;	/* meta-page buffer */
2164 	xtpage_t *p;		/* base B+-tree index page */
2165 	s64 bn, xaddr;
2166 	int index, nextindex;
2167 	struct btstack btstack;	/* traverse stack */
2168 	struct xtsplit split;	/* split information */
2169 	xad_t *xad;
2170 	int cmp;
2171 	struct tlock *tlck;
2172 	struct xtlock *xtlck;
2173 	int nsplit, nblocks, xlen;
2174 	struct pxdlist pxdlist;
2175 	pxd_t *pxd;
2176 	s64 next;
2177 
2178 	xaddr = *xaddrp;
2179 	xlen = *xlenp;
2180 	jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
2181 		 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2182 
2183 	/*
2184 	 *	search for the entry location at which to insert:
2185 	 *
2186 	 * xtFastSearch() and xtSearch() both returns (leaf page
2187 	 * pinned, index at which to insert).
2188 	 * n.b. xtSearch() may return index of maxentry of
2189 	 * the full page.
2190 	 */
2191 	if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
2192 		return rc;
2193 
2194 	/* retrieve search result */
2195 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2196 
2197 	if (cmp == 0) {
2198 		rc = -EEXIST;
2199 		goto out;
2200 	}
2201 
2202 	if (next)
2203 		xlen = min(xlen, (int)(next - xoff));
2204 //insert:
2205 	/*
2206 	 *	insert entry for new extent
2207 	 */
2208 	xflag |= XAD_NEW;
2209 
2210 	/*
2211 	 *	if the leaf page is full, split the page and
2212 	 *	propagate up the router entry for the new page from split
2213 	 *
2214 	 * The xtSplitUp() will insert the entry and unpin the leaf page.
2215 	 */
2216 	nextindex = le16_to_cpu(p->header.nextindex);
2217 	if (nextindex < le16_to_cpu(p->header.maxentry))
2218 		goto insertLeaf;
2219 
2220 	/*
2221 	 * allocate new index blocks to cover index page split(s)
2222 	 */
2223 	nsplit = btstack.nsplit;
2224 	split.pxdlist = &pxdlist;
2225 	pxdlist.maxnpxd = pxdlist.npxd = 0;
2226 	pxd = &pxdlist.pxd[0];
2227 	nblocks = JFS_SBI(ip->i_sb)->nbperpage;
2228 	for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
2229 		if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
2230 			PXDaddress(pxd, xaddr);
2231 			PXDlength(pxd, nblocks);
2232 
2233 			pxdlist.maxnpxd++;
2234 
2235 			continue;
2236 		}
2237 
2238 		/* undo allocation */
2239 
2240 		goto out;
2241 	}
2242 
2243 	xlen = min(xlen, maxblocks);
2244 
2245 	/*
2246 	 * allocate data extent requested
2247 	 */
2248 	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2249 		goto out;
2250 
2251 	split.mp = mp;
2252 	split.index = index;
2253 	split.flag = xflag;
2254 	split.off = xoff;
2255 	split.len = xlen;
2256 	split.addr = xaddr;
2257 	if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
2258 		/* undo data extent allocation */
2259 		dbFree(ip, *xaddrp, (s64) * xlenp);
2260 
2261 		return rc;
2262 	}
2263 
2264 	*xaddrp = xaddr;
2265 	*xlenp = xlen;
2266 	return 0;
2267 
2268 	/*
2269 	 *	insert the new entry into the leaf page
2270 	 */
2271       insertLeaf:
2272 	/*
2273 	 * allocate data extent requested
2274 	 */
2275 	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2276 		goto out;
2277 
2278 	BT_MARK_DIRTY(mp, ip);
2279 	/*
2280 	 * acquire a transaction lock on the leaf page;
2281 	 *
2282 	 * action: xad insertion/extension;
2283 	 */
2284 	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2285 	xtlck = (struct xtlock *) & tlck->lock;
2286 
2287 	/* insert the new entry: mark the entry NEW */
2288 	xad = &p->xad[index];
2289 	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2290 
2291 	/* advance next available entry index */
2292 	le16_add_cpu(&p->header.nextindex, 1);
2293 
2294 	xtlck->lwm.offset =
2295 	    (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
2296 	xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2297 	    xtlck->lwm.offset;
2298 
2299 	*xaddrp = xaddr;
2300 	*xlenp = xlen;
2301 
2302       out:
2303 	/* unpin the leaf page */
2304 	XT_PUTPAGE(mp);
2305 
2306 	return rc;
2307 }
2308 #ifdef _STILL_TO_PORT
2309 
2310 /* - TBD for defragmentaion/reorganization -
2311  *
2312  *	xtDelete()
2313  *
2314  * function:
2315  *	delete the entry with the specified key.
2316  *
2317  *	N.B.: whole extent of the entry is assumed to be deleted.
2318  *
2319  * parameter:
2320  *
2321  * return:
2322  *	ENOENT: if the entry is not found.
2323  *
2324  * exception:
2325  */
2326 int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2327 {
2328 	int rc = 0;
2329 	struct btstack btstack;
2330 	int cmp;
2331 	s64 bn;
2332 	struct metapage *mp;
2333 	xtpage_t *p;
2334 	int index, nextindex;
2335 	struct tlock *tlck;
2336 	struct xtlock *xtlck;
2337 
2338 	/*
2339 	 * find the matching entry; xtSearch() pins the page
2340 	 */
2341 	if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
2342 		return rc;
2343 
2344 	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2345 	if (cmp) {
2346 		/* unpin the leaf page */
2347 		XT_PUTPAGE(mp);
2348 		return -ENOENT;
2349 	}
2350 
2351 	/*
2352 	 * delete the entry from the leaf page
2353 	 */
2354 	nextindex = le16_to_cpu(p->header.nextindex);
2355 	le16_add_cpu(&p->header.nextindex, -1);
2356 
2357 	/*
2358 	 * if the leaf page bocome empty, free the page
2359 	 */
2360 	if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
2361 		return (xtDeleteUp(tid, ip, mp, p, &btstack));
2362 
2363 	BT_MARK_DIRTY(mp, ip);
2364 	/*
2365 	 * acquire a transaction lock on the leaf page;
2366 	 *
2367 	 * action:xad deletion;
2368 	 */
2369 	tlck = txLock(tid, ip, mp, tlckXTREE);
2370 	xtlck = (struct xtlock *) & tlck->lock;
2371 	xtlck->lwm.offset =
2372 	    (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
2373 
2374 	/* if delete from middle, shift left/compact the remaining entries */
2375 	if (index < nextindex - 1)
2376 		memmove(&p->xad[index], &p->xad[index + 1],
2377 			(nextindex - index - 1) * sizeof(xad_t));
2378 
2379 	XT_PUTPAGE(mp);
2380 
2381 	return 0;
2382 }
2383 
2384 
2385 /* - TBD for defragmentaion/reorganization -
2386  *
2387  *	xtDeleteUp()
2388  *
2389  * function:
2390  *	free empty pages as propagating deletion up the tree
2391  *
2392  * parameter:
2393  *
2394  * return:
2395  */
2396 static int
2397 xtDeleteUp(tid_t tid, struct inode *ip,
2398 	   struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
2399 {
2400 	int rc = 0;
2401 	struct metapage *mp;
2402 	xtpage_t *p;
2403 	int index, nextindex;
2404 	s64 xaddr;
2405 	int xlen;
2406 	struct btframe *parent;
2407 	struct tlock *tlck;
2408 	struct xtlock *xtlck;
2409 
2410 	/*
2411 	 * keep root leaf page which has become empty
2412 	 */
2413 	if (fp->header.flag & BT_ROOT) {
2414 		/* keep the root page */
2415 		fp->header.flag &= ~BT_INTERNAL;
2416 		fp->header.flag |= BT_LEAF;
2417 		fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
2418 
2419 		/* XT_PUTPAGE(fmp); */
2420 
2421 		return 0;
2422 	}
2423 
2424 	/*
2425 	 * free non-root leaf page
2426 	 */
2427 	if ((rc = xtRelink(tid, ip, fp))) {
2428 		XT_PUTPAGE(fmp);
2429 		return rc;
2430 	}
2431 
2432 	xaddr = addressPXD(&fp->header.self);
2433 	xlen = lengthPXD(&fp->header.self);
2434 	/* free the page extent */
2435 	dbFree(ip, xaddr, (s64) xlen);
2436 
2437 	/* free the buffer page */
2438 	discard_metapage(fmp);
2439 
2440 	/*
2441 	 * propagate page deletion up the index tree
2442 	 *
2443 	 * If the delete from the parent page makes it empty,
2444 	 * continue all the way up the tree.
2445 	 * stop if the root page is reached (which is never deleted) or
2446 	 * if the entry deletion does not empty the page.
2447 	 */
2448 	while ((parent = BT_POP(btstack)) != NULL) {
2449 		/* get/pin the parent page <sp> */
2450 		XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2451 		if (rc)
2452 			return rc;
2453 
2454 		index = parent->index;
2455 
2456 		/* delete the entry for the freed child page from parent.
2457 		 */
2458 		nextindex = le16_to_cpu(p->header.nextindex);
2459 
2460 		/*
2461 		 * the parent has the single entry being deleted:
2462 		 * free the parent page which has become empty.
2463 		 */
2464 		if (nextindex == 1) {
2465 			if (p->header.flag & BT_ROOT) {
2466 				/* keep the root page */
2467 				p->header.flag &= ~BT_INTERNAL;
2468 				p->header.flag |= BT_LEAF;
2469 				p->header.nextindex =
2470 				    cpu_to_le16(XTENTRYSTART);
2471 
2472 				/* XT_PUTPAGE(mp); */
2473 
2474 				break;
2475 			} else {
2476 				/* free the parent page */
2477 				if ((rc = xtRelink(tid, ip, p)))
2478 					return rc;
2479 
2480 				xaddr = addressPXD(&p->header.self);
2481 				/* free the page extent */
2482 				dbFree(ip, xaddr,
2483 				       (s64) JFS_SBI(ip->i_sb)->nbperpage);
2484 
2485 				/* unpin/free the buffer page */
2486 				discard_metapage(mp);
2487 
2488 				/* propagate up */
2489 				continue;
2490 			}
2491 		}
2492 		/*
2493 		 * the parent has other entries remaining:
2494 		 * delete the router entry from the parent page.
2495 		 */
2496 		else {
2497 			BT_MARK_DIRTY(mp, ip);
2498 			/*
2499 			 * acquire a transaction lock on the leaf page;
2500 			 *
2501 			 * action:xad deletion;
2502 			 */
2503 			tlck = txLock(tid, ip, mp, tlckXTREE);
2504 			xtlck = (struct xtlock *) & tlck->lock;
2505 			xtlck->lwm.offset =
2506 			    (xtlck->lwm.offset) ? min(index,
2507 						      xtlck->lwm.
2508 						      offset) : index;
2509 
2510 			/* if delete from middle,
2511 			 * shift left/compact the remaining entries in the page
2512 			 */
2513 			if (index < nextindex - 1)
2514 				memmove(&p->xad[index], &p->xad[index + 1],
2515 					(nextindex - index -
2516 					 1) << L2XTSLOTSIZE);
2517 
2518 			le16_add_cpu(&p->header.nextindex, -1);
2519 			jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
2520 				 (ulong) parent->bn, index);
2521 		}
2522 
2523 		/* unpin the parent page */
2524 		XT_PUTPAGE(mp);
2525 
2526 		/* exit propagation up */
2527 		break;
2528 	}
2529 
2530 	return 0;
2531 }
2532 
2533 
2534 /*
2535  * NAME:	xtRelocate()
2536  *
2537  * FUNCTION:	relocate xtpage or data extent of regular file;
2538  *		This function is mainly used by defragfs utility.
2539  *
2540  * NOTE:	This routine does not have the logic to handle
2541  *		uncommitted allocated extent. The caller should call
2542  *		txCommit() to commit all the allocation before call
2543  *		this routine.
2544  */
2545 int
2546 xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad,	/* old XAD */
2547 	   s64 nxaddr,		/* new xaddr */
2548 	   int xtype)
2549 {				/* extent type: XTPAGE or DATAEXT */
2550 	int rc = 0;
2551 	struct tblock *tblk;
2552 	struct tlock *tlck;
2553 	struct xtlock *xtlck;
2554 	struct metapage *mp, *pmp, *lmp, *rmp;	/* meta-page buffer */
2555 	xtpage_t *p, *pp, *rp, *lp;	/* base B+-tree index page */
2556 	xad_t *xad;
2557 	pxd_t *pxd;
2558 	s64 xoff, xsize;
2559 	int xlen;
2560 	s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
2561 	cbuf_t *cp;
2562 	s64 offset, nbytes, nbrd, pno;
2563 	int nb, npages, nblks;
2564 	s64 bn;
2565 	int cmp;
2566 	int index;
2567 	struct pxd_lock *pxdlock;
2568 	struct btstack btstack;	/* traverse stack */
2569 
2570 	xtype = xtype & EXTENT_TYPE;
2571 
2572 	xoff = offsetXAD(oxad);
2573 	oxaddr = addressXAD(oxad);
2574 	xlen = lengthXAD(oxad);
2575 
2576 	/* validate extent offset */
2577 	offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2578 	if (offset >= ip->i_size)
2579 		return -ESTALE;	/* stale extent */
2580 
2581 	jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
2582 		 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2583 
2584 	/*
2585 	 *	1. get and validate the parent xtpage/xad entry
2586 	 *	covering the source extent to be relocated;
2587 	 */
2588 	if (xtype == DATAEXT) {
2589 		/* search in leaf entry */
2590 		rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
2591 		if (rc)
2592 			return rc;
2593 
2594 		/* retrieve search result */
2595 		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2596 
2597 		if (cmp) {
2598 			XT_PUTPAGE(pmp);
2599 			return -ESTALE;
2600 		}
2601 
2602 		/* validate for exact match with a single entry */
2603 		xad = &pp->xad[index];
2604 		if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
2605 			XT_PUTPAGE(pmp);
2606 			return -ESTALE;
2607 		}
2608 	} else {		/* (xtype == XTPAGE) */
2609 
2610 		/* search in internal entry */
2611 		rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
2612 		if (rc)
2613 			return rc;
2614 
2615 		/* retrieve search result */
2616 		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2617 
2618 		if (cmp) {
2619 			XT_PUTPAGE(pmp);
2620 			return -ESTALE;
2621 		}
2622 
2623 		/* xtSearchNode() validated for exact match with a single entry
2624 		 */
2625 		xad = &pp->xad[index];
2626 	}
2627 	jfs_info("xtRelocate: parent xad entry validated.");
2628 
2629 	/*
2630 	 *	2. relocate the extent
2631 	 */
2632 	if (xtype == DATAEXT) {
2633 		/* if the extent is allocated-but-not-recorded
2634 		 * there is no real data to be moved in this extent,
2635 		 */
2636 		if (xad->flag & XAD_NOTRECORDED)
2637 			goto out;
2638 		else
2639 			/* release xtpage for cmRead()/xtLookup() */
2640 			XT_PUTPAGE(pmp);
2641 
2642 		/*
2643 		 *	cmRelocate()
2644 		 *
2645 		 * copy target data pages to be relocated;
2646 		 *
2647 		 * data extent must start at page boundary and
2648 		 * multiple of page size (except the last data extent);
2649 		 * read in each page of the source data extent into cbuf,
2650 		 * update the cbuf extent descriptor of the page to be
2651 		 * homeward bound to new dst data extent
2652 		 * copy the data from the old extent to new extent.
2653 		 * copy is essential for compressed files to avoid problems
2654 		 * that can arise if there was a change in compression
2655 		 * algorithms.
2656 		 * it is a good strategy because it may disrupt cache
2657 		 * policy to keep the pages in memory afterwards.
2658 		 */
2659 		offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2660 		assert((offset & CM_OFFSET) == 0);
2661 		nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2662 		pno = offset >> CM_L2BSIZE;
2663 		npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2664 /*
2665 		npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2666 			  (offset >> CM_L2BSIZE) + 1;
2667 */
2668 		sxaddr = oxaddr;
2669 		dxaddr = nxaddr;
2670 
2671 		/* process the request one cache buffer at a time */
2672 		for (nbrd = 0; nbrd < nbytes; nbrd += nb,
2673 		     offset += nb, pno++, npages--) {
2674 			/* compute page size */
2675 			nb = min(nbytes - nbrd, CM_BSIZE);
2676 
2677 			/* get the cache buffer of the page */
2678 			if (rc = cmRead(ip, offset, npages, &cp))
2679 				break;
2680 
2681 			assert(addressPXD(&cp->cm_pxd) == sxaddr);
2682 			assert(!cp->cm_modified);
2683 
2684 			/* bind buffer with the new extent address */
2685 			nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
2686 			cmSetXD(ip, cp, pno, dxaddr, nblks);
2687 
2688 			/* release the cbuf, mark it as modified */
2689 			cmPut(cp, true);
2690 
2691 			dxaddr += nblks;
2692 			sxaddr += nblks;
2693 		}
2694 
2695 		/* get back parent page */
2696 		if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
2697 			return rc;
2698 
2699 		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2700 		jfs_info("xtRelocate: target data extent relocated.");
2701 	} else {		/* (xtype == XTPAGE) */
2702 
2703 		/*
2704 		 * read in the target xtpage from the source extent;
2705 		 */
2706 		XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2707 		if (rc) {
2708 			XT_PUTPAGE(pmp);
2709 			return rc;
2710 		}
2711 
2712 		/*
2713 		 * read in sibling pages if any to update sibling pointers;
2714 		 */
2715 		rmp = NULL;
2716 		if (p->header.next) {
2717 			nextbn = le64_to_cpu(p->header.next);
2718 			XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2719 			if (rc) {
2720 				XT_PUTPAGE(pmp);
2721 				XT_PUTPAGE(mp);
2722 				return (rc);
2723 			}
2724 		}
2725 
2726 		lmp = NULL;
2727 		if (p->header.prev) {
2728 			prevbn = le64_to_cpu(p->header.prev);
2729 			XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2730 			if (rc) {
2731 				XT_PUTPAGE(pmp);
2732 				XT_PUTPAGE(mp);
2733 				if (rmp)
2734 					XT_PUTPAGE(rmp);
2735 				return (rc);
2736 			}
2737 		}
2738 
2739 		/* at this point, all xtpages to be updated are in memory */
2740 
2741 		/*
2742 		 * update sibling pointers of sibling xtpages if any;
2743 		 */
2744 		if (lmp) {
2745 			BT_MARK_DIRTY(lmp, ip);
2746 			tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
2747 			lp->header.next = cpu_to_le64(nxaddr);
2748 			XT_PUTPAGE(lmp);
2749 		}
2750 
2751 		if (rmp) {
2752 			BT_MARK_DIRTY(rmp, ip);
2753 			tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
2754 			rp->header.prev = cpu_to_le64(nxaddr);
2755 			XT_PUTPAGE(rmp);
2756 		}
2757 
2758 		/*
2759 		 * update the target xtpage to be relocated
2760 		 *
2761 		 * update the self address of the target page
2762 		 * and write to destination extent;
2763 		 * redo image covers the whole xtpage since it is new page
2764 		 * to the destination extent;
2765 		 * update of bmap for the free of source extent
2766 		 * of the target xtpage itself:
2767 		 * update of bmap for the allocation of destination extent
2768 		 * of the target xtpage itself:
2769 		 * update of bmap for the extents covered by xad entries in
2770 		 * the target xtpage is not necessary since they are not
2771 		 * updated;
2772 		 * if not committed before this relocation,
2773 		 * target page may contain XAD_NEW entries which must
2774 		 * be scanned for bmap update (logredo() always
2775 		 * scan xtpage REDOPAGE image for bmap update);
2776 		 * if committed before this relocation (tlckRELOCATE),
2777 		 * scan may be skipped by commit() and logredo();
2778 		 */
2779 		BT_MARK_DIRTY(mp, ip);
2780 		/* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
2781 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
2782 		xtlck = (struct xtlock *) & tlck->lock;
2783 
2784 		/* update the self address in the xtpage header */
2785 		pxd = &p->header.self;
2786 		PXDaddress(pxd, nxaddr);
2787 
2788 		/* linelock for the after image of the whole page */
2789 		xtlck->lwm.length =
2790 		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
2791 
2792 		/* update the buffer extent descriptor of target xtpage */
2793 		xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2794 		bmSetXD(mp, nxaddr, xsize);
2795 
2796 		/* unpin the target page to new homeward bound */
2797 		XT_PUTPAGE(mp);
2798 		jfs_info("xtRelocate: target xtpage relocated.");
2799 	}
2800 
2801 	/*
2802 	 *	3. acquire maplock for the source extent to be freed;
2803 	 *
2804 	 * acquire a maplock saving the src relocated extent address;
2805 	 * to free of the extent at commit time;
2806 	 */
2807       out:
2808 	/* if DATAEXT relocation, write a LOG_UPDATEMAP record for
2809 	 * free PXD of the source data extent (logredo() will update
2810 	 * bmap for free of source data extent), and update bmap for
2811 	 * free of the source data extent;
2812 	 */
2813 	if (xtype == DATAEXT)
2814 		tlck = txMaplock(tid, ip, tlckMAP);
2815 	/* if XTPAGE relocation, write a LOG_NOREDOPAGE record
2816 	 * for the source xtpage (logredo() will init NoRedoPage
2817 	 * filter and will also update bmap for free of the source
2818 	 * xtpage), and update bmap for free of the source xtpage;
2819 	 * N.B. We use tlckMAP instead of tlkcXTREE because there
2820 	 *      is no buffer associated with this lock since the buffer
2821 	 *      has been redirected to the target location.
2822 	 */
2823 	else			/* (xtype == XTPAGE) */
2824 		tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
2825 
2826 	pxdlock = (struct pxd_lock *) & tlck->lock;
2827 	pxdlock->flag = mlckFREEPXD;
2828 	PXDaddress(&pxdlock->pxd, oxaddr);
2829 	PXDlength(&pxdlock->pxd, xlen);
2830 	pxdlock->index = 1;
2831 
2832 	/*
2833 	 *	4. update the parent xad entry for relocation;
2834 	 *
2835 	 * acquire tlck for the parent entry with XAD_NEW as entry
2836 	 * update which will write LOG_REDOPAGE and update bmap for
2837 	 * allocation of XAD_NEW destination extent;
2838 	 */
2839 	jfs_info("xtRelocate: update parent xad entry.");
2840 	BT_MARK_DIRTY(pmp, ip);
2841 	tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
2842 	xtlck = (struct xtlock *) & tlck->lock;
2843 
2844 	/* update the XAD with the new destination extent; */
2845 	xad = &pp->xad[index];
2846 	xad->flag |= XAD_NEW;
2847 	XADaddress(xad, nxaddr);
2848 
2849 	xtlck->lwm.offset = min(index, xtlck->lwm.offset);
2850 	xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
2851 	    xtlck->lwm.offset;
2852 
2853 	/* unpin the parent xtpage */
2854 	XT_PUTPAGE(pmp);
2855 
2856 	return rc;
2857 }
2858 
2859 
2860 /*
2861  *	xtSearchNode()
2862  *
2863  * function:	search for the internal xad entry covering specified extent.
2864  *		This function is mainly used by defragfs utility.
2865  *
2866  * parameters:
2867  *	ip	- file object;
2868  *	xad	- extent to find;
2869  *	cmpp	- comparison result:
2870  *	btstack - traverse stack;
2871  *	flag	- search process flag;
2872  *
2873  * returns:
2874  *	btstack contains (bn, index) of search path traversed to the entry.
2875  *	*cmpp is set to result of comparison with the entry returned.
2876  *	the page containing the entry is pinned at exit.
2877  */
2878 static int xtSearchNode(struct inode *ip, xad_t * xad,	/* required XAD entry */
2879 			int *cmpp, struct btstack * btstack, int flag)
2880 {
2881 	int rc = 0;
2882 	s64 xoff, xaddr;
2883 	int xlen;
2884 	int cmp = 1;		/* init for empty page */
2885 	s64 bn;			/* block number */
2886 	struct metapage *mp;	/* meta-page buffer */
2887 	xtpage_t *p;		/* page */
2888 	int base, index, lim;
2889 	struct btframe *btsp;
2890 	s64 t64;
2891 
2892 	BT_CLR(btstack);
2893 
2894 	xoff = offsetXAD(xad);
2895 	xlen = lengthXAD(xad);
2896 	xaddr = addressXAD(xad);
2897 
2898 	/*
2899 	 *	search down tree from root:
2900 	 *
2901 	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
2902 	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
2903 	 *
2904 	 * if entry with search key K is not found
2905 	 * internal page search find the entry with largest key Ki
2906 	 * less than K which point to the child page to search;
2907 	 * leaf page search find the entry with smallest key Kj
2908 	 * greater than K so that the returned index is the position of
2909 	 * the entry to be shifted right for insertion of new entry.
2910 	 * for empty tree, search key is greater than any key of the tree.
2911 	 *
2912 	 * by convention, root bn = 0.
2913 	 */
2914 	for (bn = 0;;) {
2915 		/* get/pin the page to search */
2916 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2917 		if (rc)
2918 			return rc;
2919 		if (p->header.flag & BT_LEAF) {
2920 			XT_PUTPAGE(mp);
2921 			return -ESTALE;
2922 		}
2923 
2924 		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
2925 
2926 		/*
2927 		 * binary search with search key K on the current page
2928 		 */
2929 		for (base = XTENTRYSTART; lim; lim >>= 1) {
2930 			index = base + (lim >> 1);
2931 
2932 			XT_CMP(cmp, xoff, &p->xad[index], t64);
2933 			if (cmp == 0) {
2934 				/*
2935 				 *	search hit
2936 				 *
2937 				 * verify for exact match;
2938 				 */
2939 				if (xaddr == addressXAD(&p->xad[index]) &&
2940 				    xoff == offsetXAD(&p->xad[index])) {
2941 					*cmpp = cmp;
2942 
2943 					/* save search result */
2944 					btsp = btstack->top;
2945 					btsp->bn = bn;
2946 					btsp->index = index;
2947 					btsp->mp = mp;
2948 
2949 					return 0;
2950 				}
2951 
2952 				/* descend/search its child page */
2953 				goto next;
2954 			}
2955 
2956 			if (cmp > 0) {
2957 				base = index + 1;
2958 				--lim;
2959 			}
2960 		}
2961 
2962 		/*
2963 		 *	search miss - non-leaf page:
2964 		 *
2965 		 * base is the smallest index with key (Kj) greater than
2966 		 * search key (K) and may be zero or maxentry index.
2967 		 * if base is non-zero, decrement base by one to get the parent
2968 		 * entry of the child page to search.
2969 		 */
2970 		index = base ? base - 1 : base;
2971 
2972 		/*
2973 		 * go down to child page
2974 		 */
2975 	      next:
2976 		/* get the child page block number */
2977 		bn = addressXAD(&p->xad[index]);
2978 
2979 		/* unpin the parent page */
2980 		XT_PUTPAGE(mp);
2981 	}
2982 }
2983 
2984 
2985 /*
2986  *	xtRelink()
2987  *
2988  * function:
2989  *	link around a freed page.
2990  *
2991  * Parameter:
2992  *	int		tid,
2993  *	struct inode	*ip,
2994  *	xtpage_t	*p)
2995  *
2996  * returns:
2997  */
2998 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
2999 {
3000 	int rc = 0;
3001 	struct metapage *mp;
3002 	s64 nextbn, prevbn;
3003 	struct tlock *tlck;
3004 
3005 	nextbn = le64_to_cpu(p->header.next);
3006 	prevbn = le64_to_cpu(p->header.prev);
3007 
3008 	/* update prev pointer of the next page */
3009 	if (nextbn != 0) {
3010 		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
3011 		if (rc)
3012 			return rc;
3013 
3014 		/*
3015 		 * acquire a transaction lock on the page;
3016 		 *
3017 		 * action: update prev pointer;
3018 		 */
3019 		BT_MARK_DIRTY(mp, ip);
3020 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3021 
3022 		/* the page may already have been tlock'd */
3023 
3024 		p->header.prev = cpu_to_le64(prevbn);
3025 
3026 		XT_PUTPAGE(mp);
3027 	}
3028 
3029 	/* update next pointer of the previous page */
3030 	if (prevbn != 0) {
3031 		XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
3032 		if (rc)
3033 			return rc;
3034 
3035 		/*
3036 		 * acquire a transaction lock on the page;
3037 		 *
3038 		 * action: update next pointer;
3039 		 */
3040 		BT_MARK_DIRTY(mp, ip);
3041 		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3042 
3043 		/* the page may already have been tlock'd */
3044 
3045 		p->header.next = le64_to_cpu(nextbn);
3046 
3047 		XT_PUTPAGE(mp);
3048 	}
3049 
3050 	return 0;
3051 }
3052 #endif				/*  _STILL_TO_PORT */
3053 
3054 
3055 /*
3056  *	xtInitRoot()
3057  *
3058  * initialize file root (inline in inode)
3059  */
3060 void xtInitRoot(tid_t tid, struct inode *ip)
3061 {
3062 	xtpage_t *p;
3063 
3064 	/*
3065 	 * acquire a transaction lock on the root
3066 	 *
3067 	 * action:
3068 	 */
3069 	txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
3070 		      tlckXTREE | tlckNEW);
3071 	p = &JFS_IP(ip)->i_xtroot;
3072 
3073 	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
3074 	p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3075 
3076 	if (S_ISDIR(ip->i_mode))
3077 		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
3078 	else {
3079 		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
3080 		ip->i_size = 0;
3081 	}
3082 
3083 
3084 	return;
3085 }
3086 
3087 
3088 /*
3089  * We can run into a deadlock truncating a file with a large number of
3090  * xtree pages (large fragmented file).  A robust fix would entail a
3091  * reservation system where we would reserve a number of metadata pages
3092  * and tlocks which we would be guaranteed without a deadlock.  Without
3093  * this, a partial fix is to limit number of metadata pages we will lock
3094  * in a single transaction.  Currently we will truncate the file so that
3095  * no more than 50 leaf pages will be locked.  The caller of xtTruncate
3096  * will be responsible for ensuring that the current transaction gets
3097  * committed, and that subsequent transactions are created to truncate
3098  * the file further if needed.
3099  */
3100 #define MAX_TRUNCATE_LEAVES 50
3101 
3102 /*
3103  *	xtTruncate()
3104  *
3105  * function:
3106  *	traverse for truncation logging backward bottom up;
3107  *	terminate at the last extent entry at the current subtree
3108  *	root page covering new down size.
3109  *	truncation may occur within the last extent entry.
3110  *
3111  * parameter:
3112  *	int		tid,
3113  *	struct inode	*ip,
3114  *	s64		newsize,
3115  *	int		type)	{PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3116  *
3117  * return:
3118  *
3119  * note:
3120  *	PWMAP:
3121  *	 1. truncate (non-COMMIT_NOLINK file)
3122  *	    by jfs_truncate() or jfs_open(O_TRUNC):
3123  *	    xtree is updated;
3124  *	 2. truncate index table of directory when last entry removed
3125  *	map update via tlock at commit time;
3126  *	PMAP:
3127  *	 Call xtTruncate_pmap instead
3128  *	WMAP:
3129  *	 1. remove (free zero link count) on last reference release
3130  *	    (pmap has been freed at commit zero link count);
3131  *	 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3132  *	    xtree is updated;
3133  *	 map update directly at truncation time;
3134  *
3135  *	if (DELETE)
3136  *		no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3137  *	else if (TRUNCATE)
3138  *		must write LOG_NOREDOPAGE for deleted index page;
3139  *
3140  * pages may already have been tlocked by anonymous transactions
3141  * during file growth (i.e., write) before truncation;
3142  *
3143  * except last truncated entry, deleted entries remains as is
3144  * in the page (nextindex is updated) for other use
3145  * (e.g., log/update allocation map): this avoid copying the page
3146  * info but delay free of pages;
3147  *
3148  */
3149 s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3150 {
3151 	int rc = 0;
3152 	s64 teof;
3153 	struct metapage *mp;
3154 	xtpage_t *p;
3155 	s64 bn;
3156 	int index, nextindex;
3157 	xad_t *xad;
3158 	s64 xoff, xaddr;
3159 	int xlen, len, freexlen;
3160 	struct btstack btstack;
3161 	struct btframe *parent;
3162 	struct tblock *tblk = NULL;
3163 	struct tlock *tlck = NULL;
3164 	struct xtlock *xtlck = NULL;
3165 	struct xdlistlock xadlock;	/* maplock for COMMIT_WMAP */
3166 	struct pxd_lock *pxdlock;		/* maplock for COMMIT_WMAP */
3167 	s64 nfreed;
3168 	int freed, log;
3169 	int locked_leaves = 0;
3170 
3171 	/* save object truncation type */
3172 	if (tid) {
3173 		tblk = tid_to_tblock(tid);
3174 		tblk->xflag |= flag;
3175 	}
3176 
3177 	nfreed = 0;
3178 
3179 	flag &= COMMIT_MAP;
3180 	assert(flag != COMMIT_PMAP);
3181 
3182 	if (flag == COMMIT_PWMAP)
3183 		log = 1;
3184 	else {
3185 		log = 0;
3186 		xadlock.flag = mlckFREEXADLIST;
3187 		xadlock.index = 1;
3188 	}
3189 
3190 	/*
3191 	 * if the newsize is not an integral number of pages,
3192 	 * the file between newsize and next page boundary will
3193 	 * be cleared.
3194 	 * if truncating into a file hole, it will cause
3195 	 * a full block to be allocated for the logical block.
3196 	 */
3197 
3198 	/*
3199 	 * release page blocks of truncated region <teof, eof>
3200 	 *
3201 	 * free the data blocks from the leaf index blocks.
3202 	 * delete the parent index entries corresponding to
3203 	 * the freed child data/index blocks.
3204 	 * free the index blocks themselves which aren't needed
3205 	 * in new sized file.
3206 	 *
3207 	 * index blocks are updated only if the blocks are to be
3208 	 * retained in the new sized file.
3209 	 * if type is PMAP, the data and index pages are NOT
3210 	 * freed, and the data and index blocks are NOT freed
3211 	 * from working map.
3212 	 * (this will allow continued access of data/index of
3213 	 * temporary file (zerolink count file truncated to zero-length)).
3214 	 */
3215 	teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
3216 	    JFS_SBI(ip->i_sb)->l2bsize;
3217 
3218 	/* clear stack */
3219 	BT_CLR(&btstack);
3220 
3221 	/*
3222 	 * start with root
3223 	 *
3224 	 * root resides in the inode
3225 	 */
3226 	bn = 0;
3227 
3228 	/*
3229 	 * first access of each page:
3230 	 */
3231       getPage:
3232 	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3233 	if (rc)
3234 		return rc;
3235 
3236 	/* process entries backward from last index */
3237 	index = le16_to_cpu(p->header.nextindex) - 1;
3238 
3239 
3240 	/* Since this is the rightmost page at this level, and we may have
3241 	 * already freed a page that was formerly to the right, let's make
3242 	 * sure that the next pointer is zero.
3243 	 */
3244 	if (p->header.next) {
3245 		if (log)
3246 			/*
3247 			 * Make sure this change to the header is logged.
3248 			 * If we really truncate this leaf, the flag
3249 			 * will be changed to tlckTRUNCATE
3250 			 */
3251 			tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
3252 		BT_MARK_DIRTY(mp, ip);
3253 		p->header.next = 0;
3254 	}
3255 
3256 	if (p->header.flag & BT_INTERNAL)
3257 		goto getChild;
3258 
3259 	/*
3260 	 *	leaf page
3261 	 */
3262 	freed = 0;
3263 
3264 	/* does region covered by leaf page precede Teof ? */
3265 	xad = &p->xad[index];
3266 	xoff = offsetXAD(xad);
3267 	xlen = lengthXAD(xad);
3268 	if (teof >= xoff + xlen) {
3269 		XT_PUTPAGE(mp);
3270 		goto getParent;
3271 	}
3272 
3273 	/* (re)acquire tlock of the leaf page */
3274 	if (log) {
3275 		if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
3276 			/*
3277 			 * We need to limit the size of the transaction
3278 			 * to avoid exhausting pagecache & tlocks
3279 			 */
3280 			XT_PUTPAGE(mp);
3281 			newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
3282 			goto getParent;
3283 		}
3284 		tlck = txLock(tid, ip, mp, tlckXTREE);
3285 		tlck->type = tlckXTREE | tlckTRUNCATE;
3286 		xtlck = (struct xtlock *) & tlck->lock;
3287 		xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
3288 	}
3289 	BT_MARK_DIRTY(mp, ip);
3290 
3291 	/*
3292 	 * scan backward leaf page entries
3293 	 */
3294 	for (; index >= XTENTRYSTART; index--) {
3295 		xad = &p->xad[index];
3296 		xoff = offsetXAD(xad);
3297 		xlen = lengthXAD(xad);
3298 		xaddr = addressXAD(xad);
3299 
3300 		/*
3301 		 * The "data" for a directory is indexed by the block
3302 		 * device's address space.  This metadata must be invalidated
3303 		 * here
3304 		 */
3305 		if (S_ISDIR(ip->i_mode) && (teof == 0))
3306 			invalidate_xad_metapages(ip, *xad);
3307 		/*
3308 		 * entry beyond eof: continue scan of current page
3309 		 *          xad
3310 		 * ---|---=======------->
3311 		 *   eof
3312 		 */
3313 		if (teof < xoff) {
3314 			nfreed += xlen;
3315 			continue;
3316 		}
3317 
3318 		/*
3319 		 * (xoff <= teof): last entry to be deleted from page;
3320 		 * If other entries remain in page: keep and update the page.
3321 		 */
3322 
3323 		/*
3324 		 * eof == entry_start: delete the entry
3325 		 *           xad
3326 		 * -------|=======------->
3327 		 *       eof
3328 		 *
3329 		 */
3330 		if (teof == xoff) {
3331 			nfreed += xlen;
3332 
3333 			if (index == XTENTRYSTART)
3334 				break;
3335 
3336 			nextindex = index;
3337 		}
3338 		/*
3339 		 * eof within the entry: truncate the entry.
3340 		 *          xad
3341 		 * -------===|===------->
3342 		 *          eof
3343 		 */
3344 		else if (teof < xoff + xlen) {
3345 			/* update truncated entry */
3346 			len = teof - xoff;
3347 			freexlen = xlen - len;
3348 			XADlength(xad, len);
3349 
3350 			/* save pxd of truncated extent in tlck */
3351 			xaddr += len;
3352 			if (log) {	/* COMMIT_PWMAP */
3353 				xtlck->lwm.offset = (xtlck->lwm.offset) ?
3354 				    min(index, (int)xtlck->lwm.offset) : index;
3355 				xtlck->lwm.length = index + 1 -
3356 				    xtlck->lwm.offset;
3357 				xtlck->twm.offset = index;
3358 				pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
3359 				pxdlock->flag = mlckFREEPXD;
3360 				PXDaddress(&pxdlock->pxd, xaddr);
3361 				PXDlength(&pxdlock->pxd, freexlen);
3362 			}
3363 			/* free truncated extent */
3364 			else {	/* COMMIT_WMAP */
3365 
3366 				pxdlock = (struct pxd_lock *) & xadlock;
3367 				pxdlock->flag = mlckFREEPXD;
3368 				PXDaddress(&pxdlock->pxd, xaddr);
3369 				PXDlength(&pxdlock->pxd, freexlen);
3370 				txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
3371 
3372 				/* reset map lock */
3373 				xadlock.flag = mlckFREEXADLIST;
3374 			}
3375 
3376 			/* current entry is new last entry; */
3377 			nextindex = index + 1;
3378 
3379 			nfreed += freexlen;
3380 		}
3381 		/*
3382 		 * eof beyond the entry:
3383 		 *          xad
3384 		 * -------=======---|--->
3385 		 *                 eof
3386 		 */
3387 		else {		/* (xoff + xlen < teof) */
3388 
3389 			nextindex = index + 1;
3390 		}
3391 
3392 		if (nextindex < le16_to_cpu(p->header.nextindex)) {
3393 			if (!log) {	/* COMMIT_WAMP */
3394 				xadlock.xdlist = &p->xad[nextindex];
3395 				xadlock.count =
3396 				    le16_to_cpu(p->header.nextindex) -
3397 				    nextindex;
3398 				txFreeMap(ip, (struct maplock *) & xadlock,
3399 					  NULL, COMMIT_WMAP);
3400 			}
3401 			p->header.nextindex = cpu_to_le16(nextindex);
3402 		}
3403 
3404 		XT_PUTPAGE(mp);
3405 
3406 		/* assert(freed == 0); */
3407 		goto getParent;
3408 	}			/* end scan of leaf page entries */
3409 
3410 	freed = 1;
3411 
3412 	/*
3413 	 * leaf page become empty: free the page if type != PMAP
3414 	 */
3415 	if (log) {		/* COMMIT_PWMAP */
3416 		/* txCommit() with tlckFREE:
3417 		 * free data extents covered by leaf [XTENTRYSTART:hwm);
3418 		 * invalidate leaf if COMMIT_PWMAP;
3419 		 * if (TRUNCATE), will write LOG_NOREDOPAGE;
3420 		 */
3421 		tlck->type = tlckXTREE | tlckFREE;
3422 	} else {		/* COMMIT_WAMP */
3423 
3424 		/* free data extents covered by leaf */
3425 		xadlock.xdlist = &p->xad[XTENTRYSTART];
3426 		xadlock.count =
3427 		    le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
3428 		txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
3429 	}
3430 
3431 	if (p->header.flag & BT_ROOT) {
3432 		p->header.flag &= ~BT_INTERNAL;
3433 		p->header.flag |= BT_LEAF;
3434 		p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3435 
3436 		XT_PUTPAGE(mp);	/* debug */
3437 		goto out;
3438 	} else {
3439 		if (log) {	/* COMMIT_PWMAP */
3440 			/* page will be invalidated at tx completion
3441 			 */
3442 			XT_PUTPAGE(mp);
3443 		} else {	/* COMMIT_WMAP */
3444 
3445 			if (mp->lid)
3446 				lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
3447 
3448 			/* invalidate empty leaf page */
3449 			discard_metapage(mp);
3450 		}
3451 	}
3452 
3453 	/*
3454 	 * the leaf page become empty: delete the parent entry
3455 	 * for the leaf page if the parent page is to be kept
3456 	 * in the new sized file.
3457 	 */
3458 
3459 	/*
3460 	 * go back up to the parent page
3461 	 */
3462       getParent:
3463 	/* pop/restore parent entry for the current child page */
3464 	if ((parent = BT_POP(&btstack)) == NULL)
3465 		/* current page must have been root */
3466 		goto out;
3467 
3468 	/* get back the parent page */
3469 	bn = parent->bn;
3470 	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3471 	if (rc)
3472 		return rc;
3473 
3474 	index = parent->index;
3475 
3476 	/*
3477 	 * child page was not empty:
3478 	 */
3479 	if (freed == 0) {
3480 		/* has any entry deleted from parent ? */
3481 		if (index < le16_to_cpu(p->header.nextindex) - 1) {
3482 			/* (re)acquire tlock on the parent page */
3483 			if (log) {	/* COMMIT_PWMAP */
3484 				/* txCommit() with tlckTRUNCATE:
3485 				 * free child extents covered by parent [);
3486 				 */
3487 				tlck = txLock(tid, ip, mp, tlckXTREE);
3488 				xtlck = (struct xtlock *) & tlck->lock;
3489 				if (!(tlck->type & tlckTRUNCATE)) {
3490 					xtlck->hwm.offset =
3491 					    le16_to_cpu(p->header.
3492 							nextindex) - 1;
3493 					tlck->type =
3494 					    tlckXTREE | tlckTRUNCATE;
3495 				}
3496 			} else {	/* COMMIT_WMAP */
3497 
3498 				/* free child extents covered by parent */
3499 				xadlock.xdlist = &p->xad[index + 1];
3500 				xadlock.count =
3501 				    le16_to_cpu(p->header.nextindex) -
3502 				    index - 1;
3503 				txFreeMap(ip, (struct maplock *) & xadlock,
3504 					  NULL, COMMIT_WMAP);
3505 			}
3506 			BT_MARK_DIRTY(mp, ip);
3507 
3508 			p->header.nextindex = cpu_to_le16(index + 1);
3509 		}
3510 		XT_PUTPAGE(mp);
3511 		goto getParent;
3512 	}
3513 
3514 	/*
3515 	 * child page was empty:
3516 	 */
3517 	nfreed += lengthXAD(&p->xad[index]);
3518 
3519 	/*
3520 	 * During working map update, child page's tlock must be handled
3521 	 * before parent's.  This is because the parent's tlock will cause
3522 	 * the child's disk space to be marked available in the wmap, so
3523 	 * it's important that the child page be released by that time.
3524 	 *
3525 	 * ToDo:  tlocks should be on doubly-linked list, so we can
3526 	 * quickly remove it and add it to the end.
3527 	 */
3528 
3529 	/*
3530 	 * Move parent page's tlock to the end of the tid's tlock list
3531 	 */
3532 	if (log && mp->lid && (tblk->last != mp->lid) &&
3533 	    lid_to_tlock(mp->lid)->tid) {
3534 		lid_t lid = mp->lid;
3535 		struct tlock *prev;
3536 
3537 		tlck = lid_to_tlock(lid);
3538 
3539 		if (tblk->next == lid)
3540 			tblk->next = tlck->next;
3541 		else {
3542 			for (prev = lid_to_tlock(tblk->next);
3543 			     prev->next != lid;
3544 			     prev = lid_to_tlock(prev->next)) {
3545 				assert(prev->next);
3546 			}
3547 			prev->next = tlck->next;
3548 		}
3549 		lid_to_tlock(tblk->last)->next = lid;
3550 		tlck->next = 0;
3551 		tblk->last = lid;
3552 	}
3553 
3554 	/*
3555 	 * parent page become empty: free the page
3556 	 */
3557 	if (index == XTENTRYSTART) {
3558 		if (log) {	/* COMMIT_PWMAP */
3559 			/* txCommit() with tlckFREE:
3560 			 * free child extents covered by parent;
3561 			 * invalidate parent if COMMIT_PWMAP;
3562 			 */
3563 			tlck = txLock(tid, ip, mp, tlckXTREE);
3564 			xtlck = (struct xtlock *) & tlck->lock;
3565 			xtlck->hwm.offset =
3566 			    le16_to_cpu(p->header.nextindex) - 1;
3567 			tlck->type = tlckXTREE | tlckFREE;
3568 		} else {	/* COMMIT_WMAP */
3569 
3570 			/* free child extents covered by parent */
3571 			xadlock.xdlist = &p->xad[XTENTRYSTART];
3572 			xadlock.count =
3573 			    le16_to_cpu(p->header.nextindex) -
3574 			    XTENTRYSTART;
3575 			txFreeMap(ip, (struct maplock *) & xadlock, NULL,
3576 				  COMMIT_WMAP);
3577 		}
3578 		BT_MARK_DIRTY(mp, ip);
3579 
3580 		if (p->header.flag & BT_ROOT) {
3581 			p->header.flag &= ~BT_INTERNAL;
3582 			p->header.flag |= BT_LEAF;
3583 			p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3584 			if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
3585 				/*
3586 				 * Shrink root down to allow inline
3587 				 * EA (otherwise fsck complains)
3588 				 */
3589 				p->header.maxentry =
3590 				    cpu_to_le16(XTROOTINITSLOT);
3591 				JFS_IP(ip)->mode2 |= INLINEEA;
3592 			}
3593 
3594 			XT_PUTPAGE(mp);	/* debug */
3595 			goto out;
3596 		} else {
3597 			if (log) {	/* COMMIT_PWMAP */
3598 				/* page will be invalidated at tx completion
3599 				 */
3600 				XT_PUTPAGE(mp);
3601 			} else {	/* COMMIT_WMAP */
3602 
3603 				if (mp->lid)
3604 					lid_to_tlock(mp->lid)->flag |=
3605 						tlckFREELOCK;
3606 
3607 				/* invalidate parent page */
3608 				discard_metapage(mp);
3609 			}
3610 
3611 			/* parent has become empty and freed:
3612 			 * go back up to its parent page
3613 			 */
3614 			/* freed = 1; */
3615 			goto getParent;
3616 		}
3617 	}
3618 	/*
3619 	 * parent page still has entries for front region;
3620 	 */
3621 	else {
3622 		/* try truncate region covered by preceding entry
3623 		 * (process backward)
3624 		 */
3625 		index--;
3626 
3627 		/* go back down to the child page corresponding
3628 		 * to the entry
3629 		 */
3630 		goto getChild;
3631 	}
3632 
3633 	/*
3634 	 *	internal page: go down to child page of current entry
3635 	 */
3636       getChild:
3637 	/* save current parent entry for the child page */
3638 	if (BT_STACK_FULL(&btstack)) {
3639 		jfs_error(ip->i_sb, "stack overrun!\n");
3640 		XT_PUTPAGE(mp);
3641 		return -EIO;
3642 	}
3643 	BT_PUSH(&btstack, bn, index);
3644 
3645 	/* get child page */
3646 	xad = &p->xad[index];
3647 	bn = addressXAD(xad);
3648 
3649 	/*
3650 	 * first access of each internal entry:
3651 	 */
3652 	/* release parent page */
3653 	XT_PUTPAGE(mp);
3654 
3655 	/* process the child page */
3656 	goto getPage;
3657 
3658       out:
3659 	/*
3660 	 * update file resource stat
3661 	 */
3662 	/* set size
3663 	 */
3664 	if (S_ISDIR(ip->i_mode) && !newsize)
3665 		ip->i_size = 1;	/* fsck hates zero-length directories */
3666 	else
3667 		ip->i_size = newsize;
3668 
3669 	/* update quota allocation to reflect freed blocks */
3670 	dquot_free_block(ip, nfreed);
3671 
3672 	/*
3673 	 * free tlock of invalidated pages
3674 	 */
3675 	if (flag == COMMIT_WMAP)
3676 		txFreelock(ip);
3677 
3678 	return newsize;
3679 }
3680 
3681 
3682 /*
3683  *	xtTruncate_pmap()
3684  *
3685  * function:
3686  *	Perform truncate to zero length for deleted file, leaving the
3687  *	the xtree and working map untouched.  This allows the file to
3688  *	be accessed via open file handles, while the delete of the file
3689  *	is committed to disk.
3690  *
3691  * parameter:
3692  *	tid_t		tid,
3693  *	struct inode	*ip,
3694  *	s64		committed_size)
3695  *
3696  * return: new committed size
3697  *
3698  * note:
3699  *
3700  *	To avoid deadlock by holding too many transaction locks, the
3701  *	truncation may be broken up into multiple transactions.
3702  *	The committed_size keeps track of part of the file has been
3703  *	freed from the pmaps.
3704  */
3705 s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3706 {
3707 	s64 bn;
3708 	struct btstack btstack;
3709 	int cmp;
3710 	int index;
3711 	int locked_leaves = 0;
3712 	struct metapage *mp;
3713 	xtpage_t *p;
3714 	struct btframe *parent;
3715 	int rc;
3716 	struct tblock *tblk;
3717 	struct tlock *tlck = NULL;
3718 	xad_t *xad;
3719 	int xlen;
3720 	s64 xoff;
3721 	struct xtlock *xtlck = NULL;
3722 
3723 	/* save object truncation type */
3724 	tblk = tid_to_tblock(tid);
3725 	tblk->xflag |= COMMIT_PMAP;
3726 
3727 	/* clear stack */
3728 	BT_CLR(&btstack);
3729 
3730 	if (committed_size) {
3731 		xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
3732 		rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
3733 		if (rc)
3734 			return rc;
3735 
3736 		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3737 
3738 		if (cmp != 0) {
3739 			XT_PUTPAGE(mp);
3740 			jfs_error(ip->i_sb, "did not find extent\n");
3741 			return -EIO;
3742 		}
3743 	} else {
3744 		/*
3745 		 * start with root
3746 		 *
3747 		 * root resides in the inode
3748 		 */
3749 		bn = 0;
3750 
3751 		/*
3752 		 * first access of each page:
3753 		 */
3754       getPage:
3755 		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3756 		if (rc)
3757 			return rc;
3758 
3759 		/* process entries backward from last index */
3760 		index = le16_to_cpu(p->header.nextindex) - 1;
3761 
3762 		if (p->header.flag & BT_INTERNAL)
3763 			goto getChild;
3764 	}
3765 
3766 	/*
3767 	 *	leaf page
3768 	 */
3769 
3770 	if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
3771 		/*
3772 		 * We need to limit the size of the transaction
3773 		 * to avoid exhausting pagecache & tlocks
3774 		 */
3775 		xad = &p->xad[index];
3776 		xoff = offsetXAD(xad);
3777 		xlen = lengthXAD(xad);
3778 		XT_PUTPAGE(mp);
3779 		return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
3780 	}
3781 	tlck = txLock(tid, ip, mp, tlckXTREE);
3782 	tlck->type = tlckXTREE | tlckFREE;
3783 	xtlck = (struct xtlock *) & tlck->lock;
3784 	xtlck->hwm.offset = index;
3785 
3786 
3787 	XT_PUTPAGE(mp);
3788 
3789 	/*
3790 	 * go back up to the parent page
3791 	 */
3792       getParent:
3793 	/* pop/restore parent entry for the current child page */
3794 	if ((parent = BT_POP(&btstack)) == NULL)
3795 		/* current page must have been root */
3796 		goto out;
3797 
3798 	/* get back the parent page */
3799 	bn = parent->bn;
3800 	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3801 	if (rc)
3802 		return rc;
3803 
3804 	index = parent->index;
3805 
3806 	/*
3807 	 * parent page become empty: free the page
3808 	 */
3809 	if (index == XTENTRYSTART) {
3810 		/* txCommit() with tlckFREE:
3811 		 * free child extents covered by parent;
3812 		 * invalidate parent if COMMIT_PWMAP;
3813 		 */
3814 		tlck = txLock(tid, ip, mp, tlckXTREE);
3815 		xtlck = (struct xtlock *) & tlck->lock;
3816 		xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
3817 		tlck->type = tlckXTREE | tlckFREE;
3818 
3819 		XT_PUTPAGE(mp);
3820 
3821 		if (p->header.flag & BT_ROOT) {
3822 
3823 			goto out;
3824 		} else {
3825 			goto getParent;
3826 		}
3827 	}
3828 	/*
3829 	 * parent page still has entries for front region;
3830 	 */
3831 	else
3832 		index--;
3833 	/*
3834 	 *	internal page: go down to child page of current entry
3835 	 */
3836       getChild:
3837 	/* save current parent entry for the child page */
3838 	if (BT_STACK_FULL(&btstack)) {
3839 		jfs_error(ip->i_sb, "stack overrun!\n");
3840 		XT_PUTPAGE(mp);
3841 		return -EIO;
3842 	}
3843 	BT_PUSH(&btstack, bn, index);
3844 
3845 	/* get child page */
3846 	xad = &p->xad[index];
3847 	bn = addressXAD(xad);
3848 
3849 	/*
3850 	 * first access of each internal entry:
3851 	 */
3852 	/* release parent page */
3853 	XT_PUTPAGE(mp);
3854 
3855 	/* process the child page */
3856 	goto getPage;
3857 
3858       out:
3859 
3860 	return 0;
3861 }
3862 
3863 #ifdef CONFIG_JFS_STATISTICS
3864 int jfs_xtstat_proc_show(struct seq_file *m, void *v)
3865 {
3866 	seq_printf(m,
3867 		       "JFS Xtree statistics\n"
3868 		       "====================\n"
3869 		       "searches = %d\n"
3870 		       "fast searches = %d\n"
3871 		       "splits = %d\n",
3872 		       xtStat.search,
3873 		       xtStat.fastSearch,
3874 		       xtStat.split);
3875 	return 0;
3876 }
3877 #endif
3878