xref: /openbmc/linux/fs/ocfs2/extent_map.c (revision 87c2ce3b)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * extent_map.c
5  *
6  * In-memory extent map for OCFS2.  Man, this code was prettier in
7  * the library.
8  *
9  * Copyright (C) 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License, version 2,  as published by the Free Software Foundation.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/init.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/rbtree.h>
31 
32 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "extent_map.h"
38 #include "inode.h"
39 #include "super.h"
40 
41 #include "buffer_head_io.h"
42 
43 
44 /*
45  * Our headers are a mess: struct ocfs2_extent_map is declared in
46  * ocfs2.h rather than here.
47  */
48 
49 struct ocfs2_extent_map_entry {
50 	struct rb_node e_node;
51 	int e_tree_depth;
52 	struct ocfs2_extent_rec e_rec;
53 };
54 
55 struct ocfs2_em_insert_context {
56 	int need_left;
57 	int need_right;
58 	struct ocfs2_extent_map_entry *new_ent;
59 	struct ocfs2_extent_map_entry *old_ent;
60 	struct ocfs2_extent_map_entry *left_ent;
61 	struct ocfs2_extent_map_entry *right_ent;
62 };
63 
64 static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65 
66 
67 static struct ocfs2_extent_map_entry *
68 ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 			u32 cpos, u32 clusters,
70 			struct rb_node ***ret_p,
71 			struct rb_node **ret_parent);
72 static int ocfs2_extent_map_insert(struct inode *inode,
73 				   struct ocfs2_extent_rec *rec,
74 				   int tree_depth);
75 static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 					 struct ocfs2_extent_map_entry *ent);
77 static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 				      u32 cpos, u32 clusters,
79 				      struct ocfs2_extent_list *el);
80 static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 					u32 cpos, u32 clusters,
82 					struct ocfs2_extent_map_entry **ret_ent);
83 static int ocfs2_extent_map_try_insert(struct inode *inode,
84 				       struct ocfs2_extent_rec *rec,
85 				       int tree_depth,
86 				       struct ocfs2_em_insert_context *ctxt);
87 
88 /* Returns 1 only if the rec contains all the given clusters -- that is,
89  * the rec's e_cpos is <= the given cpos and the rec's endpoint (e_cpos +
90  * e_clusters) is >= the given endpoint (cpos + clusters). */
91 static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 					      u32 cpos, u32 clusters)
93 {
94 	if (le32_to_cpu(rec->e_cpos) > cpos)
95 		return 0;
96 	if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 			      le32_to_cpu(rec->e_clusters))
98 		return 0;
99 	return 1;
100 }
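/*
 * Worked example (illustrative values only): a rec with e_cpos = 10 and
 * e_clusters = 20 covers clusters [10, 30).  It contains the request
 * (cpos = 15, clusters = 5) because 10 <= 15 and 15 + 5 = 20 <= 30, so
 * the function returns 1.  The request (cpos = 25, clusters = 10) fails
 * the second check (25 + 10 = 35 > 30) and returns 0.
 */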
101 
102 
103 /*
104  * Find an entry in the tree that intersects the region passed in.
105  * Note that this will find straddled intervals, it is up to the
106  * callers to enforce any boundary conditions.
107  *
108  * Callers must hold ip_lock.  This lookup is not guaranteed to return
109  * a tree_depth 0 match, and as such can race inserts if the lock
110  * were not held.
111  *
112  * The ret_p/ret_parent out-parameters let insertion reuse this search.
113  * Lookup-only callers pass NULL for both.
114  */
115 static struct ocfs2_extent_map_entry *
116 ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 			u32 cpos, u32 clusters,
118 			struct rb_node ***ret_p,
119 			struct rb_node **ret_parent)
120 {
121 	struct rb_node **p = &em->em_extents.rb_node;
122 	struct rb_node *parent = NULL;
123 	struct ocfs2_extent_map_entry *ent = NULL;
124 
125 	while (*p)
126 	{
127 		parent = *p;
128 		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 			       e_node);
130 		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 			p = &(*p)->rb_left;
132 			ent = NULL;
133 		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 				    le32_to_cpu(ent->e_rec.e_clusters))) {
135 			p = &(*p)->rb_right;
136 			ent = NULL;
137 		} else
138 			break;
139 	}
140 
141 	if (ret_p != NULL)
142 		*ret_p = p;
143 	if (ret_parent != NULL)
144 		*ret_parent = parent;
145 	return ent;
146 }
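/*
 * Illustrative sketch, not part of the original file: a lookup-only
 * caller takes ip_lock and passes NULL for both rb_node out-parameters.
 * The helper name ocfs2_extent_map_interval_is_cached() is hypothetical.
 */
#if 0
static int ocfs2_extent_map_interval_is_cached(struct inode *inode,
					       u32 cpos, u32 clusters)
{
	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
	struct ocfs2_extent_map_entry *ent;

	spin_lock(&OCFS2_I(inode)->ip_lock);
	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	return ent != NULL;
}
#endif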
147 
148 /*
149  * Find the leaf containing the interval we want.  While we're on our
150  * way down the tree, fill in every record we see at any depth, because
151  * we might want it later.
152  *
153  * Note that this code is run without ip_lock.  That's because it
154  * sleeps while reading.  If someone else is filling in the extent map
155  * at the same time we are, we might have to restart.
156  */
157 static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 				      u32 cpos, u32 clusters,
159 				      struct ocfs2_extent_list *el)
160 {
161 	int i, ret;
162 	struct buffer_head *eb_bh = NULL;
163 	u64 blkno;
164 	u32 rec_end;
165 	struct ocfs2_extent_block *eb;
166 	struct ocfs2_extent_rec *rec;
167 
168 	/*
169 	 * The bh data containing the el cannot change here, because
170 	 * we hold alloc_sem.  So we can do this without other
171 	 * locks.
172 	 */
173 	while (el->l_tree_depth)
174 	{
175 		blkno = 0;
176 		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 			rec = &el->l_recs[i];
178 			rec_end = (le32_to_cpu(rec->e_cpos) +
179 				   le32_to_cpu(rec->e_clusters));
180 
181 			ret = -EBADR;
182 			if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 				mlog_errno(ret);
184 				goto out_free;
185 			}
186 
187 			if (rec_end <= cpos) {
188 				ret = ocfs2_extent_map_insert(inode, rec,
189 						le16_to_cpu(el->l_tree_depth));
190 				if (ret && (ret != -EEXIST)) {
191 					mlog_errno(ret);
192 					goto out_free;
193 				}
194 				continue;
195 			}
196 			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 				ret = ocfs2_extent_map_insert(inode, rec,
198 						le16_to_cpu(el->l_tree_depth));
199 				if (ret && (ret != -EEXIST)) {
200 					mlog_errno(ret);
201 					goto out_free;
202 				}
203 				continue;
204 			}
205 
206 			/*
207 			 * We've found a record that matches our
208 			 * interval.  We don't insert it because we're
209 			 * about to traverse it.
210 			 */
211 
212 			/* Check to see if we're straddling */
213 			ret = -ESRCH;
214 			if (!ocfs2_extent_rec_contains_clusters(rec,
215 							        cpos,
216 								clusters)) {
217 				mlog_errno(ret);
218 				goto out_free;
219 			}
220 
221 			/*
222 			 * If we've already found a record, the el has
223 			 * two records covering the same interval.
224 			 * EEEK!
225 			 */
226 			ret = -EBADR;
227 			if (blkno) {
228 				mlog_errno(ret);
229 				goto out_free;
230 			}
231 
232 			blkno = le64_to_cpu(rec->e_blkno);
233 		}
234 
235 		/*
236 		 * We don't support holes, and we're still up
237 		 * in the branches, so we'd better have found someone
238 		 */
239 		ret = -EBADR;
240 		if (!blkno) {
241 			mlog_errno(ret);
242 			goto out_free;
243 		}
244 
245 		if (eb_bh) {
246 			brelse(eb_bh);
247 			eb_bh = NULL;
248 		}
249 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 				       blkno, &eb_bh, OCFS2_BH_CACHED,
251 				       inode);
252 		if (ret) {
253 			mlog_errno(ret);
254 			goto out_free;
255 		}
256 		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 			ret = -EIO;
260 			goto out_free;
261 		}
262 		el = &eb->h_list;
263 	}
264 
265 	if (el->l_tree_depth)
266 		BUG();
267 
268 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 		rec = &el->l_recs[i];
270 		ret = ocfs2_extent_map_insert(inode, rec,
271 					      le16_to_cpu(el->l_tree_depth));
272 		if (ret) {
273 			mlog_errno(ret);
274 			goto out_free;
275 		}
276 	}
277 
278 	ret = 0;
279 
280 out_free:
281 	if (eb_bh)
282 		brelse(eb_bh);
283 
284 	return ret;
285 }
286 
287 /*
288  * This lookup actually will read from disk.  It has one invariant:
289  * It will never re-traverse blocks.  This means that all inserts should
290  * be new regions or more granular regions (both allowed by insert).
291  */
292 static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 					u32 cpos,
294 					u32 clusters,
295 					struct ocfs2_extent_map_entry **ret_ent)
296 {
297 	int ret;
298 	u64 blkno;
299 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 	struct ocfs2_extent_map_entry *ent;
301 	struct buffer_head *bh = NULL;
302 	struct ocfs2_extent_block *eb;
303 	struct ocfs2_dinode *di;
304 	struct ocfs2_extent_list *el;
305 
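	/*
	 * Fast path: a cached tree_depth 0 entry for the interval is
	 * returned directly.  Otherwise, note the block to read (the
	 * extent block named by the deepest cached entry, or the inode
	 * itself), drop ip_lock so ocfs2_read_block() may sleep, fill
	 * the map via ocfs2_extent_map_find_leaf(), and look the
	 * interval up again.
	 */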
306 	spin_lock(&OCFS2_I(inode)->ip_lock);
307 	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 	if (ent) {
309 		if (!ent->e_tree_depth) {
310 			spin_unlock(&OCFS2_I(inode)->ip_lock);
311 			*ret_ent = ent;
312 			return 0;
313 		}
314 		blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 		spin_unlock(&OCFS2_I(inode)->ip_lock);
316 
317 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 				       OCFS2_BH_CACHED, inode);
319 		if (ret) {
320 			mlog_errno(ret);
321 			if (bh)
322 				brelse(bh);
323 			return ret;
324 		}
325 		eb = (struct ocfs2_extent_block *)bh->b_data;
326 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 			brelse(bh);
329 			return -EIO;
330 		}
331 		el = &eb->h_list;
332 	} else {
333 		spin_unlock(&OCFS2_I(inode)->ip_lock);
334 
335 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 				       OCFS2_I(inode)->ip_blkno, &bh,
337 				       OCFS2_BH_CACHED, inode);
338 		if (ret) {
339 			mlog_errno(ret);
340 			if (bh)
341 				brelse(bh);
342 			return ret;
343 		}
344 		di = (struct ocfs2_dinode *)bh->b_data;
345 		if (!OCFS2_IS_VALID_DINODE(di)) {
346 			brelse(bh);
347 			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 			return -EIO;
349 		}
350 		el = &di->id2.i_list;
351 	}
352 
353 	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 	brelse(bh);
355 	if (ret) {
356 		mlog_errno(ret);
357 		return ret;
358 	}
359 
360 	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 	if (!ent) {
362 		ret = -ESRCH;
363 		mlog_errno(ret);
364 		return ret;
365 	}
366 
367 	if (ent->e_tree_depth)
368 		BUG();  /* FIXME: Make sure this isn't a corruption */
369 
370 	*ret_ent = ent;
371 
372 	return 0;
373 }
374 
375 /*
376  * Callers must hold ip_lock.  This can insert pieces of the tree,
377  * thus racing lookup if the lock weren't held.
378  */
379 static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 					 struct ocfs2_extent_map_entry *ent)
381 {
382 	struct rb_node **p, *parent;
383 	struct ocfs2_extent_map_entry *old_ent;
384 
385 	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 					  le32_to_cpu(ent->e_rec.e_clusters),
387 					  &p, &parent);
388 	if (old_ent)
389 		return -EEXIST;
390 
391 	rb_link_node(&ent->e_node, parent, p);
392 	rb_insert_color(&ent->e_node, &em->em_extents);
393 
394 	return 0;
395 }
396 
397 
398 /*
399  * Simple rule: on any return code other than -EAGAIN, anything left
400  * in the insert_context will be freed.
401  */
402 static int ocfs2_extent_map_try_insert(struct inode *inode,
403 				       struct ocfs2_extent_rec *rec,
404 				       int tree_depth,
405 				       struct ocfs2_em_insert_context *ctxt)
406 {
407 	int ret;
408 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 	struct ocfs2_extent_map_entry *old_ent;
410 
411 	ctxt->need_left = 0;
412 	ctxt->need_right = 0;
413 	ctxt->old_ent = NULL;
414 
415 	spin_lock(&OCFS2_I(inode)->ip_lock);
416 	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 	if (!ret) {
418 		ctxt->new_ent = NULL;
419 		goto out_unlock;
420 	}
421 
422 	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 					  le32_to_cpu(rec->e_clusters), NULL,
424 					  NULL);
425 
426 	if (!old_ent)
427 		BUG();
428 
429 	ret = -EEXIST;
430 	if (old_ent->e_tree_depth < tree_depth)
431 		goto out_unlock;
432 
433 	if (old_ent->e_tree_depth == tree_depth) {
434 		if (!memcmp(rec, &old_ent->e_rec,
435 			    sizeof(struct ocfs2_extent_rec)))
436 			ret = 0;
437 
438 		/* FIXME: Should this be ESRCH/EBADR??? */
439 		goto out_unlock;
440 	}
441 
442 	/*
443 	 * We do it in this order specifically so that no actual tree
444 	 * changes occur until we have all the pieces we need.  We
445 	 * don't want malloc failures to leave an inconsistent tree.
446 	 * Whenever we drop the lock, another process could be
447 	 * inserting.  Also note that, if another process just beat us
448 	 * to an insert, we might not need the same pieces we needed
449 	 * the first go round.  In the end, the pieces we need will
450 	 * be used, and the pieces we don't will be freed.
451 	 */
452 	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 			     le32_to_cpu(old_ent->e_rec.e_cpos));
454 	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
457 	ret = -EAGAIN;
458 	if (ctxt->need_left) {
459 		if (!ctxt->left_ent)
460 			goto out_unlock;
461 		*(ctxt->left_ent) = *old_ent;
462 		ctxt->left_ent->e_rec.e_clusters =
463 			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 	}
466 	if (ctxt->need_right) {
467 		if (!ctxt->right_ent)
468 			goto out_unlock;
469 		*(ctxt->right_ent) = *old_ent;
470 		ctxt->right_ent->e_rec.e_cpos =
471 			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 				    le32_to_cpu(rec->e_clusters));
473 		ctxt->right_ent->e_rec.e_clusters =
474 			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 	}
478 
479 	rb_erase(&old_ent->e_node, &em->em_extents);
480 	/* The old entry is out of the tree; hand it to the caller to free */
481 	ctxt->old_ent = old_ent;
482 
483 	if (ctxt->need_left) {
484 		ret = ocfs2_extent_map_insert_entry(em,
485 						    ctxt->left_ent);
486 		if (ret)
487 			goto out_unlock;
488 		ctxt->left_ent = NULL;
489 	}
490 
491 	if (ctxt->need_right) {
492 		ret = ocfs2_extent_map_insert_entry(em,
493 						    ctxt->right_ent);
494 		if (ret)
495 			goto out_unlock;
496 		ctxt->right_ent = NULL;
497 	}
498 
499 	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500 
501 	if (!ret)
502 		ctxt->new_ent = NULL;
503 
504 out_unlock:
505 	spin_unlock(&OCFS2_I(inode)->ip_lock);
506 
507 	return ret;
508 }
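/*
 * Worked example of the split path above (illustrative values): the map
 * holds a tree_depth 1 entry covering clusters [0, 100) and the caller
 * inserts a tree_depth 0 rec for [30, 40).  need_left and need_right are
 * both set, so the old entry is replaced by left = [0, 30), the new
 * rec = [30, 40), and right = [40, 100).  If a needed child entry has
 * not been allocated yet, -EAGAIN sends the caller back out to allocate
 * it without holding ip_lock.
 */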
509 
510 
511 static int ocfs2_extent_map_insert(struct inode *inode,
512 				   struct ocfs2_extent_rec *rec,
513 				   int tree_depth)
514 {
515 	int ret;
516 	struct ocfs2_em_insert_context ctxt = {0, };
517 
518 	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 	    OCFS2_I(inode)->ip_map.em_clusters) {
520 		ret = -EBADR;
521 		mlog_errno(ret);
522 		return ret;
523 	}
524 
525 	/* Zero e_clusters means a truncated tail record.  It had better be at EOF. */
526 	if (!rec->e_clusters) {
527 		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 		    OCFS2_I(inode)->ip_map.em_clusters) {
529 			ret = -EBADR;
530 			mlog_errno(ret);
531 			return ret;
532 		}
533 
534 		/* Ignore the truncated tail */
535 		return 0;
536 	}
537 
538 	ret = -ENOMEM;
539 	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 					GFP_KERNEL);
541 	if (!ctxt.new_ent) {
542 		mlog_errno(ret);
543 		return ret;
544 	}
545 
546 	ctxt.new_ent->e_rec = *rec;
547 	ctxt.new_ent->e_tree_depth = tree_depth;
548 
549 	do {
550 		ret = -ENOMEM;
551 		if (ctxt.need_left && !ctxt.left_ent) {
552 			ctxt.left_ent =
553 				kmem_cache_alloc(ocfs2_em_ent_cachep,
554 						 GFP_KERNEL);
555 			if (!ctxt.left_ent)
556 				break;
557 		}
558 		if (ctxt.need_right && !ctxt.right_ent) {
559 			ctxt.right_ent =
560 				kmem_cache_alloc(ocfs2_em_ent_cachep,
561 						 GFP_KERNEL);
562 			if (!ctxt.right_ent)
563 				break;
564 		}
565 
566 		ret = ocfs2_extent_map_try_insert(inode, rec,
567 						  tree_depth, &ctxt);
568 	} while (ret == -EAGAIN);
569 
570 	if (ret < 0)
571 		mlog_errno(ret);
572 
573 	if (ctxt.left_ent)
574 		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 	if (ctxt.right_ent)
576 		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 	if (ctxt.old_ent)
578 		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 	if (ctxt.new_ent)
580 		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581 
582 	return ret;
583 }
584 
585 /*
586  * Append this record to the tail of the extent map.  It must be
587  * tree_depth 0.  The record might be an extension of an existing
588  * record, in which case the existing map entry is extended rather
589  * than replaced.  For example:
589  *
590  * Existing record in the extent map:
591  *
592  *	cpos = 10, len = 10
593  * 	|---------|
594  *
595  * New Record:
596  *
597  *	cpos = 10, len = 20
598  * 	|------------------|
599  *
600  * The passed record is the new on-disk record.  The new_clusters value
601  * is how many clusters were added to the file.  If the append is a
602  * contiguous append, the new_clusters has been added to
603  * rec->e_clusters.  If the append is an entirely new extent, then
604  * rec->e_clusters is == new_clusters.
605  */
606 int ocfs2_extent_map_append(struct inode *inode,
607 			    struct ocfs2_extent_rec *rec,
608 			    u32 new_clusters)
609 {
610 	int ret;
611 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 	struct ocfs2_extent_map_entry *ent;
613 	struct ocfs2_extent_rec *old;
614 
615 	BUG_ON(!new_clusters);
616 	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617 
618 	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 		/*
620 		 * Size changed underneath us on disk.  Drop any
621 		 * straddling records and update our idea of
622 		 * i_clusters
623 		 */
624 		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 		em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 	}
627 
628 	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 			 le32_to_cpu(rec->e_clusters)) !=
630 			(em->em_clusters + new_clusters),
631 			"Inode %"MLFu64":\n"
632 			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 			"em->em_clusters = %u + new_clusters = %u = %u\n",
634 			OCFS2_I(inode)->ip_blkno,
635 			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 			em->em_clusters, new_clusters,
638 			em->em_clusters + new_clusters);
639 
640 	em->em_clusters += new_clusters;
641 
642 	ret = -ENOENT;
643 	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 		/* This is a contiguous append */
645 		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 					      NULL, NULL);
647 		if (ent) {
648 			old = &ent->e_rec;
649 			BUG_ON((le32_to_cpu(rec->e_cpos) +
650 				le32_to_cpu(rec->e_clusters)) !=
651 				 (le32_to_cpu(old->e_cpos) +
652 				  le32_to_cpu(old->e_clusters) +
653 				  new_clusters));
654 			if (ent->e_tree_depth == 0) {
655 				BUG_ON(le32_to_cpu(old->e_cpos) !=
656 				       le32_to_cpu(rec->e_cpos));
657 				BUG_ON(le64_to_cpu(old->e_blkno) !=
658 				       le64_to_cpu(rec->e_blkno));
659 				ret = 0;
660 			}
661 			/*
662 			 * Let non-leafs fall through as -ENOENT to
663 			 * force insertion of the new leaf.
664 			 */
665 			le32_add_cpu(&old->e_clusters, new_clusters);
666 		}
667 	}
668 
669 	if (ret == -ENOENT)
670 		ret = ocfs2_extent_map_insert(inode, rec, 0);
671 	if (ret < 0)
672 		mlog_errno(ret);
673 	return ret;
674 }
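/*
 * Worked example (illustrative values): the map's last leaf entry is
 * (e_cpos = 10, e_clusters = 10) and 5 clusters are allocated
 * contiguously, so the caller passes the updated on-disk rec
 * (e_cpos = 10, e_clusters = 15) with new_clusters = 5.  Because
 * rec->e_clusters (15) > new_clusters (5), the existing leaf is looked
 * up and extended in place; no new entry is inserted.
 */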
675 
676 #if 0
677 /* The code below is compiled out (#if 0) but kept because it completes the
678  * extent map API and may be used in the future. */
679 
680 /*
681  * Look up the record containing this cluster offset.  This record is
682  * part of the extent map.  Do not free it.  Any changes you make to
683  * it will reflect in the extent map.  So, if your last extent
684  * is (cpos = 10, clusters = 10) and you truncate the file by 5
685  * clusters, you can do:
686  *
687  * ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
688  * le32_add_cpu(&rec->e_clusters, -5);
689  *
690  * The lookup does not read from disk.  If the map isn't filled in for
691  * an entry, you won't find it.
692  *
693  * Also note that the returned record is valid until alloc_sem is
694  * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
695  */
696 int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 			     struct ocfs2_extent_rec **rec,
698 			     int *tree_depth)
699 {
700 	int ret = -ENOENT;
701 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 	struct ocfs2_extent_map_entry *ent;
703 
704 	*rec = NULL;
705 
706 	if (cpos >= OCFS2_I(inode)->ip_clusters)
707 		return -EINVAL;
708 
709 	if (cpos >= em->em_clusters) {
710 		/*
711 		 * Size changed underneath us on disk.  Drop any
712 		 * straddling records and update our idea of
713 		 * i_clusters
714 		 */
715 		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716 		em->em_clusters = OCFS2_I(inode)->ip_clusters;
717 	}
718 
719 	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 				      NULL, NULL);
721 
722 	if (ent) {
723 		*rec = &ent->e_rec;
724 		if (tree_depth)
725 			*tree_depth = ent->e_tree_depth;
726 		ret = 0;
727 	}
728 
729 	return ret;
730 }
731 
732 int ocfs2_extent_map_get_clusters(struct inode *inode,
733 				  u32 v_cpos, int count,
734 				  u32 *p_cpos, int *ret_count)
735 {
736 	int ret;
737 	u32 coff, ccount;
738 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 	struct ocfs2_extent_map_entry *ent = NULL;
740 
741 	*p_cpos = ccount = 0;
742 
743 	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 		return -EINVAL;
745 
746 	if ((v_cpos + count) > em->em_clusters) {
747 		/*
748 		 * Size changed underneath us on disk.  Drop any
749 		 * straddling records and update our idea of
750 		 * i_clusters
751 		 */
752 		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 		em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 	}
755 
756 
757 	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 	if (ret)
759 		return ret;
760 
761 	if (ent) {
762 		/* We should never find ourselves straddling an interval */
763 		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 							v_cpos,
765 							count))
766 			return -ESRCH;
767 
768 		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 				le64_to_cpu(ent->e_rec.e_blkno)) +
771 			  coff;
772 
773 		if (ret_count)
774 			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775 
776 		return 0;
777 	}
778 
779 
780 	return -ENOENT;
781 }
782 
783 #endif  /*  0  */
784 
785 int ocfs2_extent_map_get_blocks(struct inode *inode,
786 				u64 v_blkno, int count,
787 				u64 *p_blkno, int *ret_count)
788 {
789 	int ret;
790 	u64 boff;
791 	u32 cpos, clusters;
792 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 	struct ocfs2_extent_map_entry *ent = NULL;
794 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 	struct ocfs2_extent_rec *rec;
796 
797 	*p_blkno = 0;
798 
799 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 					    (u64)count + bpc - 1);
802 	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 		ret = -EINVAL;
804 		mlog_errno(ret);
805 		return ret;
806 	}
807 
808 	if ((cpos + clusters) > em->em_clusters) {
809 		/*
810 		 * Size changed underneath us on disk.  Drop any
811 		 * straddling records and update our idea of
812 		 * i_clusters
813 		 */
814 		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 		em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 	}
817 
818 	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 	if (ret) {
820 		mlog_errno(ret);
821 		return ret;
822 	}
823 
824 	if (ent)
825 	{
826 		rec = &ent->e_rec;
827 
828 		/* We should never find ourselves straddling an interval */
829 		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 			ret = -ESRCH;
831 			mlog_errno(ret);
832 			return ret;
833 		}
834 
835 		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 						le32_to_cpu(rec->e_cpos));
837 		boff += (v_blkno & (u64)(bpc - 1));
838 		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839 
840 		if (ret_count) {
841 			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 					le32_to_cpu(rec->e_clusters)) - boff;
843 		}
844 
845 		return 0;
846 	}
847 
848 	return -ENOENT;
849 }
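/*
 * Illustrative sketch, not part of the original file: mapping a single
 * logical block, e.g. from a get_block-style callback.  The helper name
 * ocfs2_map_one_block() is hypothetical and error handling is minimal.
 */
#if 0
static int ocfs2_map_one_block(struct inode *inode, u64 v_blkno,
			       u64 *p_blkno)
{
	int count;
	int ret;

	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, 1,
					  p_blkno, &count);
	if (ret)
		mlog_errno(ret);

	return ret;
}
#endif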
850 
851 int ocfs2_extent_map_init(struct inode *inode)
852 {
853 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854 
855 	em->em_extents = RB_ROOT;
856 	em->em_clusters = 0;
857 
858 	return 0;
859 }
860 
861 /* Needs the lock */
862 static void __ocfs2_extent_map_drop(struct inode *inode,
863 				    u32 new_clusters,
864 				    struct rb_node **free_head,
865 				    struct ocfs2_extent_map_entry **tail_ent)
866 {
867 	struct rb_node *node, *next;
868 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 	struct ocfs2_extent_map_entry *ent;
870 
871 	*free_head = NULL;
872 
873 	ent = NULL;
874 	node = rb_last(&em->em_extents);
875 	while (node)
876 	{
877 		next = rb_prev(node);
878 
879 		ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 			       e_node);
881 		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 			break;
883 
884 		rb_erase(&ent->e_node, &em->em_extents);
885 
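		/*
		 * The node is no longer in the rbtree, so its rb_right
		 * pointer can be reused as a singly linked free-list
		 * link; the chain is freed outside ip_lock by
		 * __ocfs2_extent_map_drop_cleanup().
		 */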
886 		node->rb_right = *free_head;
887 		*free_head = node;
888 
889 		ent = NULL;
890 		node = next;
891 	}
892 
893 	/* Do we have an entry straddling new_clusters? */
894 	if (tail_ent) {
895 		if (ent &&
896 		    ((le32_to_cpu(ent->e_rec.e_cpos) +
897 		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 			*tail_ent = ent;
899 		else
900 			*tail_ent = NULL;
901 	}
902 }
903 
904 static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905 {
906 	struct rb_node *node;
907 	struct ocfs2_extent_map_entry *ent;
908 
909 	while (free_head) {
910 		node = free_head;
911 		free_head = node->rb_right;
912 
913 		ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 			       e_node);
915 		kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 	}
917 }
918 
919 /*
920  * Remove all entries past new_clusters, inclusive of an entry that
921  * contains new_clusters.  This is effectively a cache forget.
922  *
923  * If you want to also clip the last extent by some number of clusters,
924  * you need to call ocfs2_extent_map_trunc().
925  * This code does not check or modify ip_clusters.
926  */
927 int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928 {
929 	struct rb_node *free_head = NULL;
930 	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 	struct ocfs2_extent_map_entry *ent;
932 
933 	spin_lock(&OCFS2_I(inode)->ip_lock);
934 
935 	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936 
937 	if (ent) {
938 		rb_erase(&ent->e_node, &em->em_extents);
939 		ent->e_node.rb_right = free_head;
940 		free_head = &ent->e_node;
941 	}
942 
943 	spin_unlock(&OCFS2_I(inode)->ip_lock);
944 
945 	if (free_head)
946 		__ocfs2_extent_map_drop_cleanup(free_head);
947 
948 	return 0;
949 }
950 
951 /*
952  * Remove all entries past new_clusters and also clip any extent
953  * straddling new_clusters, if there is one.  This does not check
954  * or modify ip_clusters
955  */
956 int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957 {
958 	struct rb_node *free_head = NULL;
959 	struct ocfs2_extent_map_entry *ent = NULL;
960 
961 	spin_lock(&OCFS2_I(inode)->ip_lock);
962 
963 	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964 
965 	if (ent)
966 		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 					       le32_to_cpu(ent->e_rec.e_cpos));
968 
969 	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970 
971 	spin_unlock(&OCFS2_I(inode)->ip_lock);
972 
973 	if (free_head)
974 		__ocfs2_extent_map_drop_cleanup(free_head);
975 
976 	return 0;
977 }
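/*
 * Worked example (illustrative values): with cached entries covering
 * [0, 10) and [10, 30), ocfs2_extent_map_drop(inode, 20) frees the
 * [10, 30) entry entirely because it straddles cluster 20, while
 * ocfs2_extent_map_trunc(inode, 20) keeps that entry but clips it to
 * [10, 20) and sets em_clusters to 20.  Neither touches ip_clusters.
 */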
978 
979 int __init init_ocfs2_extent_maps(void)
980 {
981 	ocfs2_em_ent_cachep =
982 		kmem_cache_create("ocfs2_em_ent",
983 				  sizeof(struct ocfs2_extent_map_entry),
984 				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 	if (!ocfs2_em_ent_cachep)
986 		return -ENOMEM;
987 
988 	return 0;
989 }
990 
991 void __exit exit_ocfs2_extent_maps(void)
992 {
993 	kmem_cache_destroy(ocfs2_em_ent_cachep);
994 }
995