xref: /openbmc/linux/fs/gfs2/dir.c (revision 9d56dd3b083a3bec56e9da35ce07baca81030b03)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 /*
11  * Implements Extendible Hashing as described in:
12  *   "Extendible Hashing" by Fagin, et al in
13  *     __ACM Trans. on Database Systems__, Sept 1979.
14  *
15  *
16  * Here's the layout of dirents which is essentially the same as that of ext2
17  * within a single block. The field de_name_len is the number of bytes
18  * actually required for the name (no null terminator). The field de_rec_len
19  * is the number of bytes allocated to the dirent. The offset of the next
20  * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21  * deleted, the preceding dirent inherits its allocated space, ie
22  * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23  * by adding de_rec_len to the current dirent, this essentially causes the
24  * deleted dirent to get jumped over when iterating through all the dirents.
25  *
26  * When deleting the first dirent in a block, there is no previous dirent so
27  * the field de_ino is set to zero to designate it as deleted. When allocating
28  * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29  * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30  * dirent is allocated. Otherwise it must go through all the 'used' dirents
31  * searching for one in which the amount of total space minus the amount of
32  * used space will provide enough space for the new dirent.
33  *
34  * There are two types of blocks in which dirents reside. In a stuffed dinode,
35  * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36  * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37  * beginning of the leaf block. The dirents reside in leaves when
38  *
39  * dip->i_diskflags & GFS2_DIF_EXHASH is true
40  *
41  * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42  *
43  * When the dirents are in leaves, the actual contents of the directory file are
44  * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45  * dirents are NOT in the directory file itself. There can be more than one
46  * block pointer in the array that points to the same leaf. In fact, when a
47  * directory is first converted from linear to exhash, all of the pointers
48  * point to the same leaf.
49  *
50  * When a leaf is completely full, the size of the hash table can be
51  * doubled unless it is already at the maximum size which is hard coded into
52  * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53  * but never before the maximum hash table size has been reached.
54  */
55 
56 #include <linux/slab.h>
57 #include <linux/spinlock.h>
58 #include <linux/buffer_head.h>
59 #include <linux/sort.h>
60 #include <linux/gfs2_ondisk.h>
61 #include <linux/crc32.h>
62 #include <linux/vmalloc.h>
63 
64 #include "gfs2.h"
65 #include "incore.h"
66 #include "dir.h"
67 #include "glock.h"
68 #include "inode.h"
69 #include "meta_io.h"
70 #include "quota.h"
71 #include "rgrp.h"
72 #include "trans.h"
73 #include "bmap.h"
74 #include "util.h"
75 
#define IS_LEAF     1 /* Hashed (leaf) directory */
#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */

/*
 * Conversions between a 32-bit name hash and the directory offset derived
 * from it.  The offset is the hash shifted right by one bit; the reverse
 * mapping shifts left by one bit and truncates to 32 bits, so the lowest
 * bit of the original hash cannot be recovered from an offset.
 */
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))

/* Per-leaf callback type.  NOTE(review): its users are outside this section. */
typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
			    u64 leaf_no, void *data);
/*
 * Callback invoked by gfs2_dirent_scan() for each dirent in a block.
 * Return 0 to keep scanning, 1 to select the current dirent, 2 to select
 * the previous dirent (or the current one when there is no previous), or a
 * negative errno, which the scan hands back as an ERR_PTR().
 */
typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
			    const struct qstr *name, void *opaque);
86 
87 
88 int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
89 			    struct buffer_head **bhp)
90 {
91 	struct buffer_head *bh;
92 
93 	bh = gfs2_meta_new(ip->i_gl, block);
94 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
95 	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
96 	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
97 	*bhp = bh;
98 	return 0;
99 }
100 
101 static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
102 					struct buffer_head **bhp)
103 {
104 	struct buffer_head *bh;
105 	int error;
106 
107 	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
108 	if (error)
109 		return error;
110 	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
111 		brelse(bh);
112 		return -EIO;
113 	}
114 	*bhp = bh;
115 	return 0;
116 }
117 
118 static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
119 				  unsigned int offset, unsigned int size)
120 {
121 	struct buffer_head *dibh;
122 	int error;
123 
124 	error = gfs2_meta_inode_buffer(ip, &dibh);
125 	if (error)
126 		return error;
127 
128 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 	if (ip->i_disksize < offset + size)
131 		ip->i_disksize = offset + size;
132 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 	gfs2_dinode_out(ip, dibh->b_data);
134 
135 	brelse(dibh);
136 
137 	return size;
138 }
139 
140 
141 
/**
 * gfs2_dir_write_data - Write directory information to the inode
 * @ip: The GFS2 inode
 * @buf: The buffer containing information to be written
 * @offset: The file offset to start writing at
 * @size: The amount of data to write
 *
 * A write that fits inside a stuffed dinode is handed to
 * gfs2_dir_write_stuffed().  Otherwise the inode is unstuffed if needed and
 * the data is copied block by block, each block's payload starting after
 * its struct gfs2_meta_header.
 *
 * If an error occurs after some bytes were already copied, the inode size
 * and timestamps are still updated and the partial byte count is returned.
 *
 * Returns: The number of bytes correctly written or error code
 */
static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
			       u64 offset, unsigned int size)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	u64 lblock, dblock;
	u32 extlen = 0;
	unsigned int o;
	int copied = 0;
	int error = 0;
	int new = 0;

	if (!size)
		return 0;

	/* Fast path: the whole write still fits in the stuffed dinode block */
	if (gfs2_is_stuffed(ip) &&
	    offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
		return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
					      size);

	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
		return -EINVAL;

	if (gfs2_is_stuffed(ip)) {
		error = gfs2_unstuff_dinode(ip, NULL);
		if (error)
			return error;
	}

	/*
	 * do_div() leaves the quotient (logical block) in lblock and returns
	 * the remainder; o is the byte position inside the block, skipping
	 * the metadata header.
	 */
	lblock = offset;
	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);

	while (copied < size) {
		unsigned int amount;
		struct buffer_head *bh;

		/* Clamp this chunk to what is left of the current block */
		amount = size - copied;
		if (amount > sdp->sd_sb.sb_bsize - o)
			amount = sdp->sd_sb.sb_bsize - o;

		/* Previous extent exhausted: map the next one */
		if (!extlen) {
			/*
			 * new is passed by pointer and tested below;
			 * presumably gfs2_extent_map() reports through it
			 * whether the block was freshly allocated - TODO
			 * confirm.
			 */
			new = 1;
			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
						&dblock, &extlen);
			if (error)
				goto fail;
			/* A zero disk block after a successful map is fatal */
			error = -EIO;
			if (gfs2_assert_withdraw(sdp, dblock))
				goto fail;
		}

		/*
		 * A block that is overwritten completely, or flagged new, is
		 * initialised from scratch rather than read from disk.
		 */
		if (amount == sdp->sd_jbsize || new)
			error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
		else
			error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);

		if (error)
			goto fail;

		gfs2_trans_add_bh(ip->i_gl, bh, 1);
		memcpy(bh->b_data + o, buf, amount);
		brelse(bh);

		buf += amount;
		copied += amount;
		lblock++;
		dblock++;
		extlen--;

		/* Every block after the first is written from its start */
		o = sizeof(struct gfs2_meta_header);
	}

out:
	/* Reached on full success and, via fail, after a partial write */
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		return error;

	if (ip->i_disksize < offset + copied)
		ip->i_disksize = offset + copied;
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

	return copied;
fail:
	/* Report the error only if nothing at all was written */
	if (copied)
		goto out;
	return error;
}
242 
243 static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
244 				 u64 offset, unsigned int size)
245 {
246 	struct buffer_head *dibh;
247 	int error;
248 
249 	error = gfs2_meta_inode_buffer(ip, &dibh);
250 	if (!error) {
251 		offset += sizeof(struct gfs2_dinode);
252 		memcpy(buf, dibh->b_data + offset, size);
253 		brelse(dibh);
254 	}
255 
256 	return (error) ? error : size;
257 }
258 
259 
/**
 * gfs2_dir_read_data - Read data from a directory inode
 * @ip: The GFS2 Inode
 * @buf: The buffer to place result into
 * @offset: File offset to begin reading from
 * @size: Amount of data to transfer
 * @ra: If zero, only one block of each mapped extent is passed to
 *      gfs2_meta_ra(); if non-zero the whole mapped extent length is
 *
 * The request is truncated to the current directory size.  If an error
 * occurs after some bytes were copied, the partial count is returned
 * instead of the error.
 *
 * Returns: The amount of data actually copied or the error
 */
static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
			      unsigned int size, unsigned ra)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	u64 lblock, dblock;
	u32 extlen = 0;
	unsigned int o;
	int copied = 0;
	int error = 0;

	/* Nothing to read at or beyond the end of the directory */
	if (offset >= ip->i_disksize)
		return 0;

	if (offset + size > ip->i_disksize)
		size = ip->i_disksize - offset;

	if (!size)
		return 0;

	if (gfs2_is_stuffed(ip))
		return gfs2_dir_read_stuffed(ip, buf, offset, size);

	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
		return -EINVAL;

	/*
	 * do_div() leaves the logical block number in lblock and returns the
	 * remainder; o is the byte position in the block, past the metadata
	 * header.
	 */
	lblock = offset;
	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);

	while (copied < size) {
		unsigned int amount;
		struct buffer_head *bh;
		int new;

		/* Clamp this chunk to what is left of the current block */
		amount = size - copied;
		if (amount > sdp->sd_sb.sb_bsize - o)
			amount = sdp->sd_sb.sb_bsize - o;

		if (!extlen) {
			/* Extent exhausted: look up the next one (new = 0) */
			new = 0;
			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
						&dblock, &extlen);
			/* A zero dblock with no error also ends the read */
			if (error || !dblock)
				goto fail;
			BUG_ON(extlen < 1);
			if (!ra)
				extlen = 1;
			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
		} else {
			/* Still inside the extent mapped earlier */
			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
			if (error)
				goto fail;
		}
		error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
		if (error) {
			brelse(bh);
			goto fail;
		}
		dblock++;
		extlen--;
		memcpy(buf, bh->b_data + o, amount);
		brelse(bh);
		buf += amount;
		copied += amount;
		lblock++;
		/* Every block after the first is read from its start */
		o = sizeof(struct gfs2_meta_header);
	}

	return copied;
fail:
	/* A partial read takes precedence over the error code */
	return (copied) ? copied : error;
}
340 
341 static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
342 {
343 	return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0;
344 }
345 
346 static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
347 				     const struct qstr *name, int ret)
348 {
349 	if (!gfs2_dirent_sentinel(dent) &&
350 	    be32_to_cpu(dent->de_hash) == name->hash &&
351 	    be16_to_cpu(dent->de_name_len) == name->len &&
352 	    memcmp(dent+1, name->name, name->len) == 0)
353 		return ret;
354 	return 0;
355 }
356 
/*
 * Scan callback: report a matching dirent with code 1, which
 * gfs2_dirent_scan() turns into a pointer to the matching dirent itself.
 */
static int gfs2_dirent_find(const struct gfs2_dirent *dent,
			    const struct qstr *name,
			    void *opaque)
{
	const int select_current = 1;

	return __gfs2_dirent_find(dent, name, select_current);
}
363 
/*
 * Scan callback: report a matching dirent with code 2, which
 * gfs2_dirent_scan() turns into a pointer to the dirent preceding the match
 * (or the match itself when it is the first entry in the block).
 */
static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
			    const struct qstr *name,
			    void *opaque)
{
	const int select_previous = 2;

	return __gfs2_dirent_find(dent, name, select_previous);
}
370 
371 /*
372  * name->name holds ptr to start of block.
373  * name->len holds size of block.
374  */
375 static int gfs2_dirent_last(const struct gfs2_dirent *dent,
376 			    const struct qstr *name,
377 			    void *opaque)
378 {
379 	const char *start = name->name;
380 	const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
381 	if (name->len == (end - start))
382 		return 1;
383 	return 0;
384 }
385 
386 static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
387 				  const struct qstr *name,
388 				  void *opaque)
389 {
390 	unsigned required = GFS2_DIRENT_SIZE(name->len);
391 	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
392 	unsigned totlen = be16_to_cpu(dent->de_rec_len);
393 
394 	if (gfs2_dirent_sentinel(dent))
395 		actual = GFS2_DIRENT_SIZE(0);
396 	if (totlen - actual >= required)
397 		return 1;
398 	return 0;
399 }
400 
/* Accumulator passed as the opaque argument to gfs2_dirent_gather() */
struct dirent_gather {
	const struct gfs2_dirent **pdent;	/* array receiving dirent pointers */
	unsigned offset;			/* next free slot in pdent */
};
405 
406 static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
407 			      const struct qstr *name,
408 			      void *opaque)
409 {
410 	struct dirent_gather *g = opaque;
411 	if (!gfs2_dirent_sentinel(dent)) {
412 		g->pdent[g->offset++] = dent;
413 	}
414 	return 0;
415 }
416 
417 /*
418  * Other possible things to check:
419  * - Inode located within filesystem size (and on valid block)
420  * - Valid directory entry type
421  * Not sure how heavy-weight we want to make this... could also check
422  * hash is correct for example, but that would take a lot of extra time.
423  * For now the most important thing is to check that the various sizes
424  * are correct.
425  */
426 static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
427 			     unsigned int size, unsigned int len, int first)
428 {
429 	const char *msg = "gfs2_dirent too small";
430 	if (unlikely(size < sizeof(struct gfs2_dirent)))
431 		goto error;
432 	msg = "gfs2_dirent misaligned";
433 	if (unlikely(offset & 0x7))
434 		goto error;
435 	msg = "gfs2_dirent points beyond end of block";
436 	if (unlikely(offset + size > len))
437 		goto error;
438 	msg = "zero inode number";
439 	if (unlikely(!first && gfs2_dirent_sentinel(dent)))
440 		goto error;
441 	msg = "name length is greater than space in dirent";
442 	if (!gfs2_dirent_sentinel(dent) &&
443 	    unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
444 		     size))
445 		goto error;
446 	return 0;
447 error:
448 	printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
449 	       first ? "first in block" : "not first in block");
450 	return -EIO;
451 }
452 
453 static int gfs2_dirent_offset(const void *buf)
454 {
455 	const struct gfs2_meta_header *h = buf;
456 	int offset;
457 
458 	BUG_ON(buf == NULL);
459 
460 	switch(be32_to_cpu(h->mh_type)) {
461 	case GFS2_METATYPE_LF:
462 		offset = sizeof(struct gfs2_leaf);
463 		break;
464 	case GFS2_METATYPE_DI:
465 		offset = sizeof(struct gfs2_dinode);
466 		break;
467 	default:
468 		goto wrong_type;
469 	}
470 	return offset;
471 wrong_type:
472 	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
473 	       be32_to_cpu(h->mh_type));
474 	return -1;
475 }
476 
/*
 * Walk every dirent in the block @buf of length @len, validating each one
 * with gfs2_check_dirent() and passing it to @scan together with @name and
 * @opaque, until @scan returns non-zero or the end of the block is reached.
 *
 * Returns NULL if @scan never returned non-zero, the current dirent if it
 * returned 1, the previous dirent (or the current one if there is no
 * previous) if it returned 2, ERR_PTR(ret) for a negative return, and
 * ERR_PTR(-EIO) after flagging the inode inconsistent when the block type
 * or a dirent fails validation.
 */
static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
					    unsigned int len, gfs2_dscan_t scan,
					    const struct qstr *name,
					    void *opaque)
{
	struct gfs2_dirent *dent, *prev;
	unsigned offset;
	unsigned size;
	int ret = 0;

	/* Locate the first dirent from the block's metadata type */
	ret = gfs2_dirent_offset(buf);
	if (ret < 0)
		goto consist_inode;

	offset = ret;
	prev = NULL;
	dent = buf + offset;
	size = be16_to_cpu(dent->de_rec_len);
	if (gfs2_check_dirent(dent, offset, size, len, 1))
		goto consist_inode;
	do {
		ret = scan(dent, name, opaque);
		if (ret)
			break;
		/* Step over this record; hitting len exactly ends the scan */
		offset += size;
		if (offset == len)
			break;
		prev = dent;
		dent = buf + offset;
		size = be16_to_cpu(dent->de_rec_len);
		if (gfs2_check_dirent(dent, offset, size, len, 0))
			goto consist_inode;
	} while(1);

	switch(ret) {
	case 0:
		return NULL;
	case 1:
		return dent;
	case 2:
		return prev ? prev : dent;
	default:
		/* Any other positive code is a bug in the callback */
		BUG_ON(ret > 0);
		return ERR_PTR(ret);
	}

consist_inode:
	gfs2_consist_inode(GFS2_I(inode));
	return ERR_PTR(-EIO);
}
527 
528 static int dirent_check_reclen(struct gfs2_inode *dip,
529 			       const struct gfs2_dirent *d, const void *end_p)
530 {
531 	const void *ptr = d;
532 	u16 rec_len = be16_to_cpu(d->de_rec_len);
533 
534 	if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
535 		goto broken;
536 	ptr += rec_len;
537 	if (ptr < end_p)
538 		return rec_len;
539 	if (ptr == end_p)
540 		return -ENOENT;
541 broken:
542 	gfs2_consist_inode(dip);
543 	return -EIO;
544 }
545 
546 /**
547  * dirent_next - Next dirent
548  * @dip: the directory
549  * @bh: The buffer
550  * @dent: Pointer to list of dirents
551  *
552  * Returns: 0 on success, error code otherwise
553  */
554 
555 static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
556 		       struct gfs2_dirent **dent)
557 {
558 	struct gfs2_dirent *cur = *dent, *tmp;
559 	char *bh_end = bh->b_data + bh->b_size;
560 	int ret;
561 
562 	ret = dirent_check_reclen(dip, cur, bh_end);
563 	if (ret < 0)
564 		return ret;
565 
566 	tmp = (void *)cur + ret;
567 	ret = dirent_check_reclen(dip, tmp, bh_end);
568 	if (ret == -EIO)
569 		return ret;
570 
571         /* Only the first dent could ever have de_inum.no_addr == 0 */
572 	if (gfs2_dirent_sentinel(tmp)) {
573 		gfs2_consist_inode(dip);
574 		return -EIO;
575 	}
576 
577 	*dent = tmp;
578 	return 0;
579 }
580 
581 /**
582  * dirent_del - Delete a dirent
583  * @dip: The GFS2 inode
584  * @bh: The buffer
585  * @prev: The previous dirent
586  * @cur: The current dirent
587  *
588  */
589 
590 static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
591 		       struct gfs2_dirent *prev, struct gfs2_dirent *cur)
592 {
593 	u16 cur_rec_len, prev_rec_len;
594 
595 	if (gfs2_dirent_sentinel(cur)) {
596 		gfs2_consist_inode(dip);
597 		return;
598 	}
599 
600 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
601 
602 	/* If there is no prev entry, this is the first entry in the block.
603 	   The de_rec_len is already as big as it needs to be.  Just zero
604 	   out the inode number and return.  */
605 
606 	if (!prev) {
607 		cur->de_inum.no_addr = 0;
608 		cur->de_inum.no_formal_ino = 0;
609 		return;
610 	}
611 
612 	/*  Combine this dentry with the previous one.  */
613 
614 	prev_rec_len = be16_to_cpu(prev->de_rec_len);
615 	cur_rec_len = be16_to_cpu(cur->de_rec_len);
616 
617 	if ((char *)prev + prev_rec_len != (char *)cur)
618 		gfs2_consist_inode(dip);
619 	if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
620 		gfs2_consist_inode(dip);
621 
622 	prev_rec_len += cur_rec_len;
623 	prev->de_rec_len = cpu_to_be16(prev_rec_len);
624 }
625 
626 /*
627  * Takes a dent from which to grab space as an argument. Returns the
628  * newly created dent.
629  */
630 static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
631 					    struct gfs2_dirent *dent,
632 					    const struct qstr *name,
633 					    struct buffer_head *bh)
634 {
635 	struct gfs2_inode *ip = GFS2_I(inode);
636 	struct gfs2_dirent *ndent;
637 	unsigned offset = 0, totlen;
638 
639 	if (!gfs2_dirent_sentinel(dent))
640 		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
641 	totlen = be16_to_cpu(dent->de_rec_len);
642 	BUG_ON(offset + name->len > totlen);
643 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
644 	ndent = (struct gfs2_dirent *)((char *)dent + offset);
645 	dent->de_rec_len = cpu_to_be16(offset);
646 	gfs2_qstr2dirent(name, totlen - offset, ndent);
647 	return ndent;
648 }
649 
650 static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
651 					     struct buffer_head *bh,
652 					     const struct qstr *name)
653 {
654 	struct gfs2_dirent *dent;
655 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
656 				gfs2_dirent_find_space, name, NULL);
657 	if (!dent || IS_ERR(dent))
658 		return dent;
659 	return gfs2_init_dirent(inode, dent, name, bh);
660 }
661 
662 static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
663 		    struct buffer_head **bhp)
664 {
665 	int error;
666 
667 	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
668 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
669 		/* printk(KERN_INFO "block num=%llu\n", leaf_no); */
670 		error = -EIO;
671 	}
672 
673 	return error;
674 }
675 
676 /**
677  * get_leaf_nr - Get a leaf number associated with the index
678  * @dip: The GFS2 inode
679  * @index:
680  * @leaf_out:
681  *
682  * Returns: 0 on success, error code otherwise
683  */
684 
685 static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
686 		       u64 *leaf_out)
687 {
688 	__be64 leaf_no;
689 	int error;
690 
691 	error = gfs2_dir_read_data(dip, (char *)&leaf_no,
692 				    index * sizeof(__be64),
693 				    sizeof(__be64), 0);
694 	if (error != sizeof(u64))
695 		return (error < 0) ? error : -EIO;
696 
697 	*leaf_out = be64_to_cpu(leaf_no);
698 
699 	return 0;
700 }
701 
702 static int get_first_leaf(struct gfs2_inode *dip, u32 index,
703 			  struct buffer_head **bh_out)
704 {
705 	u64 leaf_no;
706 	int error;
707 
708 	error = get_leaf_nr(dip, index, &leaf_no);
709 	if (!error)
710 		error = get_leaf(dip, leaf_no, bh_out);
711 
712 	return error;
713 }
714 
/*
 * Search directory @inode for @name using scan callback @scan.
 *
 * For an exhash directory the leaf selected by the top i_depth bits of the
 * name hash is scanned, followed by every leaf chained to it via lf_next.
 * For a linear directory the dinode block itself is scanned.
 *
 * Returns the dirent selected by @scan with its buffer stored in *pbh (the
 * caller then owns that reference), NULL if nothing was selected, or an
 * ERR_PTR() on error.  *pbh is set to NULL when the linear scan, or a leaf
 * scan that returned an error pointer, yields no dirent; it is left
 * untouched on the early-return paths inside the exhash branch.
 */
static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
					      const struct qstr *name,
					      gfs2_dscan_t scan,
					      struct buffer_head **pbh)
{
	struct buffer_head *bh;
	struct gfs2_dirent *dent;
	struct gfs2_inode *ip = GFS2_I(inode);
	int error;

	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
		struct gfs2_leaf *leaf;
		unsigned hsize = 1 << ip->i_depth;
		unsigned index;
		u64 ln;
		/* The table must hold exactly 2^i_depth 64-bit pointers */
		if (hsize * sizeof(u64) != ip->i_disksize) {
			gfs2_consist_inode(ip);
			return ERR_PTR(-EIO);
		}

		index = name->hash >> (32 - ip->i_depth);
		error = get_first_leaf(ip, index, &bh);
		if (error)
			return ERR_PTR(error);
		do {
			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
						scan, name, NULL);
			/* Non-NULL covers both a hit and an error pointer */
			if (dent)
				goto got_dent;
			/* Nothing in this leaf: follow the chain, if any */
			leaf = (struct gfs2_leaf *)bh->b_data;
			ln = be64_to_cpu(leaf->lf_next);
			brelse(bh);
			if (!ln)
				break;

			error = get_leaf(ip, ln, &bh);
		} while(!error);

		return error ? ERR_PTR(error) : NULL;
	}


	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error)
		return ERR_PTR(error);
	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
got_dent:
	/* Keep the buffer only when a real dirent is handed back */
	if (unlikely(dent == NULL || IS_ERR(dent))) {
		brelse(bh);
		bh = NULL;
	}
	*pbh = bh;
	return dent;
}
769 
770 static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
771 {
772 	struct gfs2_inode *ip = GFS2_I(inode);
773 	unsigned int n = 1;
774 	u64 bn;
775 	int error;
776 	struct buffer_head *bh;
777 	struct gfs2_leaf *leaf;
778 	struct gfs2_dirent *dent;
779 	struct qstr name = { .name = "", .len = 0, .hash = 0 };
780 
781 	error = gfs2_alloc_block(ip, &bn, &n);
782 	if (error)
783 		return NULL;
784 	bh = gfs2_meta_new(ip->i_gl, bn);
785 	if (!bh)
786 		return NULL;
787 
788 	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
789 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
790 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
791 	leaf = (struct gfs2_leaf *)bh->b_data;
792 	leaf->lf_depth = cpu_to_be16(depth);
793 	leaf->lf_entries = 0;
794 	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
795 	leaf->lf_next = 0;
796 	memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
797 	dent = (struct gfs2_dirent *)(leaf+1);
798 	gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
799 	*pbh = bh;
800 	return leaf;
801 }
802 
803 /**
804  * dir_make_exhash - Convert a stuffed directory into an ExHash directory
805  * @dip: The GFS2 inode
806  *
807  * Returns: 0 on success, error code otherwise
808  */
809 
810 static int dir_make_exhash(struct inode *inode)
811 {
812 	struct gfs2_inode *dip = GFS2_I(inode);
813 	struct gfs2_sbd *sdp = GFS2_SB(inode);
814 	struct gfs2_dirent *dent;
815 	struct qstr args;
816 	struct buffer_head *bh, *dibh;
817 	struct gfs2_leaf *leaf;
818 	int y;
819 	u32 x;
820 	__be64 *lp;
821 	u64 bn;
822 	int error;
823 
824 	error = gfs2_meta_inode_buffer(dip, &dibh);
825 	if (error)
826 		return error;
827 
828 	/*  Turn over a new leaf  */
829 
830 	leaf = new_leaf(inode, &bh, 0);
831 	if (!leaf)
832 		return -ENOSPC;
833 	bn = bh->b_blocknr;
834 
835 	gfs2_assert(sdp, dip->i_entries < (1 << 16));
836 	leaf->lf_entries = cpu_to_be16(dip->i_entries);
837 
838 	/*  Copy dirents  */
839 
840 	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
841 			     sizeof(struct gfs2_dinode));
842 
843 	/*  Find last entry  */
844 
845 	x = 0;
846 	args.len = bh->b_size - sizeof(struct gfs2_dinode) +
847 		   sizeof(struct gfs2_leaf);
848 	args.name = bh->b_data;
849 	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
850 				gfs2_dirent_last, &args, NULL);
851 	if (!dent) {
852 		brelse(bh);
853 		brelse(dibh);
854 		return -EIO;
855 	}
856 	if (IS_ERR(dent)) {
857 		brelse(bh);
858 		brelse(dibh);
859 		return PTR_ERR(dent);
860 	}
861 
862 	/*  Adjust the last dirent's record length
863 	   (Remember that dent still points to the last entry.)  */
864 
865 	dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
866 		sizeof(struct gfs2_dinode) -
867 		sizeof(struct gfs2_leaf));
868 
869 	brelse(bh);
870 
871 	/*  We're done with the new leaf block, now setup the new
872 	    hash table.  */
873 
874 	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
875 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
876 
877 	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
878 
879 	for (x = sdp->sd_hash_ptrs; x--; lp++)
880 		*lp = cpu_to_be64(bn);
881 
882 	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
883 	gfs2_add_inode_blocks(&dip->i_inode, 1);
884 	dip->i_diskflags |= GFS2_DIF_EXHASH;
885 
886 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
887 	dip->i_depth = y;
888 
889 	gfs2_dinode_out(dip, dibh->b_data);
890 
891 	brelse(dibh);
892 
893 	return 0;
894 }
895 
/**
 * dir_split_leaf - Split a leaf block into two
 * @inode: The directory inode
 * @name: The name whose hash selects the leaf to split
 *
 * A new leaf is allocated with a depth one greater than the old leaf's.
 * The first half of the hash table range that pointed at the old leaf is
 * redirected to the new leaf, and every dirent whose hash lies below the
 * resulting divider is moved over.
 *
 * Returns: 0 on success, 1 if the leaf cannot be split because its depth
 *          already equals the directory depth, error code on failure
 */

static int dir_split_leaf(struct inode *inode, const struct qstr *name)
{
	struct gfs2_inode *dip = GFS2_I(inode);
	struct buffer_head *nbh, *obh, *dibh;
	struct gfs2_leaf *nleaf, *oleaf;
	struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
	u32 start, len, half_len, divider;
	u64 bn, leaf_no;
	__be64 *lp;
	u32 index;
	int x, moved = 0;
	int error;

	/* The top i_depth bits of the hash index the table of leaf pointers */
	index = name->hash >> (32 - dip->i_depth);
	error = get_leaf_nr(dip, index, &leaf_no);
	if (error)
		return error;

	/*  Get the old leaf block  */
	error = get_leaf(dip, leaf_no, &obh);
	if (error)
		return error;

	oleaf = (struct gfs2_leaf *)obh->b_data;
	if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
		brelse(obh);
		return 1; /* can't split */
	}

	gfs2_trans_add_bh(dip->i_gl, obh, 1);

	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
	if (!nleaf) {
		brelse(obh);
		return -ENOSPC;
	}
	bn = nbh->b_blocknr;

	/*  Compute the start and len of leaf pointers in the hash table.  */
	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
	half_len = len >> 1;
	if (!half_len) {
		printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
		gfs2_consist_inode(dip);
		error = -EIO;
		goto fail_brelse;
	}

	/* First table slot of the len-aligned group containing index */
	start = (index & ~(len - 1));

	/* Change the pointers.
	   Don't bother distinguishing stuffed from non-stuffed.
	   This code is complicated enough already. */
	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL);
	/*  Change the pointers  */
	for (x = 0; x < half_len; x++)
		lp[x] = cpu_to_be64(bn);

	/* Point the first half of the group at the new leaf */
	error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
				    half_len * sizeof(u64));
	if (error != half_len * sizeof(u64)) {
		/* A short write is reported as an I/O error */
		if (error >= 0)
			error = -EIO;
		goto fail_lpfree;
	}

	kfree(lp);

	/*  Compute the divider: the lowest hash that stays in the old leaf  */
	divider = (start + half_len) << (32 - dip->i_depth);

	/*  Copy the entries  */
	dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));

	do {
		/* Fetch the successor first, before dent may be deleted;
		   any failure from dirent_next() ends the walk */
		next = dent;
		if (dirent_next(dip, obh, &next))
			next = NULL;

		if (!gfs2_dirent_sentinel(dent) &&
		    be32_to_cpu(dent->de_hash) < divider) {
			struct qstr str;
			str.name = (char*)(dent+1);
			str.len = be16_to_cpu(dent->de_name_len);
			str.hash = be32_to_cpu(dent->de_hash);
			new = gfs2_dirent_alloc(inode, nbh, &str);
			/*
			 * NOTE(review): a NULL return (no space found) is not
			 * checked here, and the error stored before the break
			 * is overwritten by gfs2_meta_inode_buffer() below.
			 */
			if (IS_ERR(new)) {
				error = PTR_ERR(new);
				break;
			}

			new->de_inum = dent->de_inum; /* No endian worries */
			new->de_type = dent->de_type; /* No endian worries */
			be16_add_cpu(&nleaf->lf_entries, 1);

			dirent_del(dip, obh, prev, dent);

			if (!oleaf->lf_entries)
				gfs2_consist_inode(dip);
			be16_add_cpu(&oleaf->lf_entries, -1);

			/* A deleted first entry stays in place as the
			   sentinel and becomes prev for what follows */
			if (!prev)
				prev = dent;

			/* NOTE(review): moved is set but never read in this
			   function */
			moved = 1;
		} else {
			prev = dent;
		}
		dent = next;
	} while (dent);

	oleaf->lf_depth = nleaf->lf_depth;

	/* Account for the newly added leaf block in the dinode */
	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
		gfs2_add_inode_blocks(&dip->i_inode, 1);
		gfs2_dinode_out(dip, dibh->b_data);
		brelse(dibh);
	}

	brelse(obh);
	brelse(nbh);

	return error;

fail_lpfree:
	kfree(lp);

fail_brelse:
	brelse(obh);
	brelse(nbh);
	return error;
}
1039 
/**
 * dir_double_exhash - Double size of ExHash table
 * @dip: The GFS2 dinode
 *
 * Each chunk of sd_hash_bsize bytes of the existing table is read, every
 * pointer in it is written out twice in a row, and the expanded chunk is
 * written back.  Chunks are processed from the end of the table towards
 * the start.  Finally i_depth is incremented.
 *
 * Returns: 0 on success, error code on failure
 */

static int dir_double_exhash(struct gfs2_inode *dip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
	struct buffer_head *dibh;
	u32 hsize;
	u64 *buf;
	u64 *from, *to;
	u64 block;
	int x;
	int error = 0;

	/* The table must hold exactly 2^i_depth 64-bit pointers */
	hsize = 1 << dip->i_depth;
	if (hsize * sizeof(u64) != dip->i_disksize) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	/*  Allocate both the "from" and "to" buffers in one big chunk:
	    the first sd_hash_bsize bytes receive the data read, the rest
	    receives the doubled pointers  */

	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);

	/* Walk the chunks from last to first; presumably so that a doubled
	   chunk never overwrites table data not yet read - TODO confirm */
	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
		error = gfs2_dir_read_data(dip, (char *)buf,
					    block * sdp->sd_hash_bsize,
					    sdp->sd_hash_bsize, 1);
		if (error != sdp->sd_hash_bsize) {
			/* A short read is reported as an I/O error */
			if (error >= 0)
				error = -EIO;
			goto fail;
		}

		from = buf;
		to = (u64 *)((char *)buf + sdp->sd_hash_bsize);

		/* Emit each source pointer twice, back to back */
		for (x = sdp->sd_hash_ptrs; x--; from++) {
			*to++ = *from;	/*  No endianness worries  */
			*to++ = *from;
		}

		/* NOTE(review): writes sb_bsize bytes, presumably equal to
		   2 * sd_hash_bsize - confirm against the superblock setup */
		error = gfs2_dir_write_data(dip,
					     (char *)buf + sdp->sd_hash_bsize,
					     block * sdp->sd_sb.sb_bsize,
					     sdp->sd_sb.sb_bsize);
		if (error != sdp->sd_sb.sb_bsize) {
			/* A short write is reported as an I/O error */
			if (error >= 0)
				error = -EIO;
			goto fail;
		}
	}

	kfree(buf);

	/* Record the new depth in the dinode */
	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (!gfs2_assert_withdraw(sdp, !error)) {
		dip->i_depth++;
		gfs2_dinode_out(dip, dibh->b_data);
		brelse(dibh);
	}

	return error;

fail:
	kfree(buf);
	return error;
}
1112 
1113 /**
1114  * compare_dents - compare directory entries by hash value
1115  * @a: first dent
1116  * @b: second dent
1117  *
1118  * When comparing the hash entries of @a to @b:
1119  *   gt: returns 1
1120  *   lt: returns -1
1121  *   eq: returns 0
1122  */
1123 
1124 static int compare_dents(const void *a, const void *b)
1125 {
1126 	const struct gfs2_dirent *dent_a, *dent_b;
1127 	u32 hash_a, hash_b;
1128 	int ret = 0;
1129 
1130 	dent_a = *(const struct gfs2_dirent **)a;
1131 	hash_a = be32_to_cpu(dent_a->de_hash);
1132 
1133 	dent_b = *(const struct gfs2_dirent **)b;
1134 	hash_b = be32_to_cpu(dent_b->de_hash);
1135 
1136 	if (hash_a > hash_b)
1137 		ret = 1;
1138 	else if (hash_a < hash_b)
1139 		ret = -1;
1140 	else {
1141 		unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1142 		unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1143 
1144 		if (len_a > len_b)
1145 			ret = 1;
1146 		else if (len_a < len_b)
1147 			ret = -1;
1148 		else
1149 			ret = memcmp(dent_a + 1, dent_b + 1, len_a);
1150 	}
1151 
1152 	return ret;
1153 }
1154 
1155 /**
1156  * do_filldir_main - read out directory entries
1157  * @dip: The GFS2 inode
1158  * @offset: The offset in the file to read from
1159  * @opaque: opaque data to pass to filldir
1160  * @filldir: The function to pass entries to
1161  * @darr: an array of struct gfs2_dirent pointers to read
1162  * @entries: the number of entries in darr
1163  * @copied: pointer to int that's non-zero if a entry has been copied out
1164  *
1165  * Jump through some hoops to make sure that if there are hash collsions,
1166  * they are read out at the beginning of a buffer.  We want to minimize
1167  * the possibility that they will fall into different readdir buffers or
1168  * that someone will want to seek to that location.
1169  *
1170  * Returns: errno, >0 on exception from filldir
1171  */
1172 
1173 static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1174 			   void *opaque, filldir_t filldir,
1175 			   const struct gfs2_dirent **darr, u32 entries,
1176 			   int *copied)
1177 {
1178 	const struct gfs2_dirent *dent, *dent_next;
1179 	u64 off, off_next;
1180 	unsigned int x, y;
1181 	int run = 0;
1182 	int error = 0;
1183 
1184 	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1185 
1186 	dent_next = darr[0];
1187 	off_next = be32_to_cpu(dent_next->de_hash);
1188 	off_next = gfs2_disk_hash2offset(off_next);
1189 
1190 	for (x = 0, y = 1; x < entries; x++, y++) {
1191 		dent = dent_next;
1192 		off = off_next;
1193 
1194 		if (y < entries) {
1195 			dent_next = darr[y];
1196 			off_next = be32_to_cpu(dent_next->de_hash);
1197 			off_next = gfs2_disk_hash2offset(off_next);
1198 
1199 			if (off < *offset)
1200 				continue;
1201 			*offset = off;
1202 
1203 			if (off_next == off) {
1204 				if (*copied && !run)
1205 					return 1;
1206 				run = 1;
1207 			} else
1208 				run = 0;
1209 		} else {
1210 			if (off < *offset)
1211 				continue;
1212 			*offset = off;
1213 		}
1214 
1215 		error = filldir(opaque, (const char *)(dent + 1),
1216 				be16_to_cpu(dent->de_name_len),
1217 				off, be64_to_cpu(dent->de_inum.no_addr),
1218 				be16_to_cpu(dent->de_type));
1219 		if (error)
1220 			return 1;
1221 
1222 		*copied = 1;
1223 	}
1224 
1225 	/* Increment the *offset by one, so the next time we come into the
1226 	   do_filldir fxn, we get the next entry instead of the last one in the
1227 	   current leaf */
1228 
1229 	(*offset)++;
1230 
1231 	return 0;
1232 }
1233 
1234 static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1235 			      filldir_t filldir, int *copied, unsigned *depth,
1236 			      u64 leaf_no)
1237 {
1238 	struct gfs2_inode *ip = GFS2_I(inode);
1239 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1240 	struct buffer_head *bh;
1241 	struct gfs2_leaf *lf;
1242 	unsigned entries = 0, entries2 = 0;
1243 	unsigned leaves = 0;
1244 	const struct gfs2_dirent **darr, *dent;
1245 	struct dirent_gather g;
1246 	struct buffer_head **larr;
1247 	int leaf = 0;
1248 	int error, i;
1249 	u64 lfn = leaf_no;
1250 
1251 	do {
1252 		error = get_leaf(ip, lfn, &bh);
1253 		if (error)
1254 			goto out;
1255 		lf = (struct gfs2_leaf *)bh->b_data;
1256 		if (leaves == 0)
1257 			*depth = be16_to_cpu(lf->lf_depth);
1258 		entries += be16_to_cpu(lf->lf_entries);
1259 		leaves++;
1260 		lfn = be64_to_cpu(lf->lf_next);
1261 		brelse(bh);
1262 	} while(lfn);
1263 
1264 	if (!entries)
1265 		return 0;
1266 
1267 	error = -ENOMEM;
1268 	/*
1269 	 * The extra 99 entries are not normally used, but are a buffer
1270 	 * zone in case the number of entries in the leaf is corrupt.
1271 	 * 99 is the maximum number of entries that can fit in a single
1272 	 * leaf block.
1273 	 */
1274 	larr = vmalloc((leaves + entries + 99) * sizeof(void *));
1275 	if (!larr)
1276 		goto out;
1277 	darr = (const struct gfs2_dirent **)(larr + leaves);
1278 	g.pdent = darr;
1279 	g.offset = 0;
1280 	lfn = leaf_no;
1281 
1282 	do {
1283 		error = get_leaf(ip, lfn, &bh);
1284 		if (error)
1285 			goto out_kfree;
1286 		lf = (struct gfs2_leaf *)bh->b_data;
1287 		lfn = be64_to_cpu(lf->lf_next);
1288 		if (lf->lf_entries) {
1289 			entries2 += be16_to_cpu(lf->lf_entries);
1290 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1291 						gfs2_dirent_gather, NULL, &g);
1292 			error = PTR_ERR(dent);
1293 			if (IS_ERR(dent))
1294 				goto out_kfree;
1295 			if (entries2 != g.offset) {
1296 				fs_warn(sdp, "Number of entries corrupt in dir "
1297 						"leaf %llu, entries2 (%u) != "
1298 						"g.offset (%u)\n",
1299 					(unsigned long long)bh->b_blocknr,
1300 					entries2, g.offset);
1301 
1302 				error = -EIO;
1303 				goto out_kfree;
1304 			}
1305 			error = 0;
1306 			larr[leaf++] = bh;
1307 		} else {
1308 			brelse(bh);
1309 		}
1310 	} while(lfn);
1311 
1312 	BUG_ON(entries2 != entries);
1313 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
1314 				entries, copied);
1315 out_kfree:
1316 	for(i = 0; i < leaf; i++)
1317 		brelse(larr[i]);
1318 	vfree(larr);
1319 out:
1320 	return error;
1321 }
1322 
1323 /**
1324  * dir_e_read - Reads the entries from a directory into a filldir buffer
1325  * @dip: dinode pointer
1326  * @offset: the hash of the last entry read shifted to the right once
1327  * @opaque: buffer for the filldir function to fill
1328  * @filldir: points to the filldir function to use
1329  *
1330  * Returns: errno
1331  */
1332 
1333 static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1334 		      filldir_t filldir)
1335 {
1336 	struct gfs2_inode *dip = GFS2_I(inode);
1337 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1338 	u32 hsize, len = 0;
1339 	u32 ht_offset, lp_offset, ht_offset_cur = -1;
1340 	u32 hash, index;
1341 	__be64 *lp;
1342 	int copied = 0;
1343 	int error = 0;
1344 	unsigned depth = 0;
1345 
1346 	hsize = 1 << dip->i_depth;
1347 	if (hsize * sizeof(u64) != dip->i_disksize) {
1348 		gfs2_consist_inode(dip);
1349 		return -EIO;
1350 	}
1351 
1352 	hash = gfs2_dir_offset2hash(*offset);
1353 	index = hash >> (32 - dip->i_depth);
1354 
1355 	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1356 	if (!lp)
1357 		return -ENOMEM;
1358 
1359 	while (index < hsize) {
1360 		lp_offset = index & (sdp->sd_hash_ptrs - 1);
1361 		ht_offset = index - lp_offset;
1362 
1363 		if (ht_offset_cur != ht_offset) {
1364 			error = gfs2_dir_read_data(dip, (char *)lp,
1365 						ht_offset * sizeof(__be64),
1366 						sdp->sd_hash_bsize, 1);
1367 			if (error != sdp->sd_hash_bsize) {
1368 				if (error >= 0)
1369 					error = -EIO;
1370 				goto out;
1371 			}
1372 			ht_offset_cur = ht_offset;
1373 		}
1374 
1375 		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1376 					   &copied, &depth,
1377 					   be64_to_cpu(lp[lp_offset]));
1378 		if (error)
1379 			break;
1380 
1381 		len = 1 << (dip->i_depth - depth);
1382 		index = (index & ~(len - 1)) + len;
1383 	}
1384 
1385 out:
1386 	kfree(lp);
1387 	if (error > 0)
1388 		error = 0;
1389 	return error;
1390 }
1391 
1392 int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1393 		  filldir_t filldir)
1394 {
1395 	struct gfs2_inode *dip = GFS2_I(inode);
1396 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1397 	struct dirent_gather g;
1398 	const struct gfs2_dirent **darr, *dent;
1399 	struct buffer_head *dibh;
1400 	int copied = 0;
1401 	int error;
1402 
1403 	if (!dip->i_entries)
1404 		return 0;
1405 
1406 	if (dip->i_diskflags & GFS2_DIF_EXHASH)
1407 		return dir_e_read(inode, offset, opaque, filldir);
1408 
1409 	if (!gfs2_is_stuffed(dip)) {
1410 		gfs2_consist_inode(dip);
1411 		return -EIO;
1412 	}
1413 
1414 	error = gfs2_meta_inode_buffer(dip, &dibh);
1415 	if (error)
1416 		return error;
1417 
1418 	error = -ENOMEM;
1419 	/* 96 is max number of dirents which can be stuffed into an inode */
1420 	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
1421 	if (darr) {
1422 		g.pdent = darr;
1423 		g.offset = 0;
1424 		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1425 					gfs2_dirent_gather, NULL, &g);
1426 		if (IS_ERR(dent)) {
1427 			error = PTR_ERR(dent);
1428 			goto out;
1429 		}
1430 		if (dip->i_entries != g.offset) {
1431 			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1432 				"ip->i_entries (%u) != g.offset (%u)\n",
1433 				(unsigned long long)dip->i_no_addr,
1434 				dip->i_entries,
1435 				g.offset);
1436 			error = -EIO;
1437 			goto out;
1438 		}
1439 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
1440 					dip->i_entries, &copied);
1441 out:
1442 		kfree(darr);
1443 	}
1444 
1445 	if (error > 0)
1446 		error = 0;
1447 
1448 	brelse(dibh);
1449 
1450 	return error;
1451 }
1452 
1453 /**
1454  * gfs2_dir_search - Search a directory
1455  * @dip: The GFS2 inode
1456  * @filename:
1457  * @inode:
1458  *
1459  * This routine searches a directory for a file or another directory.
1460  * Assumes a glock is held on dip.
1461  *
1462  * Returns: errno
1463  */
1464 
1465 struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1466 {
1467 	struct buffer_head *bh;
1468 	struct gfs2_dirent *dent;
1469 	struct inode *inode;
1470 
1471 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1472 	if (dent) {
1473 		if (IS_ERR(dent))
1474 			return ERR_CAST(dent);
1475 		inode = gfs2_inode_lookup(dir->i_sb,
1476 				be16_to_cpu(dent->de_type),
1477 				be64_to_cpu(dent->de_inum.no_addr),
1478 				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1479 		brelse(bh);
1480 		return inode;
1481 	}
1482 	return ERR_PTR(-ENOENT);
1483 }
1484 
1485 int gfs2_dir_check(struct inode *dir, const struct qstr *name,
1486 		   const struct gfs2_inode *ip)
1487 {
1488 	struct buffer_head *bh;
1489 	struct gfs2_dirent *dent;
1490 	int ret = -ENOENT;
1491 
1492 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1493 	if (dent) {
1494 		if (IS_ERR(dent))
1495 			return PTR_ERR(dent);
1496 		if (ip) {
1497 			if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr)
1498 				goto out;
1499 			if (be64_to_cpu(dent->de_inum.no_formal_ino) !=
1500 			    ip->i_no_formal_ino)
1501 				goto out;
1502 			if (unlikely(IF2DT(ip->i_inode.i_mode) !=
1503 			    be16_to_cpu(dent->de_type))) {
1504 				gfs2_consist_inode(GFS2_I(dir));
1505 				ret = -EIO;
1506 				goto out;
1507 			}
1508 		}
1509 		ret = 0;
1510 out:
1511 		brelse(bh);
1512 	}
1513 	return ret;
1514 }
1515 
1516 static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1517 {
1518 	struct buffer_head *bh, *obh;
1519 	struct gfs2_inode *ip = GFS2_I(inode);
1520 	struct gfs2_leaf *leaf, *oleaf;
1521 	int error;
1522 	u32 index;
1523 	u64 bn;
1524 
1525 	index = name->hash >> (32 - ip->i_depth);
1526 	error = get_first_leaf(ip, index, &obh);
1527 	if (error)
1528 		return error;
1529 	do {
1530 		oleaf = (struct gfs2_leaf *)obh->b_data;
1531 		bn = be64_to_cpu(oleaf->lf_next);
1532 		if (!bn)
1533 			break;
1534 		brelse(obh);
1535 		error = get_leaf(ip, bn, &obh);
1536 		if (error)
1537 			return error;
1538 	} while(1);
1539 
1540 	gfs2_trans_add_bh(ip->i_gl, obh, 1);
1541 
1542 	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1543 	if (!leaf) {
1544 		brelse(obh);
1545 		return -ENOSPC;
1546 	}
1547 	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1548 	brelse(bh);
1549 	brelse(obh);
1550 
1551 	error = gfs2_meta_inode_buffer(ip, &bh);
1552 	if (error)
1553 		return error;
1554 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
1555 	gfs2_add_inode_blocks(&ip->i_inode, 1);
1556 	gfs2_dinode_out(ip, bh->b_data);
1557 	brelse(bh);
1558 	return 0;
1559 }
1560 
1561 /**
1562  * gfs2_dir_add - Add new filename into directory
1563  * @dip: The GFS2 inode
1564  * @filename: The new name
1565  * @inode: The inode number of the entry
1566  * @type: The type of the entry
1567  *
1568  * Returns: 0 on success, error code on failure
1569  */
1570 
1571 int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1572 		 const struct gfs2_inode *nip, unsigned type)
1573 {
1574 	struct gfs2_inode *ip = GFS2_I(inode);
1575 	struct buffer_head *bh;
1576 	struct gfs2_dirent *dent;
1577 	struct gfs2_leaf *leaf;
1578 	int error;
1579 
1580 	while(1) {
1581 		dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1582 					  &bh);
1583 		if (dent) {
1584 			if (IS_ERR(dent))
1585 				return PTR_ERR(dent);
1586 			dent = gfs2_init_dirent(inode, dent, name, bh);
1587 			gfs2_inum_out(nip, dent);
1588 			dent->de_type = cpu_to_be16(type);
1589 			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1590 				leaf = (struct gfs2_leaf *)bh->b_data;
1591 				be16_add_cpu(&leaf->lf_entries, 1);
1592 			}
1593 			brelse(bh);
1594 			error = gfs2_meta_inode_buffer(ip, &bh);
1595 			if (error)
1596 				break;
1597 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
1598 			ip->i_entries++;
1599 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1600 			gfs2_dinode_out(ip, bh->b_data);
1601 			brelse(bh);
1602 			error = 0;
1603 			break;
1604 		}
1605 		if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1606 			error = dir_make_exhash(inode);
1607 			if (error)
1608 				break;
1609 			continue;
1610 		}
1611 		error = dir_split_leaf(inode, name);
1612 		if (error == 0)
1613 			continue;
1614 		if (error < 0)
1615 			break;
1616 		if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
1617 			error = dir_double_exhash(ip);
1618 			if (error)
1619 				break;
1620 			error = dir_split_leaf(inode, name);
1621 			if (error < 0)
1622 				break;
1623 			if (error == 0)
1624 				continue;
1625 		}
1626 		error = dir_new_leaf(inode, name);
1627 		if (!error)
1628 			continue;
1629 		error = -ENOSPC;
1630 		break;
1631 	}
1632 	return error;
1633 }
1634 
1635 
1636 /**
1637  * gfs2_dir_del - Delete a directory entry
1638  * @dip: The GFS2 inode
1639  * @filename: The filename
1640  *
1641  * Returns: 0 on success, error code on failure
1642  */
1643 
1644 int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1645 {
1646 	struct gfs2_dirent *dent, *prev = NULL;
1647 	struct buffer_head *bh;
1648 	int error;
1649 
1650 	/* Returns _either_ the entry (if its first in block) or the
1651 	   previous entry otherwise */
1652 	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1653 	if (!dent) {
1654 		gfs2_consist_inode(dip);
1655 		return -EIO;
1656 	}
1657 	if (IS_ERR(dent)) {
1658 		gfs2_consist_inode(dip);
1659 		return PTR_ERR(dent);
1660 	}
1661 	/* If not first in block, adjust pointers accordingly */
1662 	if (gfs2_dirent_find(dent, name, NULL) == 0) {
1663 		prev = dent;
1664 		dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1665 	}
1666 
1667 	dirent_del(dip, bh, prev, dent);
1668 	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1669 		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1670 		u16 entries = be16_to_cpu(leaf->lf_entries);
1671 		if (!entries)
1672 			gfs2_consist_inode(dip);
1673 		leaf->lf_entries = cpu_to_be16(--entries);
1674 	}
1675 	brelse(bh);
1676 
1677 	error = gfs2_meta_inode_buffer(dip, &bh);
1678 	if (error)
1679 		return error;
1680 
1681 	if (!dip->i_entries)
1682 		gfs2_consist_inode(dip);
1683 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
1684 	dip->i_entries--;
1685 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1686 	gfs2_dinode_out(dip, bh->b_data);
1687 	brelse(bh);
1688 	mark_inode_dirty(&dip->i_inode);
1689 
1690 	return error;
1691 }
1692 
1693 /**
1694  * gfs2_dir_mvino - Change inode number of directory entry
1695  * @dip: The GFS2 inode
1696  * @filename:
1697  * @new_inode:
1698  *
1699  * This routine changes the inode number of a directory entry.  It's used
1700  * by rename to change ".." when a directory is moved.
1701  * Assumes a glock is held on dvp.
1702  *
1703  * Returns: errno
1704  */
1705 
1706 int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1707 		   const struct gfs2_inode *nip, unsigned int new_type)
1708 {
1709 	struct buffer_head *bh;
1710 	struct gfs2_dirent *dent;
1711 	int error;
1712 
1713 	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1714 	if (!dent) {
1715 		gfs2_consist_inode(dip);
1716 		return -EIO;
1717 	}
1718 	if (IS_ERR(dent))
1719 		return PTR_ERR(dent);
1720 
1721 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
1722 	gfs2_inum_out(nip, dent);
1723 	dent->de_type = cpu_to_be16(new_type);
1724 
1725 	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1726 		brelse(bh);
1727 		error = gfs2_meta_inode_buffer(dip, &bh);
1728 		if (error)
1729 			return error;
1730 		gfs2_trans_add_bh(dip->i_gl, bh, 1);
1731 	}
1732 
1733 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1734 	gfs2_dinode_out(dip, bh->b_data);
1735 	brelse(bh);
1736 	return 0;
1737 }
1738 
1739 /**
1740  * foreach_leaf - call a function for each leaf in a directory
1741  * @dip: the directory
1742  * @lc: the function to call for each each
1743  * @data: private data to pass to it
1744  *
1745  * Returns: errno
1746  */
1747 
1748 static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1749 {
1750 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1751 	struct buffer_head *bh;
1752 	struct gfs2_leaf *leaf;
1753 	u32 hsize, len;
1754 	u32 ht_offset, lp_offset, ht_offset_cur = -1;
1755 	u32 index = 0;
1756 	__be64 *lp;
1757 	u64 leaf_no;
1758 	int error = 0;
1759 
1760 	hsize = 1 << dip->i_depth;
1761 	if (hsize * sizeof(u64) != dip->i_disksize) {
1762 		gfs2_consist_inode(dip);
1763 		return -EIO;
1764 	}
1765 
1766 	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1767 	if (!lp)
1768 		return -ENOMEM;
1769 
1770 	while (index < hsize) {
1771 		lp_offset = index & (sdp->sd_hash_ptrs - 1);
1772 		ht_offset = index - lp_offset;
1773 
1774 		if (ht_offset_cur != ht_offset) {
1775 			error = gfs2_dir_read_data(dip, (char *)lp,
1776 						ht_offset * sizeof(__be64),
1777 						sdp->sd_hash_bsize, 1);
1778 			if (error != sdp->sd_hash_bsize) {
1779 				if (error >= 0)
1780 					error = -EIO;
1781 				goto out;
1782 			}
1783 			ht_offset_cur = ht_offset;
1784 		}
1785 
1786 		leaf_no = be64_to_cpu(lp[lp_offset]);
1787 		if (leaf_no) {
1788 			error = get_leaf(dip, leaf_no, &bh);
1789 			if (error)
1790 				goto out;
1791 			leaf = (struct gfs2_leaf *)bh->b_data;
1792 			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
1793 			brelse(bh);
1794 
1795 			error = lc(dip, index, len, leaf_no, data);
1796 			if (error)
1797 				goto out;
1798 
1799 			index = (index & ~(len - 1)) + len;
1800 		} else
1801 			index++;
1802 	}
1803 
1804 	if (index != hsize) {
1805 		gfs2_consist_inode(dip);
1806 		error = -EIO;
1807 	}
1808 
1809 out:
1810 	kfree(lp);
1811 
1812 	return error;
1813 }
1814 
/**
 * leaf_dealloc - Deallocate a directory leaf
 * @dip: the directory
 * @index: the hash table offset in the directory
 * @len: the number of pointers to this leaf
 * @leaf_no: the leaf number
 * @data: not used
 *
 * Frees every block in the chain that starts at @leaf_no (following
 * lf_next) and overwrites the @len hash table pointers starting at @index
 * with zeroes.  The chain is walked twice: once before the transaction to
 * build the resource group list and count the blocks, and once inside the
 * transaction to free them.
 *
 * Returns: errno
 */

static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
			u64 leaf_no, void *data)
{
	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
	struct gfs2_leaf *tmp_leaf;
	struct gfs2_rgrp_list rlist;
	struct buffer_head *bh, *dibh;
	u64 blk, nblk;
	unsigned int rg_blocks = 0, l_blocks = 0;
	char *ht;
	/* size: number of hash table bytes that refer to this leaf */
	unsigned int x, size = len * sizeof(u64);
	int error;

	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));

	/* Zero-filled replacement for this leaf's hash table pointers */
	ht = kzalloc(size, GFP_NOFS);
	if (!ht)
		return -ENOMEM;

	if (!gfs2_alloc_get(dip)) {
		error = -ENOMEM;
		goto out;
	}

	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out_put;

	error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
	if (error)
		goto out_qs;

	/*  Count the number of leaves  */

	for (blk = leaf_no; blk; blk = nblk) {
		error = get_leaf(dip, blk, &bh);
		if (error)
			goto out_rlist;
		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
		nblk = be64_to_cpu(tmp_leaf->lf_next);
		brelse(bh);

		/* Note which resource group this block belongs to */
		gfs2_rlist_add(sdp, &rlist, blk);
		l_blocks++;
	}

	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);

	/* Sum rd_length over the listed resource groups for the reservation */
	for (x = 0; x < rlist.rl_rgrps; x++) {
		struct gfs2_rgrpd *rgd;
		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
		rg_blocks += rgd->rd_length;
	}

	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (error)
		goto out_rlist;

	error = gfs2_trans_begin(sdp,
			rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
			RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
	if (error)
		goto out_rg_gunlock;

	/* Second walk of the chain: free each leaf block */
	for (blk = leaf_no; blk; blk = nblk) {
		error = get_leaf(dip, blk, &bh);
		if (error)
			goto out_end_trans;
		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
		nblk = be64_to_cpu(tmp_leaf->lf_next);
		brelse(bh);

		gfs2_free_meta(dip, blk, 1);
		gfs2_add_inode_blocks(&dip->i_inode, -1);
	}

	/* Zero the hash table pointers that referred to this leaf */
	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
	if (error != size) {
		/* A short write is reported as an I/O error */
		if (error >= 0)
			error = -EIO;
		goto out_end_trans;
	}

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		goto out_end_trans;

	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
	gfs2_dinode_out(dip, dibh->b_data);
	brelse(dibh);

	/* The success path falls through; teardown is in reverse order */
out_end_trans:
	gfs2_trans_end(sdp);
out_rg_gunlock:
	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
out_rlist:
	gfs2_rlist_free(&rlist);
	gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
out_qs:
	gfs2_quota_unhold(dip);
out_put:
	gfs2_alloc_put(dip);
out:
	kfree(ht);
	return error;
}
1932 
1933 /**
1934  * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1935  * @dip: the directory
1936  *
1937  * Dealloc all on-disk directory leaves to FREEMETA state
1938  * Change on-disk inode type to "regular file"
1939  *
1940  * Returns: errno
1941  */
1942 
1943 int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1944 {
1945 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1946 	struct buffer_head *bh;
1947 	int error;
1948 
1949 	/* Dealloc on-disk leaves to FREEMETA state */
1950 	error = foreach_leaf(dip, leaf_dealloc, NULL);
1951 	if (error)
1952 		return error;
1953 
1954 	/* Make this a regular file in case we crash.
1955 	   (We don't want to free these blocks a second time.)  */
1956 
1957 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1958 	if (error)
1959 		return error;
1960 
1961 	error = gfs2_meta_inode_buffer(dip, &bh);
1962 	if (!error) {
1963 		gfs2_trans_add_bh(dip->i_gl, bh, 1);
1964 		((struct gfs2_dinode *)bh->b_data)->di_mode =
1965 						cpu_to_be32(S_IFREG);
1966 		brelse(bh);
1967 	}
1968 
1969 	gfs2_trans_end(sdp);
1970 
1971 	return error;
1972 }
1973 
1974 /**
1975  * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1976  * @ip: the file being written to
1977  * @filname: the filename that's going to be added
1978  *
1979  * Returns: 1 if alloc required, 0 if not, -ve on error
1980  */
1981 
1982 int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1983 {
1984 	struct gfs2_dirent *dent;
1985 	struct buffer_head *bh;
1986 
1987 	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1988 	if (!dent) {
1989 		return 1;
1990 	}
1991 	if (IS_ERR(dent))
1992 		return PTR_ERR(dent);
1993 	brelse(bh);
1994 	return 0;
1995 }
1996 
1997