1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * dir.c
4 *
5 * Creates, reads, walks and deletes directory-nodes
6 *
7 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
8 *
9 * Portions of this code from linux/fs/ext3/dir.c
10 *
11 * Copyright (C) 1992, 1993, 1994, 1995
12 * Remy Card (card@masi.ibp.fr)
13 * Laboratoire MASI - Institut Blaise pascal
14 * Universite Pierre et Marie Curie (Paris VI)
15 *
16 * from
17 *
18 * linux/fs/minix/dir.c
19 *
20 * Copyright (C) 1991, 1992 Linus Torvalds
21 */
22
23 #include <linux/fs.h>
24 #include <linux/types.h>
25 #include <linux/slab.h>
26 #include <linux/highmem.h>
27 #include <linux/quotaops.h>
28 #include <linux/sort.h>
29 #include <linux/iversion.h>
30
31 #include <cluster/masklog.h>
32
33 #include "ocfs2.h"
34
35 #include "alloc.h"
36 #include "blockcheck.h"
37 #include "dir.h"
38 #include "dlmglue.h"
39 #include "extent_map.h"
40 #include "file.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "namei.h"
44 #include "suballoc.h"
45 #include "super.h"
46 #include "sysfile.h"
47 #include "uptodate.h"
48 #include "ocfs2_trace.h"
49
50 #include "buffer_head_io.h"
51
52 #define NAMEI_RA_CHUNKS 2
53 #define NAMEI_RA_BLOCKS 4
54 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
55
56 static int ocfs2_do_extend_dir(struct super_block *sb,
57 handle_t *handle,
58 struct inode *dir,
59 struct buffer_head *parent_fe_bh,
60 struct ocfs2_alloc_context *data_ac,
61 struct ocfs2_alloc_context *meta_ac,
62 struct buffer_head **new_bh);
63 static int ocfs2_dir_indexed(struct inode *inode);
64
65 /*
66 * These are distinct checks because future versions of the file system will
67 * want to have a trailing dirent structure independent of indexing.
68 */
ocfs2_supports_dir_trailer(struct inode * dir)69 static int ocfs2_supports_dir_trailer(struct inode *dir)
70 {
71 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
72
73 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
74 return 0;
75
76 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
77 }
78
79 /*
80 * "new' here refers to the point at which we're creating a new
81 * directory via "mkdir()", but also when we're expanding an inline
82 * directory. In either case, we don't yet have the indexing bit set
83 * on the directory, so the standard checks will fail in when metaecc
84 * is turned off. Only directory-initialization type functions should
85 * use this then. Everything else wants ocfs2_supports_dir_trailer()
86 */
ocfs2_new_dir_wants_trailer(struct inode * dir)87 static int ocfs2_new_dir_wants_trailer(struct inode *dir)
88 {
89 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
90
91 return ocfs2_meta_ecc(osb) ||
92 ocfs2_supports_indexed_dirs(osb);
93 }
94
ocfs2_dir_trailer_blk_off(struct super_block * sb)95 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
96 {
97 return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
98 }
99
100 #define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
101
102 /* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
103 * them more consistent? */
ocfs2_dir_trailer_from_size(int blocksize,void * data)104 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
105 void *data)
106 {
107 char *p = data;
108
109 p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
110 return (struct ocfs2_dir_block_trailer *)p;
111 }
112
113 /*
114 * XXX: This is executed once on every dirent. We should consider optimizing
115 * it.
116 */
ocfs2_skip_dir_trailer(struct inode * dir,struct ocfs2_dir_entry * de,unsigned long offset,unsigned long blklen)117 static int ocfs2_skip_dir_trailer(struct inode *dir,
118 struct ocfs2_dir_entry *de,
119 unsigned long offset,
120 unsigned long blklen)
121 {
122 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
123
124 if (!ocfs2_supports_dir_trailer(dir))
125 return 0;
126
127 if (offset != toff)
128 return 0;
129
130 return 1;
131 }
132
/*
 * Fill in the trailer of directory block 'bh': signature, compat record
 * length (the size of the trailer itself), owning dinode, the block's own
 * number and the caller-supplied free record length.
 */
static void ocfs2_init_dir_trailer(struct inode *inode,
				   struct buffer_head *bh, u16 rec_len)
{
	struct ocfs2_dir_block_trailer *t =
		ocfs2_trailer_from_bh(bh, inode->i_sb);

	strcpy(t->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
	t->db_compat_rec_len = cpu_to_le16(sizeof(*t));
	t->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
	t->db_blkno = cpu_to_le64(bh->b_blocknr);
	t->db_free_rec_len = cpu_to_le16(rec_len);
}
146 /*
147 * Link an unindexed block with a dir trailer structure into the index free
148 * list. This function will modify dirdata_bh, but assumes you've already
149 * passed it to the journal.
150 */
ocfs2_dx_dir_link_trailer(struct inode * dir,handle_t * handle,struct buffer_head * dx_root_bh,struct buffer_head * dirdata_bh)151 static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
152 struct buffer_head *dx_root_bh,
153 struct buffer_head *dirdata_bh)
154 {
155 int ret;
156 struct ocfs2_dx_root_block *dx_root;
157 struct ocfs2_dir_block_trailer *trailer;
158
159 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
160 OCFS2_JOURNAL_ACCESS_WRITE);
161 if (ret) {
162 mlog_errno(ret);
163 goto out;
164 }
165 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
166 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
167
168 trailer->db_free_next = dx_root->dr_free_blk;
169 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
170
171 ocfs2_journal_dirty(handle, dx_root_bh);
172
173 out:
174 return ret;
175 }
176
ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result * res)177 static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
178 {
179 return res->dl_prev_leaf_bh == NULL;
180 }
181
ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result * res)182 void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
183 {
184 brelse(res->dl_dx_root_bh);
185 brelse(res->dl_leaf_bh);
186 brelse(res->dl_dx_leaf_bh);
187 brelse(res->dl_prev_leaf_bh);
188 }
189
ocfs2_dir_indexed(struct inode * inode)190 static int ocfs2_dir_indexed(struct inode *inode)
191 {
192 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
193 return 1;
194 return 0;
195 }
196
ocfs2_dx_root_inline(struct ocfs2_dx_root_block * dx_root)197 static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
198 {
199 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
200 }
201
/*
 * Hashing code adapted from ext3
 */
#define DELTA 0x9E3779B9

/*
 * Mix the four key words in[0..3] into the running hash state with
 * 16 TEA rounds. Only buf[0] and buf[1] are read and updated; the
 * result of the rounds is added onto their previous values.
 */
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
	__u32 x = buf[0], y = buf[1];
	__u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
	__u32 sum = 0;
	int round;

	for (round = 0; round < 16; round++) {
		sum += DELTA;
		x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
		y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
	}

	buf[0] += x;
	buf[1] += y;
}
223
/*
 * Pack up to num * 4 bytes of 'msg' into 'num' 32-bit words at 'buf'.
 *
 * Each word is built by shifting bytes in from the right on top of a
 * padding pattern made of the low byte(s) of the original 'len'. A
 * partially filled word is emitted next, and any words left over are
 * set to the padding pattern.
 */
static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
{
	__u32 fill, word;
	int pos;
	int out = 0;	/* words written to buf[] so far */

	fill = (__u32)len | ((__u32)len << 8);
	fill |= fill << 16;

	word = fill;
	if (len > num * 4)
		len = num * 4;

	for (pos = 0; pos < len; pos++) {
		if ((pos & 3) == 0)
			word = fill;
		word = msg[pos] + (word << 8);
		if ((pos & 3) == 3) {
			buf[out++] = word;
			word = fill;
		}
	}

	/* Flush the word in progress, then pad out the remaining slots. */
	if (out < num)
		buf[out++] = word;
	while (out < num)
		buf[out++] = fill;
}
250
ocfs2_dx_dir_name_hash(struct inode * dir,const char * name,int len,struct ocfs2_dx_hinfo * hinfo)251 static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
252 struct ocfs2_dx_hinfo *hinfo)
253 {
254 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
255 const char *p;
256 __u32 in[8], buf[4];
257
258 /*
259 * XXX: Is this really necessary, if the index is never looked
260 * at by readdir? Is a hash value of '0' a bad idea?
261 */
262 if ((len == 1 && !strncmp(".", name, 1)) ||
263 (len == 2 && !strncmp("..", name, 2))) {
264 buf[0] = buf[1] = 0;
265 goto out;
266 }
267
268 #ifdef OCFS2_DEBUG_DX_DIRS
269 /*
270 * This makes it very easy to debug indexing problems. We
271 * should never allow this to be selected without hand editing
272 * this file though.
273 */
274 buf[0] = buf[1] = len;
275 goto out;
276 #endif
277
278 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
279
280 p = name;
281 while (len > 0) {
282 str2hashbuf(p, len, in, 4);
283 TEA_transform(buf, in);
284 len -= 16;
285 p += 16;
286 }
287
288 out:
289 hinfo->major_hash = buf[0];
290 hinfo->minor_hash = buf[1];
291 }
292
293 /*
294 * bh passed here can be an inode block or a dir data block, depending
295 * on the inode inline data flag.
296 */
ocfs2_check_dir_entry(struct inode * dir,struct ocfs2_dir_entry * de,struct buffer_head * bh,char * buf,unsigned int size,unsigned long offset)297 static int ocfs2_check_dir_entry(struct inode *dir,
298 struct ocfs2_dir_entry *de,
299 struct buffer_head *bh,
300 char *buf,
301 unsigned int size,
302 unsigned long offset)
303 {
304 const char *error_msg = NULL;
305 const int rlen = le16_to_cpu(de->rec_len);
306 const unsigned long next_offset = ((char *) de - buf) + rlen;
307
308 if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
309 error_msg = "rec_len is smaller than minimal";
310 else if (unlikely(rlen % 4 != 0))
311 error_msg = "rec_len % 4 != 0";
312 else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
313 error_msg = "rec_len is too small for name_len";
314 else if (unlikely(next_offset > size))
315 error_msg = "directory entry overrun";
316 else if (unlikely(next_offset > size - OCFS2_DIR_REC_LEN(1)) &&
317 next_offset != size)
318 error_msg = "directory entry too close to end";
319
320 if (unlikely(error_msg != NULL))
321 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
322 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
323 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
324 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
325 de->name_len);
326
327 return error_msg == NULL ? 1 : 0;
328 }
329
ocfs2_match(int len,const char * const name,struct ocfs2_dir_entry * de)330 static inline int ocfs2_match(int len,
331 const char * const name,
332 struct ocfs2_dir_entry *de)
333 {
334 if (len != de->name_len)
335 return 0;
336 if (!de->inode)
337 return 0;
338 return !memcmp(name, de->name, len);
339 }
340
341 /*
342 * Returns 0 if not found, -1 on failure, and 1 on success
343 */
ocfs2_search_dirblock(struct buffer_head * bh,struct inode * dir,const char * name,int namelen,unsigned long offset,char * first_de,unsigned int bytes,struct ocfs2_dir_entry ** res_dir)344 static inline int ocfs2_search_dirblock(struct buffer_head *bh,
345 struct inode *dir,
346 const char *name, int namelen,
347 unsigned long offset,
348 char *first_de,
349 unsigned int bytes,
350 struct ocfs2_dir_entry **res_dir)
351 {
352 struct ocfs2_dir_entry *de;
353 char *dlimit, *de_buf;
354 int de_len;
355 int ret = 0;
356
357 de_buf = first_de;
358 dlimit = de_buf + bytes;
359
360 while (de_buf < dlimit - OCFS2_DIR_MEMBER_LEN) {
361 /* this code is executed quadratically often */
362 /* do minimal checking `by hand' */
363
364 de = (struct ocfs2_dir_entry *) de_buf;
365
366 if (de->name + namelen <= dlimit &&
367 ocfs2_match(namelen, name, de)) {
368 /* found a match - just to be sure, do a full check */
369 if (!ocfs2_check_dir_entry(dir, de, bh, first_de,
370 bytes, offset)) {
371 ret = -1;
372 goto bail;
373 }
374 *res_dir = de;
375 ret = 1;
376 goto bail;
377 }
378
379 /* prevent looping on a bad block */
380 de_len = le16_to_cpu(de->rec_len);
381 if (de_len <= 0) {
382 ret = -1;
383 goto bail;
384 }
385
386 de_buf += de_len;
387 offset += de_len;
388 }
389
390 bail:
391 trace_ocfs2_search_dirblock(ret);
392 return ret;
393 }
394
ocfs2_find_entry_id(const char * name,int namelen,struct inode * dir,struct ocfs2_dir_entry ** res_dir)395 static struct buffer_head *ocfs2_find_entry_id(const char *name,
396 int namelen,
397 struct inode *dir,
398 struct ocfs2_dir_entry **res_dir)
399 {
400 int ret, found;
401 struct buffer_head *di_bh = NULL;
402 struct ocfs2_dinode *di;
403 struct ocfs2_inline_data *data;
404
405 ret = ocfs2_read_inode_block(dir, &di_bh);
406 if (ret) {
407 mlog_errno(ret);
408 goto out;
409 }
410
411 di = (struct ocfs2_dinode *)di_bh->b_data;
412 data = &di->id2.i_data;
413
414 found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
415 data->id_data, i_size_read(dir), res_dir);
416 if (found == 1)
417 return di_bh;
418
419 brelse(di_bh);
420 out:
421 return NULL;
422 }
423
ocfs2_validate_dir_block(struct super_block * sb,struct buffer_head * bh)424 static int ocfs2_validate_dir_block(struct super_block *sb,
425 struct buffer_head *bh)
426 {
427 int rc;
428 struct ocfs2_dir_block_trailer *trailer =
429 ocfs2_trailer_from_bh(bh, sb);
430
431
432 /*
433 * We don't validate dirents here, that's handled
434 * in-place when the code walks them.
435 */
436 trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
437
438 BUG_ON(!buffer_uptodate(bh));
439
440 /*
441 * If the ecc fails, we return the error but otherwise
442 * leave the filesystem running. We know any error is
443 * local to this block.
444 *
445 * Note that we are safe to call this even if the directory
446 * doesn't have a trailer. Filesystems without metaecc will do
447 * nothing, and filesystems with it will have one.
448 */
449 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
450 if (rc)
451 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
452 (unsigned long long)bh->b_blocknr);
453
454 return rc;
455 }
456
457 /*
458 * Validate a directory trailer.
459 *
460 * We check the trailer here rather than in ocfs2_validate_dir_block()
461 * because that function doesn't have the inode to test.
462 */
ocfs2_check_dir_trailer(struct inode * dir,struct buffer_head * bh)463 static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
464 {
465 int rc = 0;
466 struct ocfs2_dir_block_trailer *trailer;
467
468 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
469 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
470 rc = ocfs2_error(dir->i_sb,
471 "Invalid dirblock #%llu: signature = %.*s\n",
472 (unsigned long long)bh->b_blocknr, 7,
473 trailer->db_signature);
474 goto out;
475 }
476 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
477 rc = ocfs2_error(dir->i_sb,
478 "Directory block #%llu has an invalid db_blkno of %llu\n",
479 (unsigned long long)bh->b_blocknr,
480 (unsigned long long)le64_to_cpu(trailer->db_blkno));
481 goto out;
482 }
483 if (le64_to_cpu(trailer->db_parent_dinode) !=
484 OCFS2_I(dir)->ip_blkno) {
485 rc = ocfs2_error(dir->i_sb,
486 "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
487 (unsigned long long)bh->b_blocknr,
488 (unsigned long long)OCFS2_I(dir)->ip_blkno,
489 (unsigned long long)le64_to_cpu(trailer->db_blkno));
490 goto out;
491 }
492 out:
493 return rc;
494 }
495
496 /*
497 * This function forces all errors to -EIO for consistency with its
498 * predecessor, ocfs2_bread(). We haven't audited what returning the
499 * real error codes would do to callers. We log the real codes with
500 * mlog_errno() before we squash them.
501 */
ocfs2_read_dir_block(struct inode * inode,u64 v_block,struct buffer_head ** bh,int flags)502 static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
503 struct buffer_head **bh, int flags)
504 {
505 int rc = 0;
506 struct buffer_head *tmp = *bh;
507
508 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
509 ocfs2_validate_dir_block);
510 if (rc) {
511 mlog_errno(rc);
512 goto out;
513 }
514
515 if (!(flags & OCFS2_BH_READAHEAD) &&
516 ocfs2_supports_dir_trailer(inode)) {
517 rc = ocfs2_check_dir_trailer(inode, tmp);
518 if (rc) {
519 if (!*bh)
520 brelse(tmp);
521 mlog_errno(rc);
522 goto out;
523 }
524 }
525
526 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
527 if (!*bh)
528 *bh = tmp;
529
530 out:
531 return rc ? -EIO : 0;
532 }
533
534 /*
535 * Read the block at 'phys' which belongs to this directory
536 * inode. This function does no virtual->physical block translation -
537 * what's passed in is assumed to be a valid directory block.
538 */
ocfs2_read_dir_block_direct(struct inode * dir,u64 phys,struct buffer_head ** bh)539 static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
540 struct buffer_head **bh)
541 {
542 int ret;
543 struct buffer_head *tmp = *bh;
544
545 ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
546 ocfs2_validate_dir_block);
547 if (ret) {
548 mlog_errno(ret);
549 goto out;
550 }
551
552 if (ocfs2_supports_dir_trailer(dir)) {
553 ret = ocfs2_check_dir_trailer(dir, tmp);
554 if (ret) {
555 if (!*bh)
556 brelse(tmp);
557 mlog_errno(ret);
558 goto out;
559 }
560 }
561
562 if (!ret && !*bh)
563 *bh = tmp;
564 out:
565 return ret;
566 }
567
ocfs2_validate_dx_root(struct super_block * sb,struct buffer_head * bh)568 static int ocfs2_validate_dx_root(struct super_block *sb,
569 struct buffer_head *bh)
570 {
571 int ret;
572 struct ocfs2_dx_root_block *dx_root;
573
574 BUG_ON(!buffer_uptodate(bh));
575
576 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
577
578 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
579 if (ret) {
580 mlog(ML_ERROR,
581 "Checksum failed for dir index root block %llu\n",
582 (unsigned long long)bh->b_blocknr);
583 return ret;
584 }
585
586 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
587 ret = ocfs2_error(sb,
588 "Dir Index Root # %llu has bad signature %.*s\n",
589 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
590 7, dx_root->dr_signature);
591 }
592
593 return ret;
594 }
595
ocfs2_read_dx_root(struct inode * dir,struct ocfs2_dinode * di,struct buffer_head ** dx_root_bh)596 static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
597 struct buffer_head **dx_root_bh)
598 {
599 int ret;
600 u64 blkno = le64_to_cpu(di->i_dx_root);
601 struct buffer_head *tmp = *dx_root_bh;
602
603 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
604 ocfs2_validate_dx_root);
605
606 /* If ocfs2_read_block() got us a new bh, pass it up. */
607 if (!ret && !*dx_root_bh)
608 *dx_root_bh = tmp;
609
610 return ret;
611 }
612
ocfs2_validate_dx_leaf(struct super_block * sb,struct buffer_head * bh)613 static int ocfs2_validate_dx_leaf(struct super_block *sb,
614 struct buffer_head *bh)
615 {
616 int ret;
617 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
618
619 BUG_ON(!buffer_uptodate(bh));
620
621 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
622 if (ret) {
623 mlog(ML_ERROR,
624 "Checksum failed for dir index leaf block %llu\n",
625 (unsigned long long)bh->b_blocknr);
626 return ret;
627 }
628
629 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
630 ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
631 7, dx_leaf->dl_signature);
632 }
633
634 return ret;
635 }
636
/*
 * Read and validate the index leaf block at 'blkno'. *dx_leaf_bh is
 * only written when it was NULL on entry and the read succeeded.
 */
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
			      struct buffer_head **dx_leaf_bh)
{
	struct buffer_head *blk = *dx_leaf_bh;
	int ret;

	ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &blk,
			       ocfs2_validate_dx_leaf);
	if (ret)
		return ret;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*dx_leaf_bh)
		*dx_leaf_bh = blk;

	return ret;
}
652
653 /*
654 * Read a series of dx_leaf blocks. This expects all buffer_head
655 * pointers to be NULL on function entry.
656 */
ocfs2_read_dx_leaves(struct inode * dir,u64 start,int num,struct buffer_head ** dx_leaf_bhs)657 static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
658 struct buffer_head **dx_leaf_bhs)
659 {
660 int ret;
661
662 ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
663 ocfs2_validate_dx_leaf);
664 if (ret)
665 mlog_errno(ret);
666
667 return ret;
668 }
669
ocfs2_find_entry_el(const char * name,int namelen,struct inode * dir,struct ocfs2_dir_entry ** res_dir)670 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
671 struct inode *dir,
672 struct ocfs2_dir_entry **res_dir)
673 {
674 struct super_block *sb;
675 struct buffer_head *bh_use[NAMEI_RA_SIZE];
676 struct buffer_head *bh, *ret = NULL;
677 unsigned long start, block, b;
678 int ra_max = 0; /* Number of bh's in the readahead
679 buffer, bh_use[] */
680 int ra_ptr = 0; /* Current index into readahead
681 buffer */
682 int num = 0;
683 int nblocks, i;
684
685 sb = dir->i_sb;
686
687 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
688 start = OCFS2_I(dir)->ip_dir_start_lookup;
689 if (start >= nblocks)
690 start = 0;
691 block = start;
692
693 restart:
694 do {
695 /*
696 * We deal with the read-ahead logic here.
697 */
698 if (ra_ptr >= ra_max) {
699 /* Refill the readahead buffer */
700 ra_ptr = 0;
701 b = block;
702 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
703 /*
704 * Terminate if we reach the end of the
705 * directory and must wrap, or if our
706 * search has finished at this block.
707 */
708 if (b >= nblocks || (num && block == start)) {
709 bh_use[ra_max] = NULL;
710 break;
711 }
712 num++;
713
714 bh = NULL;
715 ocfs2_read_dir_block(dir, b++, &bh,
716 OCFS2_BH_READAHEAD);
717 bh_use[ra_max] = bh;
718 }
719 }
720 if ((bh = bh_use[ra_ptr++]) == NULL)
721 goto next;
722 if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
723 /* read error, skip block & hope for the best.
724 * ocfs2_read_dir_block() has released the bh. */
725 mlog(ML_ERROR, "reading directory %llu, "
726 "offset %lu\n",
727 (unsigned long long)OCFS2_I(dir)->ip_blkno,
728 block);
729 goto next;
730 }
731 i = ocfs2_search_dirblock(bh, dir, name, namelen,
732 block << sb->s_blocksize_bits,
733 bh->b_data, sb->s_blocksize,
734 res_dir);
735 if (i == 1) {
736 OCFS2_I(dir)->ip_dir_start_lookup = block;
737 ret = bh;
738 goto cleanup_and_exit;
739 } else {
740 brelse(bh);
741 if (i < 0)
742 goto cleanup_and_exit;
743 }
744 next:
745 if (++block >= nblocks)
746 block = 0;
747 } while (block != start);
748
749 /*
750 * If the directory has grown while we were searching, then
751 * search the last part of the directory before giving up.
752 */
753 block = nblocks;
754 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
755 if (block < nblocks) {
756 start = 0;
757 goto restart;
758 }
759
760 cleanup_and_exit:
761 /* Clean up the read-ahead blocks */
762 for (; ra_ptr < ra_max; ra_ptr++)
763 brelse(bh_use[ra_ptr]);
764
765 trace_ocfs2_find_entry_el(ret);
766 return ret;
767 }
768
/*
 * Find the extent record in the index btree that covers 'major_hash'.
 *
 * If 'el' is not a leaf list, the leaf extent block is located with
 * ocfs2_find_leaf() first. The records are then scanned from last to
 * first for the first one whose e_cpos is <= major_hash.
 *
 * On success 0 is returned and any non-NULL out parameter is filled
 * from the record: *ret_cpos (e_cpos), *ret_phys_blkno (e_blkno) and
 * *ret_clen (e_leaf_clusters). On failure the error from
 * ocfs2_find_leaf() or ocfs2_error() is returned.
 */
static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
				   struct ocfs2_extent_list *el,
				   u32 major_hash,
				   u32 *ret_cpos,
				   u64 *ret_phys_blkno,
				   unsigned int *ret_clen)
{
	int ret = 0, i, found;
	struct buffer_head *eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_rec *rec = NULL;

	if (el->l_tree_depth) {
		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
				      &eb_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;

		if (el->l_tree_depth) {
			ret = ocfs2_error(inode->i_sb,
					  "Inode %lu has non zero tree depth in btree tree block %llu\n",
					  inode->i_ino,
					  (unsigned long long)eb_bh->b_blocknr);
			goto out;
		}
	}

	found = 0;
	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
		rec = &el->l_recs[i];

		if (le32_to_cpu(rec->e_cpos) <= major_hash) {
			found = 1;
			break;
		}
	}

	if (!found) {
		/*
		 * 'rec' is still NULL when the list holds no records at
		 * all (l_next_free_rec == 0); it must not be dereferenced
		 * for the error message in that case.
		 */
		if (rec)
			ret = ocfs2_error(inode->i_sb,
					  "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
					  inode->i_ino,
					  le32_to_cpu(rec->e_cpos),
					  ocfs2_rec_clusters(el, rec));
		else
			ret = ocfs2_error(inode->i_sb,
					  "Inode %lu has no extent records in btree\n",
					  inode->i_ino);
		goto out;
	}

	if (ret_phys_blkno)
		*ret_phys_blkno = le64_to_cpu(rec->e_blkno);
	if (ret_cpos)
		*ret_cpos = le32_to_cpu(rec->e_cpos);
	if (ret_clen)
		*ret_clen = le16_to_cpu(rec->e_leaf_clusters);

out:
	brelse(eb_bh);
	return ret;
}
831
832 /*
833 * Returns the block index, from the start of the cluster which this
834 * hash belongs too.
835 */
__ocfs2_dx_dir_hash_idx(struct ocfs2_super * osb,u32 minor_hash)836 static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
837 u32 minor_hash)
838 {
839 return minor_hash & osb->osb_dx_mask;
840 }
841
ocfs2_dx_dir_hash_idx(struct ocfs2_super * osb,struct ocfs2_dx_hinfo * hinfo)842 static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
843 struct ocfs2_dx_hinfo *hinfo)
844 {
845 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
846 }
847
/*
 * Translate a name hash into the physical index block that should hold
 * its entry.
 *
 * The major hash selects an extent record and a cluster within it (the
 * last cluster of the record if the hash lies beyond the record's
 * range); the minor hash then selects the block within that cluster.
 * Non-NULL out parameters receive the cluster position and the
 * physical block number. Returns 0 or the lookup error.
 */
static int ocfs2_dx_dir_lookup(struct inode *inode,
			       struct ocfs2_extent_list *el,
			       struct ocfs2_dx_hinfo *hinfo,
			       u32 *ret_cpos,
			       u64 *ret_phys_blkno)
{
	u32 major = hinfo->major_hash;
	unsigned int cend, clen;
	u32 cpos;
	u64 blkno;
	int ret;

	ret = ocfs2_dx_dir_lookup_rec(inode, el, major, &cpos, &blkno,
				      &clen);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	cend = cpos + clen;
	if (major < cend) {
		blkno += ocfs2_clusters_to_blocks(inode->i_sb,
						  major - cpos);
		cpos = major;
	} else {
		/* We want the last cluster */
		blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
		cpos += clen - 1;
	}

	/*
	 * We now have the cluster which should hold our entry. To
	 * find the exact block from the start of the cluster to
	 * search, we take the lower bits of the hash.
	 */
	blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);

	if (ret_phys_blkno)
		*ret_phys_blkno = blkno;
	if (ret_cpos)
		*ret_cpos = cpos;

	return ret;
}
894
ocfs2_dx_dir_search(const char * name,int namelen,struct inode * dir,struct ocfs2_dx_root_block * dx_root,struct ocfs2_dir_lookup_result * res)895 static int ocfs2_dx_dir_search(const char *name, int namelen,
896 struct inode *dir,
897 struct ocfs2_dx_root_block *dx_root,
898 struct ocfs2_dir_lookup_result *res)
899 {
900 int ret, i, found;
901 u64 phys;
902 struct buffer_head *dx_leaf_bh = NULL;
903 struct ocfs2_dx_leaf *dx_leaf;
904 struct ocfs2_dx_entry *dx_entry = NULL;
905 struct buffer_head *dir_ent_bh = NULL;
906 struct ocfs2_dir_entry *dir_ent = NULL;
907 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
908 struct ocfs2_extent_list *dr_el;
909 struct ocfs2_dx_entry_list *entry_list;
910
911 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
912
913 if (ocfs2_dx_root_inline(dx_root)) {
914 entry_list = &dx_root->dr_entries;
915 goto search;
916 }
917
918 dr_el = &dx_root->dr_list;
919
920 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
921 if (ret) {
922 mlog_errno(ret);
923 goto out;
924 }
925
926 trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
927 namelen, name, hinfo->major_hash,
928 hinfo->minor_hash, (unsigned long long)phys);
929
930 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
931 if (ret) {
932 mlog_errno(ret);
933 goto out;
934 }
935
936 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
937
938 trace_ocfs2_dx_dir_search_leaf_info(
939 le16_to_cpu(dx_leaf->dl_list.de_num_used),
940 le16_to_cpu(dx_leaf->dl_list.de_count));
941
942 entry_list = &dx_leaf->dl_list;
943
944 search:
945 /*
946 * Empty leaf is legal, so no need to check for that.
947 */
948 found = 0;
949 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
950 dx_entry = &entry_list->de_entries[i];
951
952 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
953 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
954 continue;
955
956 /*
957 * Search unindexed leaf block now. We're not
958 * guaranteed to find anything.
959 */
960 ret = ocfs2_read_dir_block_direct(dir,
961 le64_to_cpu(dx_entry->dx_dirent_blk),
962 &dir_ent_bh);
963 if (ret) {
964 mlog_errno(ret);
965 goto out;
966 }
967
968 /*
969 * XXX: We should check the unindexed block here,
970 * before using it.
971 */
972
973 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
974 0, dir_ent_bh->b_data,
975 dir->i_sb->s_blocksize, &dir_ent);
976 if (found == 1)
977 break;
978
979 if (found == -1) {
980 /* This means we found a bad directory entry. */
981 ret = -EIO;
982 mlog_errno(ret);
983 goto out;
984 }
985
986 brelse(dir_ent_bh);
987 dir_ent_bh = NULL;
988 }
989
990 if (found <= 0) {
991 ret = -ENOENT;
992 goto out;
993 }
994
995 res->dl_leaf_bh = dir_ent_bh;
996 res->dl_entry = dir_ent;
997 res->dl_dx_leaf_bh = dx_leaf_bh;
998 res->dl_dx_entry = dx_entry;
999
1000 ret = 0;
1001 out:
1002 if (ret) {
1003 brelse(dx_leaf_bh);
1004 brelse(dir_ent_bh);
1005 }
1006 return ret;
1007 }
1008
ocfs2_find_entry_dx(const char * name,int namelen,struct inode * dir,struct ocfs2_dir_lookup_result * lookup)1009 static int ocfs2_find_entry_dx(const char *name, int namelen,
1010 struct inode *dir,
1011 struct ocfs2_dir_lookup_result *lookup)
1012 {
1013 int ret;
1014 struct buffer_head *di_bh = NULL;
1015 struct ocfs2_dinode *di;
1016 struct buffer_head *dx_root_bh = NULL;
1017 struct ocfs2_dx_root_block *dx_root;
1018
1019 ret = ocfs2_read_inode_block(dir, &di_bh);
1020 if (ret) {
1021 mlog_errno(ret);
1022 goto out;
1023 }
1024
1025 di = (struct ocfs2_dinode *)di_bh->b_data;
1026
1027 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1028 if (ret) {
1029 mlog_errno(ret);
1030 goto out;
1031 }
1032 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1033
1034 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1035 if (ret) {
1036 if (ret != -ENOENT)
1037 mlog_errno(ret);
1038 goto out;
1039 }
1040
1041 lookup->dl_dx_root_bh = dx_root_bh;
1042 dx_root_bh = NULL;
1043 out:
1044 brelse(di_bh);
1045 brelse(dx_root_bh);
1046 return ret;
1047 }
1048
1049 /*
1050 * Try to find an entry of the provided name within 'dir'.
1051 *
1052 * If nothing was found, -ENOENT is returned. Otherwise, zero is
1053 * returned and the struct 'res' will contain information useful to
1054 * other directory manipulation functions.
1055 *
1056 * Caller can NOT assume anything about the contents of the
1057 * buffer_heads - they are passed back only so that it can be passed
1058 * into any one of the manipulation functions (add entry, delete
1059 * entry, etc). As an example, bh in the extent directory case is a
1060 * data block, in the inline-data case it actually points to an inode,
1061 * in the indexed directory case, multiple buffers are involved.
1062 */
ocfs2_find_entry(const char * name,int namelen,struct inode * dir,struct ocfs2_dir_lookup_result * lookup)1063 int ocfs2_find_entry(const char *name, int namelen,
1064 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
1065 {
1066 struct buffer_head *bh;
1067 struct ocfs2_dir_entry *res_dir = NULL;
1068
1069 if (ocfs2_dir_indexed(dir))
1070 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1071
1072 /*
1073 * The unindexed dir code only uses part of the lookup
1074 * structure, so there's no reason to push it down further
1075 * than this.
1076 */
1077 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1078 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1079 else
1080 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1081
1082 if (bh == NULL)
1083 return -ENOENT;
1084
1085 lookup->dl_leaf_bh = bh;
1086 lookup->dl_entry = res_dir;
1087 return 0;
1088 }
1089
1090 /*
1091 * Update inode number and type of a previously found directory entry.
1092 */
ocfs2_update_entry(struct inode * dir,handle_t * handle,struct ocfs2_dir_lookup_result * res,struct inode * new_entry_inode)1093 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1094 struct ocfs2_dir_lookup_result *res,
1095 struct inode *new_entry_inode)
1096 {
1097 int ret;
1098 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1099 struct ocfs2_dir_entry *de = res->dl_entry;
1100 struct buffer_head *de_bh = res->dl_leaf_bh;
1101
1102 /*
1103 * The same code works fine for both inline-data and extent
1104 * based directories, so no need to split this up. The only
1105 * difference is the journal_access function.
1106 */
1107
1108 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1109 access = ocfs2_journal_access_di;
1110
1111 ret = access(handle, INODE_CACHE(dir), de_bh,
1112 OCFS2_JOURNAL_ACCESS_WRITE);
1113 if (ret) {
1114 mlog_errno(ret);
1115 goto out;
1116 }
1117
1118 de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
1119 ocfs2_set_de_type(de, new_entry_inode->i_mode);
1120
1121 ocfs2_journal_dirty(handle, de_bh);
1122
1123 out:
1124 return ret;
1125 }
1126
1127 /*
1128 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1129 * previous entry
1130 */
__ocfs2_delete_entry(handle_t * handle,struct inode * dir,struct ocfs2_dir_entry * de_del,struct buffer_head * bh,char * first_de,unsigned int bytes)1131 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1132 struct ocfs2_dir_entry *de_del,
1133 struct buffer_head *bh, char *first_de,
1134 unsigned int bytes)
1135 {
1136 struct ocfs2_dir_entry *de, *pde;
1137 int i, status = -ENOENT;
1138 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1139
1140 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1141 access = ocfs2_journal_access_di;
1142
1143 i = 0;
1144 pde = NULL;
1145 de = (struct ocfs2_dir_entry *) first_de;
1146 while (i < bytes) {
1147 if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, i)) {
1148 status = -EIO;
1149 mlog_errno(status);
1150 goto bail;
1151 }
1152 if (de == de_del) {
1153 status = access(handle, INODE_CACHE(dir), bh,
1154 OCFS2_JOURNAL_ACCESS_WRITE);
1155 if (status < 0) {
1156 status = -EIO;
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160 if (pde)
1161 le16_add_cpu(&pde->rec_len,
1162 le16_to_cpu(de->rec_len));
1163 de->inode = 0;
1164 inode_inc_iversion(dir);
1165 ocfs2_journal_dirty(handle, bh);
1166 goto bail;
1167 }
1168 i += le16_to_cpu(de->rec_len);
1169 pde = de;
1170 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1171 }
1172 bail:
1173 return status;
1174 }
1175
ocfs2_figure_dirent_hole(struct ocfs2_dir_entry * de)1176 static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1177 {
1178 unsigned int hole;
1179
1180 if (le64_to_cpu(de->inode) == 0)
1181 hole = le16_to_cpu(de->rec_len);
1182 else
1183 hole = le16_to_cpu(de->rec_len) -
1184 OCFS2_DIR_REC_LEN(de->name_len);
1185
1186 return hole;
1187 }
1188
ocfs2_find_max_rec_len(struct super_block * sb,struct buffer_head * dirblock_bh)1189 static int ocfs2_find_max_rec_len(struct super_block *sb,
1190 struct buffer_head *dirblock_bh)
1191 {
1192 int size, this_hole, largest_hole = 0;
1193 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1194 struct ocfs2_dir_entry *de;
1195
1196 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1197 size = ocfs2_dir_trailer_blk_off(sb);
1198 limit = start + size;
1199 de_buf = start;
1200 de = (struct ocfs2_dir_entry *)de_buf;
1201 do {
1202 if (de_buf != trailer) {
1203 this_hole = ocfs2_figure_dirent_hole(de);
1204 if (this_hole > largest_hole)
1205 largest_hole = this_hole;
1206 }
1207
1208 de_buf += le16_to_cpu(de->rec_len);
1209 de = (struct ocfs2_dir_entry *)de_buf;
1210 } while (de_buf < limit);
1211
1212 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1213 return largest_hole;
1214 return 0;
1215 }
1216
ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list * entry_list,int index)1217 static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1218 int index)
1219 {
1220 int num_used = le16_to_cpu(entry_list->de_num_used);
1221
1222 if (num_used == 1 || index == (num_used - 1))
1223 goto clear;
1224
1225 memmove(&entry_list->de_entries[index],
1226 &entry_list->de_entries[index + 1],
1227 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1228 clear:
1229 num_used--;
1230 memset(&entry_list->de_entries[num_used], 0,
1231 sizeof(struct ocfs2_dx_entry));
1232 entry_list->de_num_used = cpu_to_le16(num_used);
1233 }
1234
/*
 * Delete the entry described by 'lookup' from an indexed directory.
 *
 * Both the unindexed dirent (in lookup->dl_leaf_bh) and its index
 * record (lookup->dl_dx_entry) are removed, the entry count in the dx
 * root is decremented and the leaf block's free space accounting is
 * refreshed, linking the block into the free list when it was not
 * advertising any free space before.
 *
 * Returns 0 on success, -EIO if the index record pointer does not fall
 * inside the used part of its entry list, or the error from journal
 * access / __ocfs2_delete_entry().
 */
static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
				 struct ocfs2_dir_lookup_result *lookup)
{
	struct buffer_head *root_bh = lookup->dl_dx_root_bh;
	struct buffer_head *dirent_bh = lookup->dl_leaf_bh;
	struct ocfs2_dx_entry *target = lookup->dl_dx_entry;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_entry_list *entries;
	struct ocfs2_dir_block_trailer *trailer;
	int status, index, free_len, root_inline;
	int link_into_free_list = 0;

	/*
	 * The root block may need modifying whether or not the index
	 * records are stored inline in it, which is what makes this
	 * function a little messy.
	 *
	 * Here we *only* pick the list that holds the index records.
	 * Which blocks get journaled is decided further down.
	 */
	dx_root = (struct ocfs2_dx_root_block *)root_bh->b_data;
	root_inline = ocfs2_dx_root_inline(dx_root);
	if (root_inline) {
		entries = &dx_root->dr_entries;
	} else {
		struct ocfs2_dx_leaf *dx_leaf;

		dx_leaf = (struct ocfs2_dx_leaf *)lookup->dl_dx_leaf_bh->b_data;
		entries = &dx_leaf->dl_list;
	}

	/*
	 * Neither condition is treated as disk corruption - lookup
	 * should have caught that before we got here.
	 */
	BUG_ON(le16_to_cpu(entries->de_count) <= 0);
	BUG_ON(le16_to_cpu(entries->de_num_used) <= 0);

	/* Turn the record pointer back into a slot number in the list. */
	index = (char *)target - (char *)entries->de_entries;
	index /= sizeof(*target);

	if (index >= le16_to_cpu(entries->de_num_used)) {
		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
		     entries, target);
		return -EIO;
	}

	/*
	 * Removing this dirent is known to leave enough room for a new
	 * one, so the block has to join the free list unless it is
	 * already advertising free space.
	 */
	trailer = ocfs2_trailer_from_bh(dirent_bh, dir->i_sb);
	if (trailer->db_free_rec_len == 0)
		link_into_free_list = 1;

	/*
	 * Journal the block(s) holding the index before touching the
	 * unindexed entry. An error from __ocfs2_delete_entry() means
	 * the dirent is still in place; success means the index record
	 * *must* be removed as well.
	 *
	 * The root block is always journaled: its entry count changes
	 * and it may become the new head of the free list.
	 */
	status = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), root_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		return status;
	}

	if (!root_inline) {
		status = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
						 lookup->dl_dx_leaf_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status) {
			mlog_errno(status);
			return status;
		}
	}

	trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
				    index);

	status = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
				      dirent_bh, dirent_bh->b_data,
				      dirent_bh->b_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	free_len = ocfs2_find_max_rec_len(dir->i_sb, dirent_bh);
	trailer->db_free_rec_len = cpu_to_le16(free_len);
	if (link_into_free_list) {
		/* Push the block onto the head of the free list. */
		trailer->db_free_next = dx_root->dr_free_blk;
		dx_root->dr_free_blk = cpu_to_le64(dirent_bh->b_blocknr);
		ocfs2_journal_dirty(handle, root_bh);
	}

	/* __ocfs2_delete_entry() already did journal access on this bh */
	ocfs2_journal_dirty(handle, dirent_bh);

	le32_add_cpu(&dx_root->dr_num_entries, -1);
	ocfs2_journal_dirty(handle, root_bh);

	ocfs2_dx_list_remove_entry(entries, index);

	if (!root_inline)
		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);

	return status;
}
1350
/*
 * Delete 'de_del' from an inline-data directory. The inode block is
 * read to locate the inline data area, which is then searched over
 * the first i_size bytes.
 *
 * Returns the error from reading the inode block, or the result of
 * __ocfs2_delete_entry().
 */
static inline int ocfs2_delete_entry_id(handle_t *handle,
					struct inode *dir,
					struct ocfs2_dir_entry *de_del,
					struct buffer_head *bh)
{
	struct buffer_head *inode_bh = NULL;
	struct ocfs2_dinode *dinode;
	int status;

	status = ocfs2_read_inode_block(dir, &inode_bh);
	if (status) {
		mlog_errno(status);
		return status;
	}

	dinode = (struct ocfs2_dinode *)inode_bh->b_data;
	status = __ocfs2_delete_entry(handle, dir, de_del, bh,
				      dinode->id2.i_data.id_data,
				      i_size_read(dir));

	brelse(inode_bh);

	return status;
}
1377
/*
 * Delete 'de_del' from an extent based directory block: the whole of
 * 'bh' is the region searched.
 */
static inline int ocfs2_delete_entry_el(handle_t *handle,
					struct inode *dir,
					struct ocfs2_dir_entry *de_del,
					struct buffer_head *bh)
{
	char *block_start = bh->b_data;
	unsigned int block_len = bh->b_size;

	return __ocfs2_delete_entry(handle, dir, de_del, bh, block_start,
				    block_len);
}
1386
1387 /*
1388 * Delete a directory entry. Hide the details of directory
1389 * implementation from the caller.
1390 */
ocfs2_delete_entry(handle_t * handle,struct inode * dir,struct ocfs2_dir_lookup_result * res)1391 int ocfs2_delete_entry(handle_t *handle,
1392 struct inode *dir,
1393 struct ocfs2_dir_lookup_result *res)
1394 {
1395 if (ocfs2_dir_indexed(dir))
1396 return ocfs2_delete_entry_dx(handle, dir, res);
1397
1398 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1399 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1400 res->dl_leaf_bh);
1401
1402 return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1403 res->dl_leaf_bh);
1404 }
1405
1406 /*
1407 * Check whether 'de' has enough room to hold an entry of
1408 * 'new_rec_len' bytes.
1409 */
ocfs2_dirent_would_fit(struct ocfs2_dir_entry * de,unsigned int new_rec_len)1410 static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
1411 unsigned int new_rec_len)
1412 {
1413 unsigned int de_really_used;
1414
1415 /* Check whether this is an empty record with enough space */
1416 if (le64_to_cpu(de->inode) == 0 &&
1417 le16_to_cpu(de->rec_len) >= new_rec_len)
1418 return 1;
1419
1420 /*
1421 * Record might have free space at the end which we can
1422 * use.
1423 */
1424 de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
1425 if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
1426 return 1;
1427
1428 return 0;
1429 }
1430
ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf * dx_leaf,struct ocfs2_dx_entry * dx_new_entry)1431 static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1432 struct ocfs2_dx_entry *dx_new_entry)
1433 {
1434 int i;
1435
1436 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1437 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1438
1439 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1440 }
1441
/*
 * Append an index record to 'entry_list'. The record is zeroed, then
 * filled with the hash values from 'hinfo' and the block number of
 * the dirent it refers to. No capacity check is made here.
 */
static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
				       struct ocfs2_dx_hinfo *hinfo,
				       u64 dirent_blk)
{
	int slot = le16_to_cpu(entry_list->de_num_used);
	struct ocfs2_dx_entry *rec = &entry_list->de_entries[slot];

	memset(rec, 0, sizeof(*rec));
	rec->dx_dirent_blk = cpu_to_le64(dirent_blk);
	rec->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
	rec->dx_major_hash = cpu_to_le32(hinfo->major_hash);

	le16_add_cpu(&entry_list->de_num_used, 1);
}
1459
/*
 * Journal 'dx_leaf_bh' and append an index record for 'dirent_blk' to
 * the leaf it holds. Returns 0 or the journal access error.
 */
static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
				      struct ocfs2_dx_hinfo *hinfo,
				      u64 dirent_blk,
				      struct buffer_head *dx_leaf_bh)
{
	struct ocfs2_dx_leaf *leaf;
	int status;

	status = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		return status;
	}

	leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
	ocfs2_dx_entry_list_insert(&leaf->dl_list, hinfo, dirent_blk);
	ocfs2_journal_dirty(handle, dx_leaf_bh);

	return status;
}
1482
/*
 * Append an index record to the entry list stored inside the dx root
 * block. 'dir' and 'handle' are not used here; journaling of the root
 * block is not done by this function.
 */
static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
					struct ocfs2_dx_hinfo *hinfo,
					u64 dirent_blk,
					struct ocfs2_dx_root_block *dx_root)
{
	struct ocfs2_dx_entry_list *inline_list = &dx_root->dr_entries;

	ocfs2_dx_entry_list_insert(inline_list, hinfo, dirent_blk);
}
1490
/*
 * Add an index record for the dirent block in lookup->dl_leaf_bh,
 * using the hash in lookup->dl_hinfo. The record goes either into the
 * dx root (inline index) or into lookup->dl_dx_leaf_bh, and the root's
 * entry count is incremented.
 *
 * Returns 0 on success or the error from journal access / leaf insert.
 */
static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
			       struct ocfs2_dir_lookup_result *lookup)
{
	struct buffer_head *root_bh = lookup->dl_dx_root_bh;
	struct ocfs2_dx_root_block *root;
	u64 dirent_blk;
	int status;

	status = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), root_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		return status;
	}

	root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
	dirent_blk = lookup->dl_leaf_bh->b_blocknr;

	if (!ocfs2_dx_root_inline(root)) {
		status = __ocfs2_dx_dir_leaf_insert(dir, handle,
						    &lookup->dl_hinfo,
						    dirent_blk,
						    lookup->dl_dx_leaf_bh);
		if (status)
			return status;
	} else {
		ocfs2_dx_inline_root_insert(dir, handle, &lookup->dl_hinfo,
					    dirent_blk, root);
	}

	le32_add_cpu(&root->dr_num_entries, 1);
	ocfs2_journal_dirty(handle, root_bh);

	return status;
}
1525
/*
 * Unlink lookup->dl_leaf_bh from the directory's free list. Whatever
 * pointed at this block - the dx root or the previous leaf's trailer -
 * is redirected to this block's successor, the block's own free list
 * fields are cleared, and both modified buffers are marked dirty in
 * the journal.
 */
static void ocfs2_remove_block_from_free_list(struct inode *dir,
				       handle_t *handle,
				       struct ocfs2_dir_lookup_result *lookup)
{
	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
	struct buffer_head *pred_bh;
	struct ocfs2_dir_block_trailer *leaf_trailer;

	leaf_trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);

	if (!ocfs2_free_list_at_root(lookup)) {
		struct ocfs2_dir_block_trailer *pred_trailer;

		pred_bh = lookup->dl_prev_leaf_bh;
		pred_trailer = ocfs2_trailer_from_bh(pred_bh, dir->i_sb);
		pred_trailer->db_free_next = leaf_trailer->db_free_next;
	} else {
		struct ocfs2_dx_root_block *root;

		pred_bh = lookup->dl_dx_root_bh;
		root = (struct ocfs2_dx_root_block *)pred_bh->b_data;
		root->dr_free_blk = leaf_trailer->db_free_next;
	}

	leaf_trailer->db_free_rec_len = cpu_to_le16(0);
	leaf_trailer->db_free_next = cpu_to_le64(0);

	ocfs2_journal_dirty(handle, pred_bh);
	ocfs2_journal_dirty(handle, leaf_bh);
}
1552
1553 /*
1554 * This expects that a journal write has been reserved on
1555 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
1556 */
ocfs2_recalc_free_list(struct inode * dir,handle_t * handle,struct ocfs2_dir_lookup_result * lookup)1557 static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1558 struct ocfs2_dir_lookup_result *lookup)
1559 {
1560 int max_rec_len;
1561 struct ocfs2_dir_block_trailer *trailer;
1562
1563 /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1564 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1565 if (max_rec_len) {
1566 /*
1567 * There's still room in this block, so no need to remove it
1568 * from the free list. In this case, we just want to update
1569 * the rec len accounting.
1570 */
1571 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1572 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1573 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1574 } else {
1575 ocfs2_remove_block_from_free_list(dir, handle, lookup);
1576 }
1577 }
1578
1579 /* we don't always have a dentry for what we want to add, so people
1580 * like orphan dir can call this instead.
1581 *
1582 * The lookup context must have been filled from
1583 * ocfs2_prepare_dir_for_insert.
1584 */
__ocfs2_add_entry(handle_t * handle,struct inode * dir,const char * name,int namelen,struct inode * inode,u64 blkno,struct buffer_head * parent_fe_bh,struct ocfs2_dir_lookup_result * lookup)1585 int __ocfs2_add_entry(handle_t *handle,
1586 struct inode *dir,
1587 const char *name, int namelen,
1588 struct inode *inode, u64 blkno,
1589 struct buffer_head *parent_fe_bh,
1590 struct ocfs2_dir_lookup_result *lookup)
1591 {
1592 unsigned long offset;
1593 unsigned short rec_len;
1594 struct ocfs2_dir_entry *de, *de1;
1595 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1596 struct super_block *sb = dir->i_sb;
1597 int retval;
1598 unsigned int size = sb->s_blocksize;
1599 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1600 char *data_start = insert_bh->b_data;
1601
1602 if (!namelen)
1603 return -EINVAL;
1604
1605 if (ocfs2_dir_indexed(dir)) {
1606 struct buffer_head *bh;
1607
1608 /*
1609 * An indexed dir may require that we update the free space
1610 * list. Reserve a write to the previous node in the list so
1611 * that we don't fail later.
1612 *
1613 * XXX: This can be either a dx_root_block, or an unindexed
1614 * directory tree leaf block.
1615 */
1616 if (ocfs2_free_list_at_root(lookup)) {
1617 bh = lookup->dl_dx_root_bh;
1618 retval = ocfs2_journal_access_dr(handle,
1619 INODE_CACHE(dir), bh,
1620 OCFS2_JOURNAL_ACCESS_WRITE);
1621 } else {
1622 bh = lookup->dl_prev_leaf_bh;
1623 retval = ocfs2_journal_access_db(handle,
1624 INODE_CACHE(dir), bh,
1625 OCFS2_JOURNAL_ACCESS_WRITE);
1626 }
1627 if (retval) {
1628 mlog_errno(retval);
1629 return retval;
1630 }
1631 } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1632 data_start = di->id2.i_data.id_data;
1633 size = i_size_read(dir);
1634
1635 BUG_ON(insert_bh != parent_fe_bh);
1636 }
1637
1638 rec_len = OCFS2_DIR_REC_LEN(namelen);
1639 offset = 0;
1640 de = (struct ocfs2_dir_entry *) data_start;
1641 while (1) {
1642 BUG_ON((char *)de >= (size + data_start));
1643
1644 /* These checks should've already been passed by the
1645 * prepare function, but I guess we can leave them
1646 * here anyway. */
1647 if (!ocfs2_check_dir_entry(dir, de, insert_bh, data_start,
1648 size, offset)) {
1649 retval = -ENOENT;
1650 goto bail;
1651 }
1652 if (ocfs2_match(namelen, name, de)) {
1653 retval = -EEXIST;
1654 goto bail;
1655 }
1656
1657 /* We're guaranteed that we should have space, so we
1658 * can't possibly have hit the trailer...right? */
1659 mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
1660 "Hit dir trailer trying to insert %.*s "
1661 "(namelen %d) into directory %llu. "
1662 "offset is %lu, trailer offset is %d\n",
1663 namelen, name, namelen,
1664 (unsigned long long)parent_fe_bh->b_blocknr,
1665 offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
1666
1667 if (ocfs2_dirent_would_fit(de, rec_len)) {
1668 inode_set_mtime_to_ts(dir,
1669 inode_set_ctime_current(dir));
1670 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1671 if (retval < 0) {
1672 mlog_errno(retval);
1673 goto bail;
1674 }
1675
1676 if (insert_bh == parent_fe_bh)
1677 retval = ocfs2_journal_access_di(handle,
1678 INODE_CACHE(dir),
1679 insert_bh,
1680 OCFS2_JOURNAL_ACCESS_WRITE);
1681 else {
1682 retval = ocfs2_journal_access_db(handle,
1683 INODE_CACHE(dir),
1684 insert_bh,
1685 OCFS2_JOURNAL_ACCESS_WRITE);
1686
1687 if (!retval && ocfs2_dir_indexed(dir))
1688 retval = ocfs2_dx_dir_insert(dir,
1689 handle,
1690 lookup);
1691 }
1692
1693 if (retval) {
1694 mlog_errno(retval);
1695 goto bail;
1696 }
1697
1698 /* By now the buffer is marked for journaling */
1699 offset += le16_to_cpu(de->rec_len);
1700 if (le64_to_cpu(de->inode)) {
1701 de1 = (struct ocfs2_dir_entry *)((char *) de +
1702 OCFS2_DIR_REC_LEN(de->name_len));
1703 de1->rec_len =
1704 cpu_to_le16(le16_to_cpu(de->rec_len) -
1705 OCFS2_DIR_REC_LEN(de->name_len));
1706 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1707 de = de1;
1708 }
1709 de->file_type = FT_UNKNOWN;
1710 if (blkno) {
1711 de->inode = cpu_to_le64(blkno);
1712 ocfs2_set_de_type(de, inode->i_mode);
1713 } else
1714 de->inode = 0;
1715 de->name_len = namelen;
1716 memcpy(de->name, name, namelen);
1717
1718 if (ocfs2_dir_indexed(dir))
1719 ocfs2_recalc_free_list(dir, handle, lookup);
1720
1721 inode_inc_iversion(dir);
1722 ocfs2_journal_dirty(handle, insert_bh);
1723 retval = 0;
1724 goto bail;
1725 }
1726
1727 offset += le16_to_cpu(de->rec_len);
1728 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1729 }
1730
1731 /* when you think about it, the assert above should prevent us
1732 * from ever getting here. */
1733 retval = -ENOSPC;
1734 bail:
1735 if (retval)
1736 mlog_errno(retval);
1737
1738 return retval;
1739 }
1740
/*
 * Feed the entries of an inline-data directory to 'ctx', starting at
 * ctx->pos and stopping at i_size, at the first invalid dirent, or as
 * soon as dir_emit() refuses an entry.
 *
 * '*f_version' is compared with the inode version; on a mismatch the
 * position is re-derived by walking from the start of the data, and
 * '*f_version' is refreshed.
 *
 * Always returns 0, including when the inode block cannot be read (that
 * failure is only logged).
 */
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
				    u64 *f_version,
				    struct dir_context *ctx)
{
	int status, scan;
	unsigned long offset = ctx->pos;
	unsigned int rec_len;
	struct buffer_head *inode_bh = NULL;
	struct ocfs2_dinode *dinode;
	struct ocfs2_inline_data *idata;
	struct ocfs2_dir_entry *de;

	status = ocfs2_read_inode_block(inode, &inode_bh);
	if (status) {
		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		goto done;
	}

	dinode = (struct ocfs2_dinode *)inode_bh->b_data;
	idata = &dinode->id2.i_data;

	while (ctx->pos < i_size_read(inode)) {
		/*
		 * If the directory changed since the previous readdir(2)
		 * call, the saved position may no longer sit on a dirent
		 * boundary. Rescan from the start to find one.
		 */
		if (!inode_eq_iversion(inode, *f_version)) {
			scan = 0;
			while (scan < i_size_read(inode) && scan < offset) {
				de = (struct ocfs2_dir_entry *)
					(idata->id_data + scan);
				/*
				 * A full dirent check on every step is too
				 * costly; just make sure we keep advancing.
				 * Anything worse is caught by the check
				 * further down.
				 */
				rec_len = le16_to_cpu(de->rec_len);
				if (rec_len < OCFS2_DIR_REC_LEN(1))
					break;
				scan += rec_len;
			}
			ctx->pos = offset = scan;
			*f_version = inode_query_iversion(inode);
		}

		de = (struct ocfs2_dir_entry *)(idata->id_data + ctx->pos);
		if (!ocfs2_check_dir_entry(inode, de, inode_bh,
					   (char *)idata->id_data,
					   i_size_read(inode), ctx->pos)) {
			/* On error, skip the f_pos to the end. */
			ctx->pos = i_size_read(inode);
			break;
		}

		rec_len = le16_to_cpu(de->rec_len);
		offset += rec_len;
		if (le64_to_cpu(de->inode) &&
		    !dir_emit(ctx, de->name, de->name_len,
			      le64_to_cpu(de->inode),
			      fs_ftype_to_dtype(de->file_type)))
			break;

		ctx->pos += rec_len;
	}
done:
	brelse(inode_bh);
	return 0;
}
1806
1807 /*
1808 * NOTE: This function can be called against unindexed directories,
1809 * and indexed ones.
1810 */
ocfs2_dir_foreach_blk_el(struct inode * inode,u64 * f_version,struct dir_context * ctx,bool persist)1811 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1812 u64 *f_version,
1813 struct dir_context *ctx,
1814 bool persist)
1815 {
1816 unsigned long offset, blk, last_ra_blk = 0;
1817 int i;
1818 struct buffer_head * bh, * tmp;
1819 struct ocfs2_dir_entry * de;
1820 struct super_block * sb = inode->i_sb;
1821 unsigned int ra_sectors = 16;
1822 int stored = 0;
1823
1824 bh = NULL;
1825
1826 offset = ctx->pos & (sb->s_blocksize - 1);
1827
1828 while (ctx->pos < i_size_read(inode)) {
1829 blk = ctx->pos >> sb->s_blocksize_bits;
1830 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1831 /* Skip the corrupt dirblock and keep trying */
1832 ctx->pos += sb->s_blocksize - offset;
1833 continue;
1834 }
1835
1836 /* The idea here is to begin with 8k read-ahead and to stay
1837 * 4k ahead of our current position.
1838 *
1839 * TODO: Use the pagecache for this. We just need to
1840 * make sure it's cluster-safe... */
1841 if (!last_ra_blk
1842 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
1843 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
1844 i > 0; i--) {
1845 tmp = NULL;
1846 if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
1847 OCFS2_BH_READAHEAD))
1848 brelse(tmp);
1849 }
1850 last_ra_blk = blk;
1851 ra_sectors = 8;
1852 }
1853
1854 /* If the dir block has changed since the last call to
1855 * readdir(2), then we might be pointing to an invalid
1856 * dirent right now. Scan from the start of the block
1857 * to make sure. */
1858 if (!inode_eq_iversion(inode, *f_version)) {
1859 for (i = 0; i < sb->s_blocksize && i < offset; ) {
1860 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
1861 /* It's too expensive to do a full
1862 * dirent test each time round this
1863 * loop, but we do have to test at
1864 * least that it is non-zero. A
1865 * failure will be detected in the
1866 * dirent test below. */
1867 if (le16_to_cpu(de->rec_len) <
1868 OCFS2_DIR_REC_LEN(1))
1869 break;
1870 i += le16_to_cpu(de->rec_len);
1871 }
1872 offset = i;
1873 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
1874 | offset;
1875 *f_version = inode_query_iversion(inode);
1876 }
1877
1878 while (ctx->pos < i_size_read(inode)
1879 && offset < sb->s_blocksize) {
1880 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1881 if (!ocfs2_check_dir_entry(inode, de, bh, bh->b_data,
1882 sb->s_blocksize, offset)) {
1883 /* On error, skip the f_pos to the
1884 next block. */
1885 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
1886 break;
1887 }
1888 if (le64_to_cpu(de->inode)) {
1889 if (!dir_emit(ctx, de->name,
1890 de->name_len,
1891 le64_to_cpu(de->inode),
1892 fs_ftype_to_dtype(de->file_type))) {
1893 brelse(bh);
1894 return 0;
1895 }
1896 stored++;
1897 }
1898 offset += le16_to_cpu(de->rec_len);
1899 ctx->pos += le16_to_cpu(de->rec_len);
1900 }
1901 offset = 0;
1902 brelse(bh);
1903 bh = NULL;
1904 if (!persist && stored)
1905 break;
1906 }
1907 return 0;
1908 }
1909
/*
 * Walk the directory with the iterator matching its storage format:
 * inline-data or block based. 'persist' only matters to the block
 * based walker.
 */
static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
				 struct dir_context *ctx,
				 bool persist)
{
	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
		return ocfs2_dir_foreach_blk_el(inode, f_version, ctx,
						persist);

	return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
}
1918
1919 /*
1920 * This is intended to be called from inside other kernel functions,
1921 * so we fake some arguments.
1922 */
ocfs2_dir_foreach(struct inode * inode,struct dir_context * ctx)1923 int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
1924 {
1925 u64 version = inode_query_iversion(inode);
1926 ocfs2_dir_foreach_blk(inode, &version, ctx, true);
1927 return 0;
1928 }
1929
1930 /*
1931 * ocfs2_readdir()
1932 *
1933 */
ocfs2_readdir(struct file * file,struct dir_context * ctx)1934 int ocfs2_readdir(struct file *file, struct dir_context *ctx)
1935 {
1936 int error = 0;
1937 struct inode *inode = file_inode(file);
1938 int lock_level = 0;
1939
1940 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
1941
1942 error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
1943 if (lock_level && error >= 0) {
1944 /* We release EX lock which used to update atime
1945 * and get PR lock again to reduce contention
1946 * on commonly accessed directories. */
1947 ocfs2_inode_unlock(inode, 1);
1948 lock_level = 0;
1949 error = ocfs2_inode_lock(inode, NULL, 0);
1950 }
1951 if (error < 0) {
1952 if (error != -ENOENT)
1953 mlog_errno(error);
1954 /* we haven't got any yet, so propagate the error. */
1955 goto bail_nolock;
1956 }
1957
1958 error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
1959
1960 ocfs2_inode_unlock(inode, lock_level);
1961 if (error)
1962 mlog_errno(error);
1963
1964 bail_nolock:
1965
1966 return error;
1967 }
1968
1969 /*
1970 * NOTE: this should always be called with parent dir i_rwsem taken.
1971 */
ocfs2_find_files_on_disk(const char * name,int namelen,u64 * blkno,struct inode * inode,struct ocfs2_dir_lookup_result * lookup)1972 int ocfs2_find_files_on_disk(const char *name,
1973 int namelen,
1974 u64 *blkno,
1975 struct inode *inode,
1976 struct ocfs2_dir_lookup_result *lookup)
1977 {
1978 int status = -ENOENT;
1979
1980 trace_ocfs2_find_files_on_disk(namelen, name, blkno,
1981 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1982
1983 status = ocfs2_find_entry(name, namelen, inode, lookup);
1984 if (status)
1985 goto leave;
1986
1987 *blkno = le64_to_cpu(lookup->dl_entry->inode);
1988
1989 status = 0;
1990 leave:
1991
1992 return status;
1993 }
1994
1995 /*
1996 * Convenience function for callers which just want the block number
1997 * mapped to a name and don't require the full dirent info, etc.
1998 */
ocfs2_lookup_ino_from_name(struct inode * dir,const char * name,int namelen,u64 * blkno)1999 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
2000 int namelen, u64 *blkno)
2001 {
2002 int ret;
2003 struct ocfs2_dir_lookup_result lookup = { NULL, };
2004
2005 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
2006 ocfs2_free_dir_lookup_result(&lookup);
2007
2008 return ret;
2009 }
2010
2011 /* Check for a name within a directory.
2012 *
2013 * Return 0 if the name does not exist
2014 * Return -EEXIST if the directory contains the name
2015 *
2016 * Callers should have i_rwsem + a cluster lock on dir
2017 */
ocfs2_check_dir_for_entry(struct inode * dir,const char * name,int namelen)2018 int ocfs2_check_dir_for_entry(struct inode *dir,
2019 const char *name,
2020 int namelen)
2021 {
2022 int ret = 0;
2023 struct ocfs2_dir_lookup_result lookup = { NULL, };
2024
2025 trace_ocfs2_check_dir_for_entry(
2026 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2027
2028 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
2029 ret = -EEXIST;
2030 mlog_errno(ret);
2031 }
2032
2033 ocfs2_free_dir_lookup_result(&lookup);
2034
2035 return ret;
2036 }
2037
2038 struct ocfs2_empty_dir_priv {
2039 struct dir_context ctx;
2040 unsigned seen_dot;
2041 unsigned seen_dot_dot;
2042 unsigned seen_other;
2043 unsigned dx_dir;
2044 };
ocfs2_empty_dir_filldir(struct dir_context * ctx,const char * name,int name_len,loff_t pos,u64 ino,unsigned type)2045 static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
2046 int name_len, loff_t pos, u64 ino,
2047 unsigned type)
2048 {
2049 struct ocfs2_empty_dir_priv *p =
2050 container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
2051
2052 /*
2053 * Check the positions of "." and ".." records to be sure
2054 * they're in the correct place.
2055 *
2056 * Indexed directories don't need to proceed past the first
2057 * two entries, so we end the scan after seeing '..'. Despite
2058 * that, we allow the scan to proceed In the event that we
2059 * have a corrupted indexed directory (no dot or dot dot
2060 * entries). This allows us to double check for existing
2061 * entries which might not have been found in the index.
2062 */
2063 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
2064 p->seen_dot = 1;
2065 return true;
2066 }
2067
2068 if (name_len == 2 && !strncmp("..", name, 2) &&
2069 pos == OCFS2_DIR_REC_LEN(1)) {
2070 p->seen_dot_dot = 1;
2071
2072 if (p->dx_dir && p->seen_dot)
2073 return false;
2074
2075 return true;
2076 }
2077
2078 p->seen_other = 1;
2079 return false;
2080 }
2081
ocfs2_empty_dir_dx(struct inode * inode,struct ocfs2_empty_dir_priv * priv)2082 static int ocfs2_empty_dir_dx(struct inode *inode,
2083 struct ocfs2_empty_dir_priv *priv)
2084 {
2085 int ret;
2086 struct buffer_head *di_bh = NULL;
2087 struct buffer_head *dx_root_bh = NULL;
2088 struct ocfs2_dinode *di;
2089 struct ocfs2_dx_root_block *dx_root;
2090
2091 priv->dx_dir = 1;
2092
2093 ret = ocfs2_read_inode_block(inode, &di_bh);
2094 if (ret) {
2095 mlog_errno(ret);
2096 goto out;
2097 }
2098 di = (struct ocfs2_dinode *)di_bh->b_data;
2099
2100 ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2101 if (ret) {
2102 mlog_errno(ret);
2103 goto out;
2104 }
2105 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2106
2107 if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2108 priv->seen_other = 1;
2109
2110 out:
2111 brelse(di_bh);
2112 brelse(dx_root_bh);
2113 return ret;
2114 }
2115
2116 /*
2117 * routine to check that the specified directory is empty (for rmdir)
2118 *
2119 * Returns 1 if dir is empty, zero otherwise.
2120 *
2121 * XXX: This is a performance problem for unindexed directories.
2122 */
ocfs2_empty_dir(struct inode * inode)2123 int ocfs2_empty_dir(struct inode *inode)
2124 {
2125 int ret;
2126 struct ocfs2_empty_dir_priv priv = {
2127 .ctx.actor = ocfs2_empty_dir_filldir,
2128 };
2129
2130 if (ocfs2_dir_indexed(inode)) {
2131 ret = ocfs2_empty_dir_dx(inode, &priv);
2132 if (ret)
2133 mlog_errno(ret);
2134 /*
2135 * We still run ocfs2_dir_foreach to get the checks
2136 * for "." and "..".
2137 */
2138 }
2139
2140 ret = ocfs2_dir_foreach(inode, &priv.ctx);
2141 if (ret)
2142 mlog_errno(ret);
2143
2144 if (!priv.seen_dot || !priv.seen_dot_dot) {
2145 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
2146 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2147 /*
2148 * XXX: Is it really safe to allow an unlink to continue?
2149 */
2150 return 1;
2151 }
2152
2153 return !priv.seen_other;
2154 }
2155
2156 /*
2157 * Fills "." and ".." dirents in a new directory block. Returns dirent for
2158 * "..", which might be used during creation of a directory with a trailing
2159 * header. It is otherwise safe to ignore the return code.
2160 */
ocfs2_fill_initial_dirents(struct inode * inode,struct inode * parent,char * start,unsigned int size)2161 static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
2162 struct inode *parent,
2163 char *start,
2164 unsigned int size)
2165 {
2166 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
2167
2168 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
2169 de->name_len = 1;
2170 de->rec_len =
2171 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
2172 strcpy(de->name, ".");
2173 ocfs2_set_de_type(de, S_IFDIR);
2174
2175 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
2176 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
2177 de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
2178 de->name_len = 2;
2179 strcpy(de->name, "..");
2180 ocfs2_set_de_type(de, S_IFDIR);
2181
2182 return de;
2183 }
2184
2185 /*
2186 * This works together with code in ocfs2_mknod_locked() which sets
2187 * the inline-data flag and initializes the inline-data section.
2188 */
ocfs2_fill_new_dir_id(struct ocfs2_super * osb,handle_t * handle,struct inode * parent,struct inode * inode,struct buffer_head * di_bh)2189 static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2190 handle_t *handle,
2191 struct inode *parent,
2192 struct inode *inode,
2193 struct buffer_head *di_bh)
2194 {
2195 int ret;
2196 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2197 struct ocfs2_inline_data *data = &di->id2.i_data;
2198 unsigned int size = le16_to_cpu(data->id_count);
2199
2200 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2201 OCFS2_JOURNAL_ACCESS_WRITE);
2202 if (ret) {
2203 mlog_errno(ret);
2204 goto out;
2205 }
2206
2207 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2208 ocfs2_journal_dirty(handle, di_bh);
2209
2210 i_size_write(inode, size);
2211 set_nlink(inode, 2);
2212 inode->i_blocks = ocfs2_inode_sector_count(inode);
2213
2214 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2215 if (ret < 0)
2216 mlog_errno(ret);
2217
2218 out:
2219 return ret;
2220 }
2221
/*
 * Populate a new extent-based directory.
 *
 * The first directory block is obtained through ocfs2_do_extend_dir(),
 * zeroed, and filled with "." and "..". When the directory wants a
 * trailer, the dirents stop at the trailer offset and the trailer is
 * initialized with the free space left behind "..". i_size is set to one
 * block and the link count to 2.
 *
 * If ret_new_bh is non-NULL the buffer_head of the new block is handed to
 * the caller on success; otherwise it is released here.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
				 handle_t *handle,
				 struct inode *parent,
				 struct inode *inode,
				 struct buffer_head *fe_bh,
				 struct ocfs2_alloc_context *data_ac,
				 struct buffer_head **ret_new_bh)
{
	struct buffer_head *block_bh = NULL;
	struct ocfs2_dir_entry *dotdot;
	unsigned int dirent_space;
	int status;

	/* Dirents may not run into the trailer, if there is going to be one. */
	if (ocfs2_new_dir_wants_trailer(inode))
		dirent_space = ocfs2_dir_trailer_blk_off(parent->i_sb);
	else
		dirent_space = osb->sb->s_blocksize;

	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
				     data_ac, NULL, &block_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), block_bh);

	status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), block_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	memset(block_bh->b_data, 0, osb->sb->s_blocksize);

	dotdot = ocfs2_fill_initial_dirents(inode, parent, block_bh->b_data,
					    dirent_space);
	if (ocfs2_new_dir_wants_trailer(inode)) {
		/*
		 * Figure out the size of the hole left over after
		 * insertion of '.' and '..'. The trailer wants this
		 * information.
		 */
		int hole = le16_to_cpu(dotdot->rec_len);

		hole -= OCFS2_DIR_REC_LEN(2);
		hole -= sizeof(struct ocfs2_dir_block_trailer);

		ocfs2_init_dir_trailer(inode, block_bh, hole);
	}

	ocfs2_journal_dirty(handle, block_bh);

	i_size_write(inode, inode->i_sb->s_blocksize);
	set_nlink(inode, 2);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = 0;
	if (ret_new_bh) {
		/* The caller takes over our reference. */
		*ret_new_bh = block_bh;
		block_bh = NULL;
	}
bail:
	brelse(block_bh);

	return status;
}
2291
/*
 * Allocate a dx root block for "dir", initialize it and hook it into the
 * inode.
 *
 * One metadata block is claimed from meta_ac, zeroed and stamped with the
 * dx root signature, its suballocator location, the fs generation and the
 * block numbers of itself and of the directory. dr_num_entries is seeded
 * from num_entries, and dr_free_blk points at dirdata_bh's block when its
 * trailer reports a non-zero db_free_rec_len. Depending on dx_inline the
 * root is prepared to hold an inline entry list or an extent list.
 * Finally the dinode gets i_dx_root and OCFS2_INDEXED_DIR_FL.
 *
 * On success returns 0 and passes the dx root buffer_head to the caller
 * through *ret_dx_root_bh. On failure an error is returned and the buffer
 * is released here.
 */
static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
				     handle_t *handle, struct inode *dir,
				     struct buffer_head *di_bh,
				     struct buffer_head *dirdata_bh,
				     struct ocfs2_alloc_context *meta_ac,
				     int dx_inline, u32 num_entries,
				     struct buffer_head **ret_dx_root_bh)
{
	struct ocfs2_dir_block_trailer *trailer =
		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_inode_info *oi = OCFS2_I(dir);
	struct ocfs2_dx_root_block *root;
	struct buffer_head *root_bh = NULL;
	unsigned int num_bits;
	u64 suballoc_loc, root_blkno, free_blk;
	u16 suballoc_bit;
	int status;

	status = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
				      &suballoc_bit, &num_bits, &root_blkno);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	trace_ocfs2_dx_dir_attach_index(
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)root_blkno);

	root_bh = sb_getblk(osb->sb, root_blkno);
	if (!root_bh) {
		status = -ENOMEM;
		goto out;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), root_bh);

	status = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), root_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* Start from an all-zero block, then fill in the header. */
	root = (struct ocfs2_dx_root_block *)root_bh->b_data;
	memset(root, 0, osb->sb->s_blocksize);
	strcpy(root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
	root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
	root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
	root->dr_suballoc_bit = cpu_to_le16(suballoc_bit);
	root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
	root->dr_blkno = cpu_to_le64(root_blkno);
	root->dr_dir_blkno = cpu_to_le64(oi->ip_blkno);
	root->dr_num_entries = cpu_to_le32(num_entries);

	/* Only point at the data block if its trailer reports free space. */
	free_blk = 0;
	if (le16_to_cpu(trailer->db_free_rec_len))
		free_blk = dirdata_bh->b_blocknr;
	root->dr_free_blk = cpu_to_le64(free_blk);

	if (!dx_inline) {
		root->dr_list.l_count =
			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
	} else {
		root->dr_flags |= OCFS2_DX_FLAG_INLINE;
		root->dr_entries.de_count =
			cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
	}
	ocfs2_journal_dirty(handle, root_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	di->i_dx_root = cpu_to_le64(root_blkno);

	/* Keep the in-memory and on-disk feature flags in step. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_journal_dirty(handle, di_bh);

	/* Success: the caller now owns the reference. */
	*ret_dx_root_bh = root_bh;
	root_bh = NULL;

out:
	brelse(root_bh);
	return status;
}
2383
/*
 * Format num_dx_leaves consecutive blocks, beginning at start_blk, as
 * empty dx leaves. Every block is journaled as newly created, zeroed and
 * stamped with the leaf signature, fs generation, its own block number
 * and the per-leaf entry capacity.
 *
 * The buffer_heads are stored in dx_leaves[] as they are obtained and are
 * not released here, not even when an error is returned part-way through.
 *
 * Returns 0 on success, -ENOMEM if a buffer_head could not be obtained,
 * or the error from journal access.
 */
static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
				       handle_t *handle, struct inode *dir,
				       struct buffer_head **dx_leaves,
				       int num_dx_leaves, u64 start_blk)
{
	int idx;

	for (idx = 0; idx < num_dx_leaves; idx++) {
		struct ocfs2_dx_leaf *leaf;
		struct buffer_head *leaf_bh;
		int status;

		leaf_bh = sb_getblk(osb->sb, start_blk + idx);
		if (!leaf_bh)
			return -ENOMEM;
		dx_leaves[idx] = leaf_bh;

		ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), leaf_bh);

		status = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
						 leaf_bh,
						 OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			return status;
		}

		leaf = (struct ocfs2_dx_leaf *) leaf_bh->b_data;

		memset(leaf, 0, osb->sb->s_blocksize);
		strcpy(leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
		leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
		leaf->dl_blkno = cpu_to_le64(leaf_bh->b_blocknr);
		leaf->dl_list.de_count =
			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));

		trace_ocfs2_dx_dir_format_cluster(
				(unsigned long long)OCFS2_I(dir)->ip_blkno,
				(unsigned long long)leaf_bh->b_blocknr,
				le16_to_cpu(leaf->dl_list.de_count));

		ocfs2_journal_dirty(handle, leaf_bh);
	}

	return 0;
}
2431
2432 /*
2433 * Allocates and formats a new cluster for use in an indexed dir
2434 * leaf. This version will not do the extent insert, so that it can be
2435 * used by operations which need careful ordering.
2436 */
__ocfs2_dx_dir_new_cluster(struct inode * dir,u32 cpos,handle_t * handle,struct ocfs2_alloc_context * data_ac,struct buffer_head ** dx_leaves,int num_dx_leaves,u64 * ret_phys_blkno)2437 static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2438 u32 cpos, handle_t *handle,
2439 struct ocfs2_alloc_context *data_ac,
2440 struct buffer_head **dx_leaves,
2441 int num_dx_leaves, u64 *ret_phys_blkno)
2442 {
2443 int ret;
2444 u32 phys, num;
2445 u64 phys_blkno;
2446 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2447
2448 /*
2449 * XXX: For create, this should claim cluster for the index
2450 * *before* the unindexed insert so that we have a better
2451 * chance of contiguousness as the directory grows in number
2452 * of entries.
2453 */
2454 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2455 if (ret) {
2456 mlog_errno(ret);
2457 goto out;
2458 }
2459
2460 /*
2461 * Format the new cluster first. That way, we're inserting
2462 * valid data.
2463 */
2464 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2465 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2466 num_dx_leaves, phys_blkno);
2467 if (ret) {
2468 mlog_errno(ret);
2469 goto out;
2470 }
2471
2472 *ret_phys_blkno = phys_blkno;
2473 out:
2474 return ret;
2475 }
2476
/*
 * Allocate and format a new cluster of dx leaves, then insert it into the
 * extent tree "et" at logical cluster "cpos".
 *
 * Returns 0 or the error from allocation / extent insertion.
 */
static int ocfs2_dx_dir_new_cluster(struct inode *dir,
				    struct ocfs2_extent_tree *et,
				    u32 cpos, handle_t *handle,
				    struct ocfs2_alloc_context *data_ac,
				    struct ocfs2_alloc_context *meta_ac,
				    struct buffer_head **dx_leaves,
				    int num_dx_leaves)
{
	u64 first_blk;
	int status;

	status = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac,
					    dx_leaves, num_dx_leaves,
					    &first_blk);
	if (status) {
		mlog_errno(status);
		return status;
	}

	status = ocfs2_insert_extent(handle, et, cpos, first_blk, 1, 0,
				     meta_ac);
	if (status)
		mlog_errno(status);

	return status;
}
2502
ocfs2_dx_dir_kmalloc_leaves(struct super_block * sb,int * ret_num_leaves)2503 static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2504 int *ret_num_leaves)
2505 {
2506 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2507 struct buffer_head **dx_leaves;
2508
2509 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2510 GFP_NOFS);
2511 if (dx_leaves && ret_num_leaves)
2512 *ret_num_leaves = num_dx_leaves;
2513
2514 return dx_leaves;
2515 }
2516
/*
 * Populate a new indexed directory.
 *
 * The directory is first built as though it were unindexed, then an
 * inline dx root is attached and "." and ".." are inserted into its entry
 * list, both pointing at the first directory block.
 *
 * Returns 0 or the error from either step.
 */
static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
				 handle_t *handle,
				 struct inode *parent,
				 struct inode *inode,
				 struct buffer_head *di_bh,
				 struct ocfs2_alloc_context *data_ac,
				 struct ocfs2_alloc_context *meta_ac)
{
	struct buffer_head *root_bh = NULL;
	struct buffer_head *first_bh = NULL;
	struct ocfs2_dx_entry_list *entries;
	struct ocfs2_dx_hinfo hash;
	int status;

	/*
	 * Our strategy is to create the directory as though it were
	 * unindexed, then add the index block. This works with very
	 * little complication since the state of a new directory is a
	 * very well known quantity.
	 *
	 * Essentially, we have two dirents ("." and ".."), in the 1st
	 * block which need indexing. These are easily inserted into
	 * the index block.
	 */
	status = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
				       data_ac, &first_bh);
	if (status) {
		mlog_errno(status);
		goto release;
	}

	status = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, first_bh,
					   meta_ac, 1, 2, &root_bh);
	if (status) {
		mlog_errno(status);
		goto release;
	}

	/* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
	entries = &((struct ocfs2_dx_root_block *)root_bh->b_data)->dr_entries;

	ocfs2_dx_dir_name_hash(inode, ".", 1, &hash);
	ocfs2_dx_entry_list_insert(entries, &hash, first_bh->b_blocknr);

	ocfs2_dx_dir_name_hash(inode, "..", 2, &hash);
	ocfs2_dx_entry_list_insert(entries, &hash, first_bh->b_blocknr);

release:
	brelse(root_bh);
	brelse(first_bh);
	return status;
}
2571
/*
 * Populate a freshly created directory with "." and "..", picking the
 * layout by feature: inline data if the inode carries
 * OCFS2_INLINE_DATA_FL, otherwise an indexed directory when the file
 * system supports indexed dirs, otherwise a plain extent-based one.
 *
 * It is a bug to call this without data_ac on a file system that does not
 * support inline data.
 *
 * Returns the result of the chosen helper.
 */
int ocfs2_fill_new_dir(struct ocfs2_super *osb,
		       handle_t *handle,
		       struct inode *parent,
		       struct inode *inode,
		       struct buffer_head *fe_bh,
		       struct ocfs2_alloc_context *data_ac,
		       struct ocfs2_alloc_context *meta_ac)

{
	int status;

	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		status = ocfs2_fill_new_dir_id(osb, handle, parent, inode,
					       fe_bh);
	else if (ocfs2_supports_indexed_dirs(osb))
		status = ocfs2_fill_new_dir_dx(osb, handle, parent, inode,
					       fe_bh, data_ac, meta_ac);
	else
		status = ocfs2_fill_new_dir_el(osb, handle, parent, inode,
					       fe_bh, data_ac, NULL);

	return status;
}
2593
/*
 * Add every live dirent of the directory block in dirent_bh to the dx
 * leaves in dx_leaves[]. Records with a zero name length or a zero inode
 * are skipped. Each dirent is hashed, the hash selects the leaf, and
 * *num_dx_entries is bumped once per successful insert.
 *
 * Returns 0 or the error from the first failed leaf insert. Inserts made
 * before the failure stay in place.
 */
static int ocfs2_dx_dir_index_block(struct inode *dir,
				    handle_t *handle,
				    struct buffer_head **dx_leaves,
				    int num_dx_leaves,
				    u32 *num_dx_entries,
				    struct buffer_head *dirent_bh)
{
	u64 dirent_blk = dirent_bh->b_blocknr;
	char *limit = dirent_bh->b_data + dir->i_sb->s_blocksize;
	struct ocfs2_dir_entry *de;
	struct ocfs2_dx_hinfo hash;
	char *cursor;
	int status = 0, namelen, leaf_idx;

	/* Step from record to record by rec_len until the block ends. */
	for (cursor = dirent_bh->b_data; cursor < limit;
	     cursor += le16_to_cpu(de->rec_len)) {
		de = (struct ocfs2_dir_entry *)cursor;

		namelen = de->name_len;
		if (!namelen || !de->inode)
			continue;

		ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hash);

		leaf_idx = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hash);

		status = __ocfs2_dx_dir_leaf_insert(dir, handle, &hash,
						    dirent_blk,
						    dx_leaves[leaf_idx]);
		if (status) {
			mlog_errno(status);
			break;
		}

		*num_dx_entries = *num_dx_entries + 1;
	}

	return status;
}
2639
2640 /*
2641 * XXX: This expects dx_root_bh to already be part of the transaction.
2642 */
ocfs2_dx_dir_index_root_block(struct inode * dir,struct buffer_head * dx_root_bh,struct buffer_head * dirent_bh)2643 static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2644 struct buffer_head *dx_root_bh,
2645 struct buffer_head *dirent_bh)
2646 {
2647 char *de_buf, *limit;
2648 struct ocfs2_dx_root_block *dx_root;
2649 struct ocfs2_dir_entry *de;
2650 struct ocfs2_dx_hinfo hinfo;
2651 u64 dirent_blk = dirent_bh->b_blocknr;
2652
2653 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2654
2655 de_buf = dirent_bh->b_data;
2656 limit = de_buf + dir->i_sb->s_blocksize;
2657
2658 while (de_buf < limit) {
2659 de = (struct ocfs2_dir_entry *)de_buf;
2660
2661 if (!de->name_len || !de->inode)
2662 goto inc;
2663
2664 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2665
2666 trace_ocfs2_dx_dir_index_root_block(
2667 (unsigned long long)dir->i_ino,
2668 hinfo.major_hash, hinfo.minor_hash,
2669 de->name_len, de->name,
2670 le16_to_cpu(dx_root->dr_entries.de_num_used));
2671
2672 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2673 dirent_blk);
2674
2675 le32_add_cpu(&dx_root->dr_num_entries, 1);
2676 inc:
2677 de_buf += le16_to_cpu(de->rec_len);
2678 }
2679 }
2680
2681 /*
2682 * Count the number of inline directory entries in di_bh and compare
2683 * them against the number of entries we can hold in an inline dx root
2684 * block.
2685 */
ocfs2_new_dx_should_be_inline(struct inode * dir,struct buffer_head * di_bh)2686 static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2687 struct buffer_head *di_bh)
2688 {
2689 int dirent_count = 0;
2690 char *de_buf, *limit;
2691 struct ocfs2_dir_entry *de;
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693
2694 de_buf = di->id2.i_data.id_data;
2695 limit = de_buf + i_size_read(dir);
2696
2697 while (de_buf < limit) {
2698 de = (struct ocfs2_dir_entry *)de_buf;
2699
2700 if (de->name_len && de->inode)
2701 dirent_count++;
2702
2703 de_buf += le16_to_cpu(de->rec_len);
2704 }
2705
2706 /* We are careful to leave room for one extra record. */
2707 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
2708 }
2709
2710 /*
2711 * Expand rec_len of the rightmost dirent in a directory block so that it
2712 * contains the end of our valid space for dirents. We do this during
2713 * expansion from an inline directory to one with extents. The first dir block
2714 * in that case is taken from the inline data portion of the inode block.
2715 *
2716 * This will also return the largest amount of contiguous space for a dirent
2717 * in the block. That value is *not* necessarily the last dirent, even after
2718 * expansion. The directory indexing code wants this value for free space
2719 * accounting. We do this here since we're already walking the entire dir
2720 * block.
2721 *
2722 * We add the dir trailer if this filesystem wants it.
2723 */
ocfs2_expand_last_dirent(char * start,unsigned int old_size,struct inode * dir)2724 static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
2725 struct inode *dir)
2726 {
2727 struct super_block *sb = dir->i_sb;
2728 struct ocfs2_dir_entry *de;
2729 struct ocfs2_dir_entry *prev_de;
2730 char *de_buf, *limit;
2731 unsigned int new_size = sb->s_blocksize;
2732 unsigned int bytes, this_hole;
2733 unsigned int largest_hole = 0;
2734
2735 if (ocfs2_new_dir_wants_trailer(dir))
2736 new_size = ocfs2_dir_trailer_blk_off(sb);
2737
2738 bytes = new_size - old_size;
2739
2740 limit = start + old_size;
2741 de_buf = start;
2742 de = (struct ocfs2_dir_entry *)de_buf;
2743 do {
2744 this_hole = ocfs2_figure_dirent_hole(de);
2745 if (this_hole > largest_hole)
2746 largest_hole = this_hole;
2747
2748 prev_de = de;
2749 de_buf += le16_to_cpu(de->rec_len);
2750 de = (struct ocfs2_dir_entry *)de_buf;
2751 } while (de_buf < limit);
2752
2753 le16_add_cpu(&prev_de->rec_len, bytes);
2754
2755 /* We need to double check this after modification of the final
2756 * dirent. */
2757 this_hole = ocfs2_figure_dirent_hole(prev_de);
2758 if (this_hole > largest_hole)
2759 largest_hole = this_hole;
2760
2761 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2762 return largest_hole;
2763 return 0;
2764 }
2765
2766 /*
2767 * We allocate enough clusters to fulfill "blocks_wanted", but set
2768 * i_size to exactly one block. Ocfs2_extend_dir() will handle the
2769 * rest automatically for us.
2770 *
2771 * *first_block_bh is a pointer to the 1st data block allocated to the
2772 * directory.
2773 */
ocfs2_expand_inline_dir(struct inode * dir,struct buffer_head * di_bh,unsigned int blocks_wanted,struct ocfs2_dir_lookup_result * lookup,struct buffer_head ** first_block_bh)2774 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2775 unsigned int blocks_wanted,
2776 struct ocfs2_dir_lookup_result *lookup,
2777 struct buffer_head **first_block_bh)
2778 {
2779 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
2780 struct super_block *sb = dir->i_sb;
2781 int ret, i, num_dx_leaves = 0, dx_inline = 0,
2782 credits = ocfs2_inline_to_extents_credits(sb);
2783 u64 dx_insert_blkno, blkno,
2784 bytes = blocks_wanted << sb->s_blocksize_bits;
2785 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2786 struct ocfs2_inode_info *oi = OCFS2_I(dir);
2787 struct ocfs2_alloc_context *data_ac = NULL;
2788 struct ocfs2_alloc_context *meta_ac = NULL;
2789 struct buffer_head *dirdata_bh = NULL;
2790 struct buffer_head *dx_root_bh = NULL;
2791 struct buffer_head **dx_leaves = NULL;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793 handle_t *handle;
2794 struct ocfs2_extent_tree et;
2795 struct ocfs2_extent_tree dx_et;
2796 int did_quota = 0, bytes_allocated = 0;
2797
2798 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2799
2800 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2801 dx_alloc = 0;
2802
2803 down_write(&oi->ip_alloc_sem);
2804
2805 if (ocfs2_supports_indexed_dirs(osb)) {
2806 credits += ocfs2_add_dir_index_credits(sb);
2807
2808 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2809 if (!dx_inline) {
2810 /* Add one more cluster for an index leaf */
2811 dx_alloc++;
2812 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2813 &num_dx_leaves);
2814 if (!dx_leaves) {
2815 ret = -ENOMEM;
2816 mlog_errno(ret);
2817 goto out;
2818 }
2819 }
2820
2821 /* This gets us the dx_root */
2822 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2823 if (ret) {
2824 mlog_errno(ret);
2825 goto out;
2826 }
2827 }
2828
2829 /*
2830 * We should never need more than 2 clusters for the unindexed
2831 * tree - maximum dirent size is far less than one block. In
2832 * fact, the only time we'd need more than one cluster is if
2833 * blocksize == clustersize and the dirent won't fit in the
2834 * extra space that the expansion to a single block gives. As
2835 * of today, that only happens on 4k/4k file systems.
2836 */
2837 BUG_ON(alloc > 2);
2838
2839 ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
2840 if (ret) {
2841 mlog_errno(ret);
2842 goto out;
2843 }
2844
2845 /*
2846 * Prepare for worst case allocation scenario of two separate
2847 * extents in the unindexed tree.
2848 */
2849 if (alloc == 2)
2850 credits += OCFS2_SUBALLOC_ALLOC;
2851
2852 handle = ocfs2_start_trans(osb, credits);
2853 if (IS_ERR(handle)) {
2854 ret = PTR_ERR(handle);
2855 mlog_errno(ret);
2856 goto out;
2857 }
2858
2859 ret = dquot_alloc_space_nodirty(dir,
2860 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2861 if (ret)
2862 goto out_commit;
2863 did_quota = 1;
2864
2865 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2866 /*
2867 * Allocate our index cluster first, to maximize the
2868 * possibility that unindexed leaves grow
2869 * contiguously.
2870 */
2871 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2872 dx_leaves, num_dx_leaves,
2873 &dx_insert_blkno);
2874 if (ret) {
2875 mlog_errno(ret);
2876 goto out_commit;
2877 }
2878 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2879 }
2880
2881 /*
2882 * Try to claim as many clusters as the bitmap can give though
2883 * if we only get one now, that's enough to continue. The rest
2884 * will be claimed after the conversion to extents.
2885 */
2886 if (ocfs2_dir_resv_allowed(osb))
2887 data_ac->ac_resv = &oi->ip_la_data_resv;
2888 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2889 if (ret) {
2890 mlog_errno(ret);
2891 goto out_commit;
2892 }
2893 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2894
2895 /*
2896 * Operations are carefully ordered so that we set up the new
2897 * data block first. The conversion from inline data to
2898 * extents follows.
2899 */
2900 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
2901 dirdata_bh = sb_getblk(sb, blkno);
2902 if (!dirdata_bh) {
2903 ret = -ENOMEM;
2904 mlog_errno(ret);
2905 goto out_commit;
2906 }
2907
2908 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
2909
2910 ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
2911 OCFS2_JOURNAL_ACCESS_CREATE);
2912 if (ret) {
2913 mlog_errno(ret);
2914 goto out_commit;
2915 }
2916
2917 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
2918 memset(dirdata_bh->b_data + i_size_read(dir), 0,
2919 sb->s_blocksize - i_size_read(dir));
2920 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
2921 if (ocfs2_new_dir_wants_trailer(dir)) {
2922 /*
2923 * Prepare the dir trailer up front. It will otherwise look
2924 * like a valid dirent. Even if inserting the index fails
2925 * (unlikely), then all we'll have done is given first dir
2926 * block a small amount of fragmentation.
2927 */
2928 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
2929 }
2930
2931 ocfs2_update_inode_fsync_trans(handle, dir, 1);
2932 ocfs2_journal_dirty(handle, dirdata_bh);
2933
2934 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2935 /*
2936 * Dx dirs with an external cluster need to do this up
2937 * front. Inline dx root's get handled later, after
2938 * we've allocated our root block. We get passed back
2939 * a total number of items so that dr_num_entries can
2940 * be correctly set once the dx_root has been
2941 * allocated.
2942 */
2943 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
2944 num_dx_leaves, &num_dx_entries,
2945 dirdata_bh);
2946 if (ret) {
2947 mlog_errno(ret);
2948 goto out_commit;
2949 }
2950 }
2951
2952 /*
2953 * Set extent, i_size, etc on the directory. After this, the
2954 * inode should contain the same exact dirents as before and
2955 * be fully accessible from system calls.
2956 *
2957 * We let the later dirent insert modify c/mtime - to the user
2958 * the data hasn't changed.
2959 */
2960 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2961 OCFS2_JOURNAL_ACCESS_CREATE);
2962 if (ret) {
2963 mlog_errno(ret);
2964 goto out_commit;
2965 }
2966
2967 spin_lock(&oi->ip_lock);
2968 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
2969 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2970 spin_unlock(&oi->ip_lock);
2971
2972 ocfs2_dinode_new_extent_list(dir, di);
2973
2974 i_size_write(dir, sb->s_blocksize);
2975 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
2976
2977 di->i_size = cpu_to_le64(sb->s_blocksize);
2978 di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
2979 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
2980 ocfs2_update_inode_fsync_trans(handle, dir, 1);
2981
2982 /*
2983 * This should never fail as our extent list is empty and all
2984 * related blocks have been journaled already.
2985 */
2986 ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
2987 0, NULL);
2988 if (ret) {
2989 mlog_errno(ret);
2990 goto out_commit;
2991 }
2992
2993 /*
2994 * Set i_blocks after the extent insert for the most up to
2995 * date ip_clusters value.
2996 */
2997 dir->i_blocks = ocfs2_inode_sector_count(dir);
2998
2999 ocfs2_journal_dirty(handle, di_bh);
3000
3001 if (ocfs2_supports_indexed_dirs(osb)) {
3002 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3003 dirdata_bh, meta_ac, dx_inline,
3004 num_dx_entries, &dx_root_bh);
3005 if (ret) {
3006 mlog_errno(ret);
3007 goto out_commit;
3008 }
3009
3010 if (dx_inline) {
3011 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3012 dirdata_bh);
3013 } else {
3014 ocfs2_init_dx_root_extent_tree(&dx_et,
3015 INODE_CACHE(dir),
3016 dx_root_bh);
3017 ret = ocfs2_insert_extent(handle, &dx_et, 0,
3018 dx_insert_blkno, 1, 0, NULL);
3019 if (ret)
3020 mlog_errno(ret);
3021 }
3022 }
3023
3024 /*
3025 * We asked for two clusters, but only got one in the 1st
3026 * pass. Claim the 2nd cluster as a separate extent.
3027 */
3028 if (alloc > len) {
3029 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3030 &len);
3031 if (ret) {
3032 mlog_errno(ret);
3033 goto out_commit;
3034 }
3035 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3036
3037 ret = ocfs2_insert_extent(handle, &et, 1,
3038 blkno, len, 0, NULL);
3039 if (ret) {
3040 mlog_errno(ret);
3041 goto out_commit;
3042 }
3043 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
3044 }
3045
3046 *first_block_bh = dirdata_bh;
3047 dirdata_bh = NULL;
3048 if (ocfs2_supports_indexed_dirs(osb)) {
3049 unsigned int off;
3050
3051 if (!dx_inline) {
3052 /*
3053 * We need to return the correct block within the
3054 * cluster which should hold our entry.
3055 */
3056 off = ocfs2_dx_dir_hash_idx(osb,
3057 &lookup->dl_hinfo);
3058 get_bh(dx_leaves[off]);
3059 lookup->dl_dx_leaf_bh = dx_leaves[off];
3060 }
3061 lookup->dl_dx_root_bh = dx_root_bh;
3062 dx_root_bh = NULL;
3063 }
3064
3065 out_commit:
3066 if (ret < 0 && did_quota)
3067 dquot_free_space_nodirty(dir, bytes_allocated);
3068
3069 ocfs2_commit_trans(osb, handle);
3070
3071 out:
3072 up_write(&oi->ip_alloc_sem);
3073 if (data_ac)
3074 ocfs2_free_alloc_context(data_ac);
3075 if (meta_ac)
3076 ocfs2_free_alloc_context(meta_ac);
3077
3078 if (dx_leaves) {
3079 for (i = 0; i < num_dx_leaves; i++)
3080 brelse(dx_leaves[i]);
3081 kfree(dx_leaves);
3082 }
3083
3084 brelse(dirdata_bh);
3085 brelse(dx_root_bh);
3086
3087 return ret;
3088 }
3089
3090 /* returns a bh of the 1st new block in the allocation. */
ocfs2_do_extend_dir(struct super_block * sb,handle_t * handle,struct inode * dir,struct buffer_head * parent_fe_bh,struct ocfs2_alloc_context * data_ac,struct ocfs2_alloc_context * meta_ac,struct buffer_head ** new_bh)3091 static int ocfs2_do_extend_dir(struct super_block *sb,
3092 handle_t *handle,
3093 struct inode *dir,
3094 struct buffer_head *parent_fe_bh,
3095 struct ocfs2_alloc_context *data_ac,
3096 struct ocfs2_alloc_context *meta_ac,
3097 struct buffer_head **new_bh)
3098 {
3099 int status;
3100 int extend, did_quota = 0;
3101 u64 p_blkno, v_blkno;
3102
3103 spin_lock(&OCFS2_I(dir)->ip_lock);
3104 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
3105 spin_unlock(&OCFS2_I(dir)->ip_lock);
3106
3107 if (extend) {
3108 u32 offset = OCFS2_I(dir)->ip_clusters;
3109
3110 status = dquot_alloc_space_nodirty(dir,
3111 ocfs2_clusters_to_bytes(sb, 1));
3112 if (status)
3113 goto bail;
3114 did_quota = 1;
3115
3116 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
3117 1, 0, parent_fe_bh, handle,
3118 data_ac, meta_ac, NULL);
3119 BUG_ON(status == -EAGAIN);
3120 if (status < 0) {
3121 mlog_errno(status);
3122 goto bail;
3123 }
3124 }
3125
3126 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
3127 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
3128 if (status < 0) {
3129 mlog_errno(status);
3130 goto bail;
3131 }
3132
3133 *new_bh = sb_getblk(sb, p_blkno);
3134 if (!*new_bh) {
3135 status = -ENOMEM;
3136 mlog_errno(status);
3137 goto bail;
3138 }
3139 status = 0;
3140 bail:
3141 if (did_quota && status < 0)
3142 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3143 return status;
3144 }
3145
/*
 * Grow 'dir' by one directory block and return that block in *new_de_bh.
 *
 * Assumes you already have a cluster lock on the directory.
 *
 * 'blocks_wanted' is only used if we have an inline directory which
 * is to be turned into an extent based one. The size of the dirent to
 * insert might be larger than the space gained by growing to just one
 * block, so we may have to grow the inode by two blocks in that case.
 *
 * If the directory is already indexed, dx_root_bh must be provided
 * (it is taken from lookup->dl_dx_root_bh).
 *
 * 'parent_fe_bh' is the buffer holding the directory's dinode.
 *
 * Returns 0 on success, in which case *new_de_bh holds its own reference
 * on the returned block. On failure a negative error code is returned and
 * *new_de_bh is not written.
 */
static int ocfs2_extend_dir(struct ocfs2_super *osb,
			    struct inode *dir,
			    struct buffer_head *parent_fe_bh,
			    unsigned int blocks_wanted,
			    struct ocfs2_dir_lookup_result *lookup,
			    struct buffer_head **new_de_bh)
{
	int status = 0;
	int credits, num_free_extents, drop_alloc_sem = 0;
	loff_t dir_i_size;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
	struct ocfs2_extent_list *el = &fe->id2.i_list;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	handle_t *handle = NULL;
	struct buffer_head *new_bh = NULL;
	struct ocfs2_dir_entry * de;
	struct super_block *sb = osb->sb;
	struct ocfs2_extent_tree et;
	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;

	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * This would be a code error as an inline directory should
		 * never have an index root.
		 */
		BUG_ON(dx_root_bh);

		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
						 blocks_wanted, lookup,
						 &new_bh);
		if (status) {
			mlog_errno(status);
			goto bail;
		}

		/* Expansion from inline to an indexed directory will
		 * have given us this. */
		dx_root_bh = lookup->dl_dx_root_bh;

		if (blocks_wanted == 1) {
			/*
			 * If the new dirent will fit inside the space
			 * created by pushing out to one block, then
			 * we can complete the operation
			 * here. Otherwise we have to expand i_size
			 * and format the 2nd block below.
			 */
			BUG_ON(new_bh == NULL);
			goto bail_bh;
		}

		/*
		 * Get rid of 'new_bh' - we want to format the 2nd
		 * data block and return that instead.
		 */
		brelse(new_bh);
		new_bh = NULL;

		/*
		 * No allocators are reserved on this path (data_ac and
		 * meta_ac stay NULL), so only the simple credit count is
		 * requested before joining the common code at do_extend.
		 */
		down_write(&OCFS2_I(dir)->ip_alloc_sem);
		drop_alloc_sem = 1;
		dir_i_size = i_size_read(dir);
		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
		goto do_extend;
	}

	down_write(&OCFS2_I(dir)->ip_alloc_sem);
	drop_alloc_sem = 1;
	dir_i_size = i_size_read(dir);
	trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
			       dir_i_size);

	/* dir->i_size is always block aligned. */
	spin_lock(&OCFS2_I(dir)->ip_lock);
	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
		/*
		 * i_size covers every allocated cluster, so a new cluster
		 * has to be reserved before the directory can grow.
		 */
		spin_unlock(&OCFS2_I(dir)->ip_lock);
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
					      parent_fe_bh);
		num_free_extents = ocfs2_num_free_extents(&et);
		if (num_free_extents < 0) {
			status = num_free_extents;
			mlog_errno(status);
			goto bail;
		}

		/* Metadata is only reserved when no free extent record is left. */
		if (!num_free_extents) {
			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
			if (status < 0) {
				if (status != -ENOSPC)
					mlog_errno(status);
				goto bail;
			}
		}

		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}

		if (ocfs2_dir_resv_allowed(osb))
			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;

		credits = ocfs2_calc_extend_credits(sb, el);
	} else {
		/* Allocated space beyond i_size already exists; no reservation. */
		spin_unlock(&OCFS2_I(dir)->ip_lock);
		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
	}

do_extend:
	if (ocfs2_dir_indexed(dir))
		credits++; /* For attaching the new dirent block to the
			    * dx_root */

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		/* Clear it so the cleanup code does not try to commit. */
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
				     data_ac, meta_ac, &new_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);

	status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	memset(new_bh->b_data, 0, sb->s_blocksize);

	/*
	 * Format the zeroed block as one unused dirent (inode == 0) whose
	 * rec_len spans the block, or stops at the trailer offset when
	 * this directory uses block trailers.
	 */
	de = (struct ocfs2_dir_entry *) new_bh->b_data;
	de->inode = 0;
	if (ocfs2_supports_dir_trailer(dir)) {
		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));

		ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));

		if (ocfs2_dir_indexed(dir)) {
			status = ocfs2_dx_dir_link_trailer(dir, handle,
							   dx_root_bh, new_bh);
			if (status) {
				mlog_errno(status);
				goto bail;
			}
		}
	} else {
		de->rec_len = cpu_to_le16(sb->s_blocksize);
	}
	ocfs2_update_inode_fsync_trans(handle, dir, 1);
	ocfs2_journal_dirty(handle, new_bh);

	/* The directory is now one block longer; record that in the inode. */
	dir_i_size += dir->i_sb->s_blocksize;
	i_size_write(dir, dir_i_size);
	dir->i_blocks = ocfs2_inode_sector_count(dir);
	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail_bh:
	/*
	 * Success: give the caller its own reference. The local reference
	 * held in new_bh is dropped by the brelse() below.
	 */
	*new_de_bh = new_bh;
	get_bh(*new_de_bh);
bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);
	if (drop_alloc_sem)
		up_write(&OCFS2_I(dir)->ip_alloc_sem);

	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	brelse(new_bh);

	return status;
}
3344
ocfs2_find_dir_space_id(struct inode * dir,struct buffer_head * di_bh,const char * name,int namelen,struct buffer_head ** ret_de_bh,unsigned int * blocks_wanted)3345 static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
3346 const char *name, int namelen,
3347 struct buffer_head **ret_de_bh,
3348 unsigned int *blocks_wanted)
3349 {
3350 int ret;
3351 struct super_block *sb = dir->i_sb;
3352 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3353 struct ocfs2_dir_entry *de, *last_de = NULL;
3354 char *first_de, *de_buf, *limit;
3355 unsigned long offset = 0;
3356 unsigned int rec_len, new_rec_len, free_space;
3357
3358 /*
3359 * This calculates how many free bytes we'd have in block zero, should
3360 * this function force expansion to an extent tree.
3361 */
3362 if (ocfs2_new_dir_wants_trailer(dir))
3363 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
3364 else
3365 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
3366
3367 first_de = di->id2.i_data.id_data;
3368 de_buf = first_de;
3369 limit = de_buf + i_size_read(dir);
3370 rec_len = OCFS2_DIR_REC_LEN(namelen);
3371
3372 while (de_buf < limit) {
3373 de = (struct ocfs2_dir_entry *)de_buf;
3374
3375 if (!ocfs2_check_dir_entry(dir, de, di_bh, first_de,
3376 i_size_read(dir), offset)) {
3377 ret = -ENOENT;
3378 goto out;
3379 }
3380 if (ocfs2_match(namelen, name, de)) {
3381 ret = -EEXIST;
3382 goto out;
3383 }
3384 /*
3385 * No need to check for a trailing dirent record here as
3386 * they're not used for inline dirs.
3387 */
3388
3389 if (ocfs2_dirent_would_fit(de, rec_len)) {
3390 /* Ok, we found a spot. Return this bh and let
3391 * the caller actually fill it in. */
3392 *ret_de_bh = di_bh;
3393 get_bh(*ret_de_bh);
3394 ret = 0;
3395 goto out;
3396 }
3397
3398 last_de = de;
3399 de_buf += le16_to_cpu(de->rec_len);
3400 offset += le16_to_cpu(de->rec_len);
3401 }
3402
3403 /*
3404 * We're going to require expansion of the directory - figure
3405 * out how many blocks we'll need so that a place for the
3406 * dirent can be found.
3407 */
3408 *blocks_wanted = 1;
3409 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
3410 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
3411 *blocks_wanted = 2;
3412
3413 ret = -ENOSPC;
3414 out:
3415 return ret;
3416 }
3417
ocfs2_find_dir_space_el(struct inode * dir,const char * name,int namelen,struct buffer_head ** ret_de_bh)3418 static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3419 int namelen, struct buffer_head **ret_de_bh)
3420 {
3421 unsigned long offset;
3422 struct buffer_head *bh = NULL;
3423 unsigned short rec_len;
3424 struct ocfs2_dir_entry *de;
3425 struct super_block *sb = dir->i_sb;
3426 int status;
3427 int blocksize = dir->i_sb->s_blocksize;
3428
3429 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
3430 if (status)
3431 goto bail;
3432
3433 rec_len = OCFS2_DIR_REC_LEN(namelen);
3434 offset = 0;
3435 de = (struct ocfs2_dir_entry *) bh->b_data;
3436 while (1) {
3437 if ((char *)de >= sb->s_blocksize + bh->b_data) {
3438 brelse(bh);
3439 bh = NULL;
3440
3441 if (i_size_read(dir) <= offset) {
3442 /*
3443 * Caller will have to expand this
3444 * directory.
3445 */
3446 status = -ENOSPC;
3447 goto bail;
3448 }
3449 status = ocfs2_read_dir_block(dir,
3450 offset >> sb->s_blocksize_bits,
3451 &bh, 0);
3452 if (status)
3453 goto bail;
3454
3455 /* move to next block */
3456 de = (struct ocfs2_dir_entry *) bh->b_data;
3457 }
3458 if (!ocfs2_check_dir_entry(dir, de, bh, bh->b_data, blocksize,
3459 offset)) {
3460 status = -ENOENT;
3461 goto bail;
3462 }
3463 if (ocfs2_match(namelen, name, de)) {
3464 status = -EEXIST;
3465 goto bail;
3466 }
3467
3468 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
3469 blocksize))
3470 goto next;
3471
3472 if (ocfs2_dirent_would_fit(de, rec_len)) {
3473 /* Ok, we found a spot. Return this bh and let
3474 * the caller actually fill it in. */
3475 *ret_de_bh = bh;
3476 get_bh(*ret_de_bh);
3477 status = 0;
3478 goto bail;
3479 }
3480 next:
3481 offset += le16_to_cpu(de->rec_len);
3482 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
3483 }
3484
3485 bail:
3486 brelse(bh);
3487 if (status)
3488 mlog_errno(status);
3489
3490 return status;
3491 }
3492
dx_leaf_sort_cmp(const void * a,const void * b)3493 static int dx_leaf_sort_cmp(const void *a, const void *b)
3494 {
3495 const struct ocfs2_dx_entry *entry1 = a;
3496 const struct ocfs2_dx_entry *entry2 = b;
3497 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3498 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3499 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3500 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3501
3502 if (major_hash1 > major_hash2)
3503 return 1;
3504 if (major_hash1 < major_hash2)
3505 return -1;
3506
3507 /*
3508 * It is not strictly necessary to sort by minor
3509 */
3510 if (minor_hash1 > minor_hash2)
3511 return 1;
3512 if (minor_hash1 < minor_hash2)
3513 return -1;
3514 return 0;
3515 }
3516
dx_leaf_sort_swap(void * a,void * b,int size)3517 static void dx_leaf_sort_swap(void *a, void *b, int size)
3518 {
3519 struct ocfs2_dx_entry *entry1 = a;
3520 struct ocfs2_dx_entry *entry2 = b;
3521
3522 BUG_ON(size != sizeof(*entry1));
3523
3524 swap(*entry1, *entry2);
3525 }
3526
ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf * dx_leaf)3527 static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3528 {
3529 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3530 int i, num = le16_to_cpu(dl_list->de_num_used);
3531
3532 for (i = 0; i < (num - 1); i++) {
3533 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3534 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3535 return 0;
3536 }
3537
3538 return 1;
3539 }
3540
3541 /*
3542 * Find the optimal value to split this leaf on. This expects the leaf
3543 * entries to be in sorted order.
3544 *
3545 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3546 * the hash we want to insert.
3547 *
3548 * This function is only concerned with the major hash - that which
3549 * determines which cluster an item belongs to.
3550 */
ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf * dx_leaf,u32 leaf_cpos,u32 insert_hash,u32 * split_hash)3551 static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3552 u32 leaf_cpos, u32 insert_hash,
3553 u32 *split_hash)
3554 {
3555 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3556 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3557 int allsame;
3558
3559 /*
3560 * There's a couple rare, but nasty corner cases we have to
3561 * check for here. All of them involve a leaf where all value
3562 * have the same hash, which is what we look for first.
3563 *
3564 * Most of the time, all of the above is false, and we simply
3565 * pick the median value for a split.
3566 */
3567 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3568 if (allsame) {
3569 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3570
3571 if (val == insert_hash) {
3572 /*
3573 * No matter where we would choose to split,
3574 * the new entry would want to occupy the same
3575 * block as these. Since there's no space left
3576 * in their existing block, we know there
3577 * won't be space after the split.
3578 */
3579 return -ENOSPC;
3580 }
3581
3582 if (val == leaf_cpos) {
3583 /*
3584 * Because val is the same as leaf_cpos (which
3585 * is the smallest value this leaf can have),
3586 * yet is not equal to insert_hash, then we
3587 * know that insert_hash *must* be larger than
3588 * val (and leaf_cpos). At least cpos+1 in value.
3589 *
3590 * We also know then, that there cannot be an
3591 * adjacent extent (otherwise we'd be looking
3592 * at it). Choosing this value gives us a
3593 * chance to get some contiguousness.
3594 */
3595 *split_hash = leaf_cpos + 1;
3596 return 0;
3597 }
3598
3599 if (val > insert_hash) {
3600 /*
3601 * val can not be the same as insert hash, and
3602 * also must be larger than leaf_cpos. Also,
3603 * we know that there can't be a leaf between
3604 * cpos and val, otherwise the entries with
3605 * hash 'val' would be there.
3606 */
3607 *split_hash = val;
3608 return 0;
3609 }
3610
3611 *split_hash = insert_hash;
3612 return 0;
3613 }
3614
3615 /*
3616 * Since the records are sorted and the checks above
3617 * guaranteed that not all records in this block are the same,
3618 * we simple travel forward, from the median, and pick the 1st
3619 * record whose value is larger than leaf_cpos.
3620 */
3621 for (i = (num_used / 2); i < num_used; i++)
3622 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3623 leaf_cpos)
3624 break;
3625
3626 BUG_ON(i == num_used); /* Should be impossible */
3627 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3628 return 0;
3629 }
3630
3631 /*
3632 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3633 * larger than split_hash into new_dx_leaves. We use a temporary
3634 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3635 *
3636 * Since the block offset inside a leaf (cluster) is a constant mask
3637 * of minor_hash, we can optimize - an item at block offset X within
3638 * the original cluster, will be at offset X within the new cluster.
3639 */
ocfs2_dx_dir_transfer_leaf(struct inode * dir,u32 split_hash,handle_t * handle,struct ocfs2_dx_leaf * tmp_dx_leaf,struct buffer_head ** orig_dx_leaves,struct buffer_head ** new_dx_leaves,int num_dx_leaves)3640 static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3641 handle_t *handle,
3642 struct ocfs2_dx_leaf *tmp_dx_leaf,
3643 struct buffer_head **orig_dx_leaves,
3644 struct buffer_head **new_dx_leaves,
3645 int num_dx_leaves)
3646 {
3647 int i, j, num_used;
3648 u32 major_hash;
3649 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3650 struct ocfs2_dx_entry_list *orig_list, *tmp_list;
3651 struct ocfs2_dx_entry *dx_entry;
3652
3653 tmp_list = &tmp_dx_leaf->dl_list;
3654
3655 for (i = 0; i < num_dx_leaves; i++) {
3656 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3657 orig_list = &orig_dx_leaf->dl_list;
3658 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3659
3660 num_used = le16_to_cpu(orig_list->de_num_used);
3661
3662 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3663 tmp_list->de_num_used = cpu_to_le16(0);
3664 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3665
3666 for (j = 0; j < num_used; j++) {
3667 dx_entry = &orig_list->de_entries[j];
3668 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3669 if (major_hash >= split_hash)
3670 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3671 dx_entry);
3672 else
3673 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3674 dx_entry);
3675 }
3676 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3677
3678 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3679 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3680 }
3681 }
3682
ocfs2_dx_dir_rebalance_credits(struct ocfs2_super * osb,struct ocfs2_dx_root_block * dx_root)3683 static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3684 struct ocfs2_dx_root_block *dx_root)
3685 {
3686 int credits = ocfs2_clusters_to_blocks(osb->sb, 3);
3687
3688 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
3689 credits += ocfs2_quota_trans_credits(osb->sb);
3690 return credits;
3691 }
3692
/*
 * Find the median value in dx_leaf_bh and allocate a new leaf to move
 * half our entries into.
 *
 * @osb:        the ocfs2 super block info
 * @dir:        the indexed directory
 * @dx_root_bh: buffer holding the directory's dx_root block
 * @dx_leaf_bh: the full leaf block that triggered the rebalance
 * @hinfo:      hash of the name about to be inserted; only major_hash is
 *              used here
 * @leaf_cpos:  cpos of the leaf cluster being split
 * @leaf_blkno: block number of dx_leaf_bh
 *
 * Returns 0 on success. -ENOSPC is returned when dr_clusters has reached
 * UINT_MAX, when no split point exists, or when it is propagated from a
 * callee. -EIO is returned if the leaf is not actually full, -ENOMEM on
 * allocation failure, and other negative codes come from callees.
 */
static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
				  struct buffer_head *dx_root_bh,
				  struct buffer_head *dx_leaf_bh,
				  struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
				  u64 leaf_blkno)
{
	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
	int credits, ret, i, num_used, did_quota = 0;
	u32 cpos, split_hash, insert_hash = hinfo->major_hash;
	u64 orig_leaves_start;
	int num_dx_leaves;
	struct buffer_head **orig_dx_leaves = NULL;
	struct buffer_head **new_dx_leaves = NULL;
	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	struct ocfs2_extent_tree et;
	handle_t *handle = NULL;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;

	trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
				     (unsigned long long)leaf_blkno,
				     insert_hash);

	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);

	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
	/*
	 * XXX: This is a rather large limit. We should use a more
	 * realistic value.
	 */
	/* Nothing has been allocated yet, so a plain return is safe here. */
	if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
		return -ENOSPC;

	/*
	 * A rebalance only makes sense for a full leaf; anything with
	 * fewer used entries than de_count is reported as an error.
	 */
	num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
	if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
		mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
		     "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
		     (unsigned long long)leaf_blkno, num_used);
		ret = -EIO;
		goto out;
	}

	/* This call also fills in num_dx_leaves, used for both arrays. */
	orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
	if (!orig_dx_leaves) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
	if (!new_dx_leaves) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
	if (ret) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/* Charge quota for the one cluster we are about to add. */
	ret = dquot_alloc_space_nodirty(dir,
				       ocfs2_clusters_to_bytes(dir->i_sb, 1));
	if (ret)
		goto out_commit;
	did_quota = 1;

	ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * This block is changing anyway, so we can sort it in place.
	 */
	sort(dx_leaf->dl_list.de_entries, num_used,
	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
	     dx_leaf_sort_swap);

	ocfs2_journal_dirty(handle, dx_leaf_bh);

	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
					   &split_hash);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);

	/*
	 * We have to carefully order operations here. There are items
	 * which want to be in the new cluster before insert, but in
	 * order to put those items in the new cluster, we alter the
	 * old cluster. A failure to insert gets nasty.
	 *
	 * So, start by reserving writes to the old
	 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
	 * the new cluster for us, before inserting it. The insert
	 * won't happen if there's an error before that. Once the
	 * insert is done then, we can transfer from one leaf into the
	 * other without fear of hitting any error.
	 */

	/*
	 * The leaf transfer wants some scratch space so that we don't
	 * wind up doing a bunch of expensive memmove().
	 */
	tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
	if (!tmp_dx_leaf) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out_commit;
	}

	/* Read every leaf block of the cluster that contains leaf_blkno. */
	orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
	ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
				   orig_dx_leaves);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* The new cluster is inserted at the split hash. */
	cpos = split_hash;
	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
				       data_ac, meta_ac, new_dx_leaves,
				       num_dx_leaves);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	for (i = 0; i < num_dx_leaves; i++) {
		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      orig_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      new_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
				   orig_dx_leaves, new_dx_leaves, num_dx_leaves);

out_commit:
	/* Give the quota charge back if anything after it failed. */
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(dir,
				ocfs2_clusters_to_bytes(dir->i_sb, 1));

	/* Runs on both the success and the error path. */
	ocfs2_update_inode_fsync_trans(handle, dir, 1);
	ocfs2_commit_trans(osb, handle);

out:
	/*
	 * Either array may still be NULL here, depending on how far we
	 * got. When both are NULL the loop is skipped entirely, so
	 * num_dx_leaves is only read after it has been set.
	 */
	if (orig_dx_leaves || new_dx_leaves) {
		for (i = 0; i < num_dx_leaves; i++) {
			if (orig_dx_leaves)
				brelse(orig_dx_leaves[i]);
			if (new_dx_leaves)
				brelse(new_dx_leaves[i]);
		}
		kfree(orig_dx_leaves);
		kfree(new_dx_leaves);
	}

	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

	kfree(tmp_dx_leaf);
	return ret;
}
3891
ocfs2_find_dir_space_dx(struct ocfs2_super * osb,struct inode * dir,struct buffer_head * di_bh,struct buffer_head * dx_root_bh,const char * name,int namelen,struct ocfs2_dir_lookup_result * lookup)3892 static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
3893 struct buffer_head *di_bh,
3894 struct buffer_head *dx_root_bh,
3895 const char *name, int namelen,
3896 struct ocfs2_dir_lookup_result *lookup)
3897 {
3898 int ret, rebalanced = 0;
3899 struct ocfs2_dx_root_block *dx_root;
3900 struct buffer_head *dx_leaf_bh = NULL;
3901 struct ocfs2_dx_leaf *dx_leaf;
3902 u64 blkno;
3903 u32 leaf_cpos;
3904
3905 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3906
3907 restart_search:
3908 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
3909 &leaf_cpos, &blkno);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out;
3913 }
3914
3915 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
3916 if (ret) {
3917 mlog_errno(ret);
3918 goto out;
3919 }
3920
3921 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3922
3923 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
3924 le16_to_cpu(dx_leaf->dl_list.de_count)) {
3925 if (rebalanced) {
3926 /*
3927 * Rebalancing should have provided us with
3928 * space in an appropriate leaf.
3929 *
3930 * XXX: Is this an abnormal condition then?
3931 * Should we print a message here?
3932 */
3933 ret = -ENOSPC;
3934 goto out;
3935 }
3936
3937 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
3938 &lookup->dl_hinfo, leaf_cpos,
3939 blkno);
3940 if (ret) {
3941 if (ret != -ENOSPC)
3942 mlog_errno(ret);
3943 goto out;
3944 }
3945
3946 /*
3947 * Restart the lookup. The rebalance might have
3948 * changed which block our item fits into. Mark our
3949 * progress, so we only execute this once.
3950 */
3951 brelse(dx_leaf_bh);
3952 dx_leaf_bh = NULL;
3953 rebalanced = 1;
3954 goto restart_search;
3955 }
3956
3957 lookup->dl_dx_leaf_bh = dx_leaf_bh;
3958 dx_leaf_bh = NULL;
3959
3960 out:
3961 brelse(dx_leaf_bh);
3962 return ret;
3963 }
3964
ocfs2_search_dx_free_list(struct inode * dir,struct buffer_head * dx_root_bh,int namelen,struct ocfs2_dir_lookup_result * lookup)3965 static int ocfs2_search_dx_free_list(struct inode *dir,
3966 struct buffer_head *dx_root_bh,
3967 int namelen,
3968 struct ocfs2_dir_lookup_result *lookup)
3969 {
3970 int ret = -ENOSPC;
3971 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
3972 struct ocfs2_dir_block_trailer *db;
3973 u64 next_block;
3974 int rec_len = OCFS2_DIR_REC_LEN(namelen);
3975 struct ocfs2_dx_root_block *dx_root;
3976
3977 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3978 next_block = le64_to_cpu(dx_root->dr_free_blk);
3979
3980 while (next_block) {
3981 brelse(prev_leaf_bh);
3982 prev_leaf_bh = leaf_bh;
3983 leaf_bh = NULL;
3984
3985 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
3986 if (ret) {
3987 mlog_errno(ret);
3988 goto out;
3989 }
3990
3991 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
3992 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
3993 lookup->dl_leaf_bh = leaf_bh;
3994 lookup->dl_prev_leaf_bh = prev_leaf_bh;
3995 leaf_bh = NULL;
3996 prev_leaf_bh = NULL;
3997 break;
3998 }
3999
4000 next_block = le64_to_cpu(db->db_free_next);
4001 }
4002
4003 if (!next_block)
4004 ret = -ENOSPC;
4005
4006 out:
4007
4008 brelse(leaf_bh);
4009 brelse(prev_leaf_bh);
4010 return ret;
4011 }
4012
/*
 * Convert a dx_root that stores its index entries inline (dr_entries)
 * into one that references a newly allocated cluster of dx leaf blocks
 * through its extent list (dr_list).
 *
 * @dir:        the indexed directory
 * @dx_root_bh: buffer holding the directory's dx_root block
 *
 * The existing entries are distributed over the new leaf blocks by their
 * minor hash, OCFS2_DX_FLAG_INLINE is cleared and the new cluster is
 * inserted at cpos 0 of the dx_root extent tree.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_expand_inline_dx_root(struct inode *dir,
				       struct buffer_head *dx_root_bh)
{
	int ret, num_dx_leaves, i, j, did_quota = 0;
	struct buffer_head **dx_leaves = NULL;
	struct ocfs2_extent_tree et;
	u64 insert_blkno;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	handle_t *handle = NULL;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_entry_list *entry_list;
	struct ocfs2_dx_entry *dx_entry;
	struct ocfs2_dx_leaf *target_leaf;

	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
	if (!dx_leaves) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		/* No transaction was started, so skip the commit at out_commit. */
		goto out;
	}

	/* Charge quota for the one cluster that is about to be allocated. */
	ret = dquot_alloc_space_nodirty(dir,
				       ocfs2_clusters_to_bytes(osb->sb, 1));
	if (ret)
		goto out_commit;
	did_quota = 1;

	/*
	 * We do this up front, before the allocation, so that a
	 * failure to add the dx_root_bh to the journal won't result
	 * us losing clusters.
	 */
	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
					 num_dx_leaves, &insert_blkno);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Transfer the entries from our dx_root into the appropriate
	 * block
	 */
	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
	entry_list = &dx_root->dr_entries;

	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
		dx_entry = &entry_list->de_entries[i];

		/* The minor hash selects which leaf block gets the entry. */
		j = __ocfs2_dx_dir_hash_idx(osb,
					    le32_to_cpu(dx_entry->dx_minor_hash));
		target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;

		ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);

		/* Each leaf has been passed to the journal already
		 * via __ocfs2_dx_dir_new_cluster() */
	}

	/*
	 * The entries have been copied out, so switch the root to extent
	 * mode: clear the inline flag, wipe everything from dr_list to the
	 * end of the block and set up an empty extent list.
	 * NOTE(review): presumably dr_list overlays dr_entries, which is why
	 * the wipe happens only after the copy loop - confirm against the
	 * on-disk structure definition.
	 */
	dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
	memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
	       offsetof(struct ocfs2_dx_root_block, dr_list));
	dx_root->dr_list.l_count =
		cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));

	/* This should never fail considering we start with an empty
	 * dx_root. */
	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
	ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
	if (ret)
		mlog_errno(ret);
	/*
	 * From here on the quota charge is kept, whatever 'ret' is.
	 * NOTE(review): that includes a failed insert above - confirm that
	 * this is intended.
	 */
	did_quota = 0;

	ocfs2_update_inode_fsync_trans(handle, dir, 1);
	ocfs2_journal_dirty(handle, dx_root_bh);

out_commit:
	/* Give the quota charge back if we failed before the extent insert. */
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(dir,
					  ocfs2_clusters_to_bytes(dir->i_sb, 1));

	ocfs2_commit_trans(osb, handle);

out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

	if (dx_leaves) {
		for (i = 0; i < num_dx_leaves; i++)
			brelse(dx_leaves[i]);
		kfree(dx_leaves);
	}
	return ret;
}
4128
ocfs2_inline_dx_has_space(struct buffer_head * dx_root_bh)4129 static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4130 {
4131 struct ocfs2_dx_root_block *dx_root;
4132 struct ocfs2_dx_entry_list *entry_list;
4133
4134 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4135 entry_list = &dx_root->dr_entries;
4136
4137 if (le16_to_cpu(entry_list->de_num_used) >=
4138 le16_to_cpu(entry_list->de_count))
4139 return -ENOSPC;
4140
4141 return 0;
4142 }
4143
ocfs2_prepare_dx_dir_for_insert(struct inode * dir,struct buffer_head * di_bh,const char * name,int namelen,struct ocfs2_dir_lookup_result * lookup)4144 static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4145 struct buffer_head *di_bh,
4146 const char *name,
4147 int namelen,
4148 struct ocfs2_dir_lookup_result *lookup)
4149 {
4150 int ret, free_dx_root = 1;
4151 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4152 struct buffer_head *dx_root_bh = NULL;
4153 struct buffer_head *leaf_bh = NULL;
4154 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4155 struct ocfs2_dx_root_block *dx_root;
4156
4157 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4158 if (ret) {
4159 mlog_errno(ret);
4160 goto out;
4161 }
4162
4163 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4164 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4165 ret = -ENOSPC;
4166 mlog_errno(ret);
4167 goto out;
4168 }
4169
4170 if (ocfs2_dx_root_inline(dx_root)) {
4171 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4172
4173 if (ret == 0)
4174 goto search_el;
4175
4176 /*
4177 * We ran out of room in the root block. Expand it to
4178 * an extent, then allow ocfs2_find_dir_space_dx to do
4179 * the rest.
4180 */
4181 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4182 if (ret) {
4183 mlog_errno(ret);
4184 goto out;
4185 }
4186 }
4187
4188 /*
4189 * Insert preparation for an indexed directory is split into two
4190 * steps. The call to find_dir_space_dx reserves room in the index for
4191 * an additional item. If we run out of space there, it's a real error
4192 * we can't continue on.
4193 */
4194 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4195 namelen, lookup);
4196 if (ret) {
4197 mlog_errno(ret);
4198 goto out;
4199 }
4200
4201 search_el:
4202 /*
4203 * Next, we need to find space in the unindexed tree. This call
4204 * searches using the free space linked list. If the unindexed tree
4205 * lacks sufficient space, we'll expand it below. The expansion code
4206 * is smart enough to add any new blocks to the free space list.
4207 */
4208 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4209 if (ret && ret != -ENOSPC) {
4210 mlog_errno(ret);
4211 goto out;
4212 }
4213
4214 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4215 lookup->dl_dx_root_bh = dx_root_bh;
4216 free_dx_root = 0;
4217
4218 if (ret == -ENOSPC) {
4219 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4220
4221 if (ret) {
4222 mlog_errno(ret);
4223 goto out;
4224 }
4225
4226 /*
4227 * We make the assumption here that new leaf blocks are added
4228 * to the front of our free list.
4229 */
4230 lookup->dl_prev_leaf_bh = NULL;
4231 lookup->dl_leaf_bh = leaf_bh;
4232 }
4233
4234 out:
4235 if (free_dx_root)
4236 brelse(dx_root_bh);
4237 return ret;
4238 }
4239
4240 /*
4241 * Get a directory ready for insert. Any directory allocation required
4242 * happens here. Success returns zero, and enough context in the dir
4243 * lookup result that ocfs2_add_entry() will be able complete the task
4244 * with minimal performance impact.
4245 */
ocfs2_prepare_dir_for_insert(struct ocfs2_super * osb,struct inode * dir,struct buffer_head * parent_fe_bh,const char * name,int namelen,struct ocfs2_dir_lookup_result * lookup)4246 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
4247 struct inode *dir,
4248 struct buffer_head *parent_fe_bh,
4249 const char *name,
4250 int namelen,
4251 struct ocfs2_dir_lookup_result *lookup)
4252 {
4253 int ret;
4254 unsigned int blocks_wanted = 1;
4255 struct buffer_head *bh = NULL;
4256
4257 trace_ocfs2_prepare_dir_for_insert(
4258 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
4259
4260 if (!namelen) {
4261 ret = -EINVAL;
4262 mlog_errno(ret);
4263 goto out;
4264 }
4265
4266 /*
4267 * Do this up front to reduce confusion.
4268 *
4269 * The directory might start inline, then be turned into an
4270 * indexed one, in which case we'd need to hash deep inside
4271 * ocfs2_find_dir_space_id(). Since
4272 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
4273 * done, there seems no point in spreading out the calls. We
4274 * can optimize away the case where the file system doesn't
4275 * support indexing.
4276 */
4277 if (ocfs2_supports_indexed_dirs(osb))
4278 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
4279
4280 if (ocfs2_dir_indexed(dir)) {
4281 ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
4282 name, namelen, lookup);
4283 if (ret)
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4289 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
4290 namelen, &bh, &blocks_wanted);
4291 } else
4292 ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
4293
4294 if (ret && ret != -ENOSPC) {
4295 mlog_errno(ret);
4296 goto out;
4297 }
4298
4299 if (ret == -ENOSPC) {
4300 /*
4301 * We have to expand the directory to add this name.
4302 */
4303 BUG_ON(bh);
4304
4305 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
4306 lookup, &bh);
4307 if (ret) {
4308 if (ret != -ENOSPC)
4309 mlog_errno(ret);
4310 goto out;
4311 }
4312
4313 BUG_ON(!bh);
4314 }
4315
4316 lookup->dl_leaf_bh = bh;
4317 bh = NULL;
4318 out:
4319 brelse(bh);
4320 return ret;
4321 }
4322
ocfs2_dx_dir_remove_index(struct inode * dir,struct buffer_head * di_bh,struct buffer_head * dx_root_bh)4323 static int ocfs2_dx_dir_remove_index(struct inode *dir,
4324 struct buffer_head *di_bh,
4325 struct buffer_head *dx_root_bh)
4326 {
4327 int ret;
4328 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4329 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4330 struct ocfs2_dx_root_block *dx_root;
4331 struct inode *dx_alloc_inode = NULL;
4332 struct buffer_head *dx_alloc_bh = NULL;
4333 handle_t *handle;
4334 u64 blk;
4335 u16 bit;
4336 u64 bg_blkno;
4337
4338 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4339
4340 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4341 EXTENT_ALLOC_SYSTEM_INODE,
4342 le16_to_cpu(dx_root->dr_suballoc_slot));
4343 if (!dx_alloc_inode) {
4344 ret = -ENOMEM;
4345 mlog_errno(ret);
4346 goto out;
4347 }
4348 inode_lock(dx_alloc_inode);
4349
4350 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4351 if (ret) {
4352 mlog_errno(ret);
4353 goto out_mutex;
4354 }
4355
4356 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4357 if (IS_ERR(handle)) {
4358 ret = PTR_ERR(handle);
4359 mlog_errno(ret);
4360 goto out_unlock;
4361 }
4362
4363 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4364 OCFS2_JOURNAL_ACCESS_WRITE);
4365 if (ret) {
4366 mlog_errno(ret);
4367 goto out_commit;
4368 }
4369
4370 spin_lock(&OCFS2_I(dir)->ip_lock);
4371 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4372 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4373 spin_unlock(&OCFS2_I(dir)->ip_lock);
4374 di->i_dx_root = cpu_to_le64(0ULL);
4375 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4376
4377 ocfs2_journal_dirty(handle, di_bh);
4378
4379 blk = le64_to_cpu(dx_root->dr_blkno);
4380 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4381 if (dx_root->dr_suballoc_loc)
4382 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4383 else
4384 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4385 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4386 bit, bg_blkno, 1);
4387 if (ret)
4388 mlog_errno(ret);
4389
4390 out_commit:
4391 ocfs2_commit_trans(osb, handle);
4392
4393 out_unlock:
4394 ocfs2_inode_unlock(dx_alloc_inode, 1);
4395
4396 out_mutex:
4397 inode_unlock(dx_alloc_inode);
4398 brelse(dx_alloc_bh);
4399 out:
4400 iput(dx_alloc_inode);
4401 return ret;
4402 }
4403
ocfs2_dx_dir_truncate(struct inode * dir,struct buffer_head * di_bh)4404 int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4405 {
4406 int ret;
4407 unsigned int clen;
4408 u32 major_hash = UINT_MAX, p_cpos, cpos;
4409 u64 blkno;
4410 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4411 struct buffer_head *dx_root_bh = NULL;
4412 struct ocfs2_dx_root_block *dx_root;
4413 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4414 struct ocfs2_cached_dealloc_ctxt dealloc;
4415 struct ocfs2_extent_tree et;
4416
4417 ocfs2_init_dealloc_ctxt(&dealloc);
4418
4419 if (!ocfs2_dir_indexed(dir))
4420 return 0;
4421
4422 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4423 if (ret) {
4424 mlog_errno(ret);
4425 goto out;
4426 }
4427 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4428
4429 if (ocfs2_dx_root_inline(dx_root))
4430 goto remove_index;
4431
4432 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4433
4434 /* XXX: What if dr_clusters is too large? */
4435 while (le32_to_cpu(dx_root->dr_clusters)) {
4436 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4437 major_hash, &cpos, &blkno, &clen);
4438 if (ret) {
4439 mlog_errno(ret);
4440 goto out;
4441 }
4442
4443 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4444
4445 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4446 &dealloc, 0, false);
4447 if (ret) {
4448 mlog_errno(ret);
4449 goto out;
4450 }
4451
4452 if (cpos == 0)
4453 break;
4454
4455 major_hash = cpos - 1;
4456 }
4457
4458 remove_index:
4459 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4460 if (ret) {
4461 mlog_errno(ret);
4462 goto out;
4463 }
4464
4465 ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4466 out:
4467 ocfs2_schedule_truncate_log_flush(osb, 1);
4468 ocfs2_run_deallocs(osb, &dealloc);
4469
4470 brelse(dx_root_bh);
4471 return ret;
4472 }
4473