xref: /openbmc/linux/fs/reiserfs/inode.c (revision 87c2ce3b)
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4 
5 #include <linux/config.h>
6 #include <linux/time.h>
7 #include <linux/fs.h>
8 #include <linux/reiserfs_fs.h>
9 #include <linux/reiserfs_acl.h>
10 #include <linux/reiserfs_xattr.h>
11 #include <linux/smp_lock.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <asm/uaccess.h>
15 #include <asm/unaligned.h>
16 #include <linux/buffer_head.h>
17 #include <linux/mpage.h>
18 #include <linux/writeback.h>
19 #include <linux/quotaops.h>
20 
21 extern int reiserfs_default_io_size;	/* default io size defined in super.c */
22 /* forward declarations: both are called by convert_tail_for_hole() below */
23 static int reiserfs_commit_write(struct file *f, struct page *page,
24 				 unsigned from, unsigned to);
25 static int reiserfs_prepare_write(struct file *f, struct page *page,
26 				  unsigned from, unsigned to);
27 
28 void reiserfs_delete_inode(struct inode *inode)
29 {
30 	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 	int jbegin_count =
32 	    JOURNAL_PER_BALANCE_CNT * 2 +
33 	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
34 	struct reiserfs_transaction_handle th;
35 	int err;
36 
37 	truncate_inode_pages(&inode->i_data, 0);
38 
39 	reiserfs_write_lock(inode->i_sb);
40 
41 	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
42 	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
43 		mutex_lock(&inode->i_mutex);
44 
45 		reiserfs_delete_xattrs(inode);
46 
47 		if (journal_begin(&th, inode->i_sb, jbegin_count)) {
48 			mutex_unlock(&inode->i_mutex);
49 			goto out;
50 		}
51 		reiserfs_update_inode_transaction(inode);
52 
53 		err = reiserfs_delete_object(&th, inode);
54 
55 		/* Do quota update inside a transaction for journaled quotas. We must do that
56 		 * after delete_object so that quota updates go into the same transaction as
57 		 * stat data deletion */
58 		if (!err)
59 			DQUOT_FREE_INODE(inode);
60 
61 		if (journal_end(&th, inode->i_sb, jbegin_count)) {
62 			mutex_unlock(&inode->i_mutex);
63 			goto out;
64 		}
65 
66 		mutex_unlock(&inode->i_mutex);
67 
68 		/* check return value from reiserfs_delete_object after
69 		 * ending the transaction
70 		 */
71 		if (err)
72 		    goto out;
73 
74 		/* all items of file are deleted, so we can remove "save" link */
75 		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
76 								 * about an error here */
77 	} else {
78 		/* no object items are in the tree */
79 		;
80 	}
81       out:
82 	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
83 	inode->i_blocks = 0;
84 	reiserfs_write_unlock(inode->i_sb);
85 }
86 
87 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
88 			  __u32 objectid, loff_t offset, int type, int length)
89 {
90 	key->version = version;
91 
92 	key->on_disk_key.k_dir_id = dirid;
93 	key->on_disk_key.k_objectid = objectid;
94 	set_cpu_key_k_offset(key, offset);
95 	set_cpu_key_k_type(key, type);
96 	key->key_length = length;
97 }
98 
99 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
100    offset and type of key */
101 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
102 		  int type, int length)
103 {
104 	_make_cpu_key(key, get_inode_item_key_version(inode),
105 		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
106 		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
107 		      length);
108 }
109 
110 //
111 // when key is 0, do not set version and short key
112 //
113 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
114 			      int version,
115 			      loff_t offset, int type, int length,
116 			      int entry_count /*or ih_free_space */ )
117 {
118 	if (key) {
119 		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
120 		ih->ih_key.k_objectid =
121 		    cpu_to_le32(key->on_disk_key.k_objectid);
122 	}
123 	put_ih_version(ih, version);
124 	set_le_ih_k_offset(ih, offset);
125 	set_le_ih_k_type(ih, type);
126 	put_ih_item_len(ih, length);
127 	/*    set_ih_free_space (ih, 0); */
128 	// for directory items it is entry count, for directs and stat
129 	// datas - 0xffff, for indirects - 0
130 	put_ih_entry_count(ih, entry_count);
131 }
132 
133 //
134 // FIXME: we might cache recently accessed indirect item
135 
136 // Ugh.  Not too eager for that....
137 //  I cut the code until such time as I see a convincing argument (benchmark).
138 // I don't want a bloated inode struct..., and I don't like code complexity....
139 
140 /* cutting the code is fine, since it really isn't in use yet and is easy
141 ** to add back in.  But, Vladimir has a really good idea here.  Think
142 ** about what happens for reading a file.  For each page,
143 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
144 ** an indirect item.  This indirect item has X number of pointers, where
145 ** X is a big number if we've done the block allocation right.  But,
146 ** we only use one or two of these pointers during each call to readpage,
147 ** needlessly researching again later on.
148 **
149 ** The size of the cache could be dynamic based on the size of the file.
150 **
151 ** I'd also like to see us cache the location the stat data item, since
152 ** we are needlessly researching for that frequently.
153 **
154 ** --chris
155 */
156 
157 /* If this page has a file tail in it, and
158 ** it was read in by get_block_create_0, the page data is valid,
159 ** but tail is still sitting in a direct item, and we can't write to
160 ** it.  So, look through this page, and check all the mapped buffers
161 ** to make sure they have valid block numbers.  Any that don't need
162 ** to be unmapped, so that block_prepare_write will correctly call
163 ** reiserfs_get_block to convert the tail into an unformatted node
164 */
165 static inline void fix_tail_page_for_writing(struct page *page)
166 {
167 	struct buffer_head *head, *next, *bh;
168 
169 	if (page && page_has_buffers(page)) {
170 		head = page_buffers(page);
171 		bh = head;
172 		do {
173 			next = bh->b_this_page;
174 			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
175 				reiserfs_unmap_buffer(bh);
176 			}
177 			bh = next;
178 		} while (bh != head);
179 	}
180 }
181 
182 /* reiserfs_get_block does not need to allocate a block only if it has been
183    done already or non-hole position has been found in the indirect item */
184 static inline int allocation_needed(int retval, b_blocknr_t allocated,
185 				    struct item_head *ih,
186 				    __le32 * item, int pos_in_item)
187 {
188 	if (allocated)
189 		return 0;
190 	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
191 	    get_block_num(item, pos_in_item))
192 		return 0;
193 	return 1;
194 }
195 
196 static inline int indirect_item_found(int retval, struct item_head *ih)
197 {
198 	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
199 }
200 
201 static inline void set_block_dev_mapped(struct buffer_head *bh,
202 					b_blocknr_t block, struct inode *inode)
203 {
204 	map_bh(bh, inode->i_sb, block);
205 }
206 
207 //
208 // files which were created in the earlier version can not be longer,
209 // than 2 gb
210 //
211 static int file_capable(struct inode *inode, long block)
212 {
213 	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is new file.
214 	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is inside of 2gb
215 		return 1;
216 
217 	return 0;
218 }
219 
220 /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
221 				   struct inode *inode, struct path *path)
222 {
223 	struct super_block *s = th->t_super;
224 	int len = th->t_blocks_allocated;
225 	int err;
226 
227 	BUG_ON(!th->t_trans_id);
228 	BUG_ON(!th->t_refcount);
229 
230 	/* we cannot restart while nested */
231 	if (th->t_refcount > 1) {
232 		return 0;
233 	}
234 	pathrelse(path);
235 	reiserfs_update_sd(th, inode);
236 	err = journal_end(th, s, len);
237 	if (!err) {
238 		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
239 		if (!err)
240 			reiserfs_update_inode_transaction(inode);
241 	}
242 	return err;
243 }
244 
245 // it is called by get_block when create == 0. Returns block number
246 // for 'block'-th logical block of file. When it hits direct item it
247 // returns 0 (being called from bmap) or read direct item into piece
248 // of page (bh_result)
249 
250 // Please improve the english/clarity in the comment above, as it is
251 // hard to understand.
252 
253 static int _get_block_create_0(struct inode *inode, long block,
254 			       struct buffer_head *bh_result, int args)
255 {
256 	INITIALIZE_PATH(path);
257 	struct cpu_key key;
258 	struct buffer_head *bh;
259 	struct item_head *ih, tmp_ih;
260 	int fs_gen;
261 	int blocknr;
262 	char *p = NULL;
263 	int chars;
264 	int ret;
265 	int result;
266 	int done = 0;
267 	unsigned long offset;
268 
269 	// prepare the key to look for the 'block'-th block of file
270 	make_cpu_key(&key, inode,
271 		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
272 		     3);
273 
274       research:
275 	result = search_for_position_by_key(inode->i_sb, &key, &path);
276 	if (result != POSITION_FOUND) {
277 		pathrelse(&path);
278 		if (p)
279 			kunmap(bh_result->b_page);
280 		if (result == IO_ERROR)
281 			return -EIO;
282 		// We do not return -ENOENT if there is a hole but page is uptodate, because it means
283 		// That there is some MMAPED data associated with it that is yet to be written to disk.
284 		if ((args & GET_BLOCK_NO_HOLE)
285 		    && !PageUptodate(bh_result->b_page)) {
286 			return -ENOENT;
287 		}
288 		return 0;
289 	}
290 	//
291 	bh = get_last_bh(&path);
292 	ih = get_ih(&path);
293 	if (is_indirect_le_ih(ih)) {
294 		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
295 
296 		/* FIXME: here we could cache indirect item or part of it in
297 		   the inode to avoid search_by_key in case of subsequent
298 		   access to file */
299 		blocknr = get_block_num(ind_item, path.pos_in_item);
300 		ret = 0;
301 		if (blocknr) {
302 			map_bh(bh_result, inode->i_sb, blocknr);
303 			if (path.pos_in_item ==
304 			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
305 				set_buffer_boundary(bh_result);
306 			}
307 		} else
308 			// We do not return -ENOENT if there is a hole but page is uptodate, because it means
309 			// That there is some MMAPED data associated with it that is yet to  be written to disk.
310 		if ((args & GET_BLOCK_NO_HOLE)
311 			    && !PageUptodate(bh_result->b_page)) {
312 			ret = -ENOENT;
313 		}
314 
315 		pathrelse(&path);
316 		if (p)
317 			kunmap(bh_result->b_page);
318 		return ret;
319 	}
320 	// requested data are in direct item(s)
321 	if (!(args & GET_BLOCK_READ_DIRECT)) {
322 		// we are called by bmap. FIXME: we can not map block of file
323 		// when it is stored in direct item(s)
324 		pathrelse(&path);
325 		if (p)
326 			kunmap(bh_result->b_page);
327 		return -ENOENT;
328 	}
329 
330 	/* if we've got a direct item, and the buffer or page was uptodate,
331 	 ** we don't want to pull data off disk again.  skip to the
332 	 ** end, where we map the buffer and return
333 	 */
334 	if (buffer_uptodate(bh_result)) {
335 		goto finished;
336 	} else
337 		/*
338 		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
339 		 ** pages without any buffers.  If the page is up to date, we don't want
340 		 ** read old data off disk.  Set the up to date bit on the buffer instead
341 		 ** and jump to the end
342 		 */
343 	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
344 		set_buffer_uptodate(bh_result);
345 		goto finished;
346 	}
347 	// read file tail into part of page
348 	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
349 	fs_gen = get_generation(inode->i_sb);
350 	copy_item_head(&tmp_ih, ih);
351 
352 	/* we only want to kmap if we are reading the tail into the page.
353 	 ** this is not the common case, so we don't kmap until we are
354 	 ** sure we need to.  But, this means the item might move if
355 	 ** kmap schedules
356 	 */
357 	if (!p) {
358 		p = (char *)kmap(bh_result->b_page);
359 		if (fs_changed(fs_gen, inode->i_sb)
360 		    && item_moved(&tmp_ih, &path)) {
361 			goto research;
362 		}
363 	}
364 	p += offset;
365 	memset(p, 0, inode->i_sb->s_blocksize);
366 	do {
367 		if (!is_direct_le_ih(ih)) {
368 			BUG();
369 		}
370 		/* make sure we don't read more bytes than actually exist in
371 		 ** the file.  This can happen in odd cases where i_size isn't
372 		 ** correct, and when direct item padding results in a few
373 		 ** extra bytes at the end of the direct item
374 		 */
375 		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
376 			break;
377 		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
378 			chars =
379 			    inode->i_size - (le_ih_k_offset(ih) - 1) -
380 			    path.pos_in_item;
381 			done = 1;
382 		} else {
383 			chars = ih_item_len(ih) - path.pos_in_item;
384 		}
385 		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
386 
387 		if (done)
388 			break;
389 
390 		p += chars;
391 
392 		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
393 			// we done, if read direct item is not the last item of
394 			// node FIXME: we could try to check right delimiting key
395 			// to see whether direct item continues in the right
396 			// neighbor or rely on i_size
397 			break;
398 
399 		// update key to look for the next piece
400 		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
401 		result = search_for_position_by_key(inode->i_sb, &key, &path);
402 		if (result != POSITION_FOUND)
403 			// i/o error most likely
404 			break;
405 		bh = get_last_bh(&path);
406 		ih = get_ih(&path);
407 	} while (1);
408 
409 	flush_dcache_page(bh_result->b_page);
410 	kunmap(bh_result->b_page);
411 
412       finished:
413 	pathrelse(&path);
414 
415 	if (result == IO_ERROR)
416 		return -EIO;
417 
418 	/* this buffer has valid data, but isn't valid for io.  mapping it to
419 	 * block #0 tells the rest of reiserfs it just has a tail in it
420 	 */
421 	map_bh(bh_result, inode->i_sb, 0);
422 	set_buffer_uptodate(bh_result);
423 	return 0;
424 }
425 
426 // this is called to create file map. So, _get_block_create_0 will not
427 // read direct item
428 static int reiserfs_bmap(struct inode *inode, sector_t block,
429 			 struct buffer_head *bh_result, int create)
430 {
431 	if (!file_capable(inode, block))
432 		return -EFBIG;
433 
434 	reiserfs_write_lock(inode->i_sb);
435 	/* do not read the direct item */
436 	_get_block_create_0(inode, block, bh_result, 0);
437 	reiserfs_write_unlock(inode->i_sb);
438 	return 0;
439 }
440 
441 /* special version of get_block that is only used by grab_tail_page right
442 ** now.  It is sent to block_prepare_write, and when you try to get a
443 ** block past the end of the file (or a block from a hole) it returns
444 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
445 ** be able to do i/o on the buffers returned, unless an error value
446 ** is also returned.
447 **
448 ** So, this allows block_prepare_write to be used for reading a single block
449 ** in a page.  Where it does not produce a valid page for holes, or past the
450 ** end of the file.  This turns out to be exactly what we need for reading
451 ** tails for conversion.
452 **
453 ** The point of the wrapper is forcing a certain value for create, even
454 ** though the VFS layer is calling this function with create==1.  If you
455 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
456 ** don't use this function.
457 */
458 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
459 				       struct buffer_head *bh_result,
460 				       int create)
461 {
462 	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
463 }
464 
465 /* This is special helper for reiserfs_get_block in case we are executing
466    direct_IO request. */
467 static int reiserfs_get_blocks_direct_io(struct inode *inode,
468 					 sector_t iblock,
469 					 unsigned long max_blocks,
470 					 struct buffer_head *bh_result,
471 					 int create)
472 {
473 	int ret;
474 
475 	bh_result->b_page = NULL;
476 
477 	/* We set the b_size before reiserfs_get_block call since it is
478 	   referenced in convert_tail_for_hole() that may be called from
479 	   reiserfs_get_block() */
480 	bh_result->b_size = (1 << inode->i_blkbits);
481 
482 	ret = reiserfs_get_block(inode, iblock, bh_result,
483 				 create | GET_BLOCK_NO_DANGLE);
484 	if (ret)
485 		goto out;
486 
487 	/* don't allow direct io onto tail pages */
488 	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
489 		/* make sure future calls to the direct io funcs for this offset
490 		 ** in the file fail by unmapping the buffer
491 		 */
492 		clear_buffer_mapped(bh_result);
493 		ret = -EINVAL;
494 	}
495 	/* Possible unpacked tail. Flush the data before pages have
496 	   disappeared */
497 	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
498 		int err;
499 		lock_kernel();
500 		err = reiserfs_commit_for_inode(inode);
501 		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
502 		unlock_kernel();
503 		if (err < 0)
504 			ret = err;
505 	}
506       out:
507 	return ret;
508 }
509 
510 /*
511 ** helper function for when reiserfs_get_block is called for a hole
512 ** but the file tail is still in a direct item
513 ** bh_result is the buffer head for the hole
514 ** tail_offset is the offset of the start of the tail in the file
515 **
516 ** This calls prepare_write, which will start a new transaction
517 ** you should not be in a transaction, or have any paths held when you
518 ** call this.
519 */
520 static int convert_tail_for_hole(struct inode *inode,
521 				 struct buffer_head *bh_result,
522 				 loff_t tail_offset)
523 {
524 	unsigned long index;
525 	unsigned long tail_end;
526 	unsigned long tail_start;
527 	struct page *tail_page;
528 	struct page *hole_page = bh_result->b_page;
529 	int retval = 0;
530 
531 	if ((tail_offset & (bh_result->b_size - 1)) != 1)
532 		return -EIO;
533 
534 	/* always try to read until the end of the block */
535 	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
536 	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
537 
538 	index = tail_offset >> PAGE_CACHE_SHIFT;
539 	/* hole_page can be zero in case of direct_io, we are sure
540 	   that we cannot get here if we write with O_DIRECT into
541 	   tail page */
542 	if (!hole_page || index != hole_page->index) {
543 		tail_page = grab_cache_page(inode->i_mapping, index);
544 		retval = -ENOMEM;
545 		if (!tail_page) {
546 			goto out;
547 		}
548 	} else {
549 		tail_page = hole_page;
550 	}
551 
552 	/* we don't have to make sure the conversion did not happen while
553 	 ** we were locking the page because anyone that could convert
554 	 ** must first take i_mutex.
555 	 **
556 	 ** We must fix the tail page for writing because it might have buffers
557 	 ** that are mapped, but have a block number of 0.  This indicates tail
558 	 ** data that has been read directly into the page, and block_prepare_write
559 	 ** won't trigger a get_block in this case.
560 	 */
561 	fix_tail_page_for_writing(tail_page);
562 	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
563 	if (retval)
564 		goto unlock;
565 
566 	/* tail conversion might change the data in the page */
567 	flush_dcache_page(tail_page);
568 
569 	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
570 
571       unlock:
572 	if (tail_page != hole_page) {
573 		unlock_page(tail_page);
574 		page_cache_release(tail_page);
575 	}
576       out:
577 	return retval;
578 }
579 
580 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
581 				  long block,
582 				  struct inode *inode,
583 				  b_blocknr_t * allocated_block_nr,
584 				  struct path *path, int flags)
585 {
586 	BUG_ON(!th->t_trans_id);
587 
588 #ifdef REISERFS_PREALLOCATE
589 	if (!(flags & GET_BLOCK_NO_IMUX)) {
590 		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
591 						  path, block);
592 	}
593 #endif
594 	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
595 					 block);
596 }
597 
598 int reiserfs_get_block(struct inode *inode, sector_t block,
599 		       struct buffer_head *bh_result, int create)
600 {
601 	int repeat, retval = 0;
602 	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
603 	INITIALIZE_PATH(path);
604 	int pos_in_item;
605 	struct cpu_key key;
606 	struct buffer_head *bh, *unbh = NULL;
607 	struct item_head *ih, tmp_ih;
608 	__le32 *item;
609 	int done;
610 	int fs_gen;
611 	struct reiserfs_transaction_handle *th = NULL;
612 	/* space reserved in transaction batch:
613 	   . 3 balancings in direct->indirect conversion
614 	   . 1 block involved into reiserfs_update_sd()
615 	   XXX in practically impossible worst case direct2indirect()
616 	   can incur (much) more than 3 balancings.
617 	   quota update for user, group */
618 	int jbegin_count =
619 	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
620 	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
621 	int version;
622 	int dangle = 1;
623 	loff_t new_offset =
624 	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
625 
626 	/* bad.... */
627 	reiserfs_write_lock(inode->i_sb);
628 	version = get_inode_item_key_version(inode);
629 
630 	if (block < 0) {
631 		reiserfs_write_unlock(inode->i_sb);
632 		return -EIO;
633 	}
634 
635 	if (!file_capable(inode, block)) {
636 		reiserfs_write_unlock(inode->i_sb);
637 		return -EFBIG;
638 	}
639 
640 	/* if !create, we aren't changing the FS, so we don't need to
641 	 ** log anything, so we don't need to start a transaction
642 	 */
643 	if (!(create & GET_BLOCK_CREATE)) {
644 		int ret;
645 		/* find number of block-th logical block of the file */
646 		ret = _get_block_create_0(inode, block, bh_result,
647 					  create | GET_BLOCK_READ_DIRECT);
648 		reiserfs_write_unlock(inode->i_sb);
649 		return ret;
650 	}
651 	/*
652 	 * if we're already in a transaction, make sure to close
653 	 * any new transactions we start in this func
654 	 */
655 	if ((create & GET_BLOCK_NO_DANGLE) ||
656 	    reiserfs_transaction_running(inode->i_sb))
657 		dangle = 0;
658 
659 	/* If file is of such a size, that it might have a tail and tails are enabled
660 	 ** we should mark it as possibly needing tail packing on close
661 	 */
662 	if ((have_large_tails(inode->i_sb)
663 	     && inode->i_size < i_block_size(inode) * 4)
664 	    || (have_small_tails(inode->i_sb)
665 		&& inode->i_size < i_block_size(inode)))
666 		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
667 
668 	/* set the key of the first byte in the 'block'-th block of file */
669 	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
670 	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
671 	      start_trans:
672 		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
673 		if (!th) {
674 			retval = -ENOMEM;
675 			goto failure;
676 		}
677 		reiserfs_update_inode_transaction(inode);
678 	}
679       research:
680 
681 	retval = search_for_position_by_key(inode->i_sb, &key, &path);
682 	if (retval == IO_ERROR) {
683 		retval = -EIO;
684 		goto failure;
685 	}
686 
687 	bh = get_last_bh(&path);
688 	ih = get_ih(&path);
689 	item = get_item(&path);
690 	pos_in_item = path.pos_in_item;
691 
692 	fs_gen = get_generation(inode->i_sb);
693 	copy_item_head(&tmp_ih, ih);
694 
695 	if (allocation_needed
696 	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
697 		/* we have to allocate block for the unformatted node */
698 		if (!th) {
699 			pathrelse(&path);
700 			goto start_trans;
701 		}
702 
703 		repeat =
704 		    _allocate_block(th, block, inode, &allocated_block_nr,
705 				    &path, create);
706 
707 		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
708 			/* restart the transaction to give the journal a chance to free
709 			 ** some blocks.  releases the path, so we have to go back to
710 			 ** research if we succeed on the second try
711 			 */
712 			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
713 			retval = restart_transaction(th, inode, &path);
714 			if (retval)
715 				goto failure;
716 			repeat =
717 			    _allocate_block(th, block, inode,
718 					    &allocated_block_nr, NULL, create);
719 
720 			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
721 				goto research;
722 			}
723 			if (repeat == QUOTA_EXCEEDED)
724 				retval = -EDQUOT;
725 			else
726 				retval = -ENOSPC;
727 			goto failure;
728 		}
729 
730 		if (fs_changed(fs_gen, inode->i_sb)
731 		    && item_moved(&tmp_ih, &path)) {
732 			goto research;
733 		}
734 	}
735 
736 	if (indirect_item_found(retval, ih)) {
737 		b_blocknr_t unfm_ptr;
738 		/* 'block'-th block is in the file already (there is
739 		   corresponding cell in some indirect item). But it may be
740 		   zero unformatted node pointer (hole) */
741 		unfm_ptr = get_block_num(item, pos_in_item);
742 		if (unfm_ptr == 0) {
743 			/* use allocated block to plug the hole */
744 			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
745 			if (fs_changed(fs_gen, inode->i_sb)
746 			    && item_moved(&tmp_ih, &path)) {
747 				reiserfs_restore_prepared_buffer(inode->i_sb,
748 								 bh);
749 				goto research;
750 			}
751 			set_buffer_new(bh_result);
752 			if (buffer_dirty(bh_result)
753 			    && reiserfs_data_ordered(inode->i_sb))
754 				reiserfs_add_ordered_list(inode, bh_result);
755 			put_block_num(item, pos_in_item, allocated_block_nr);
756 			unfm_ptr = allocated_block_nr;
757 			journal_mark_dirty(th, inode->i_sb, bh);
758 			reiserfs_update_sd(th, inode);
759 		}
760 		set_block_dev_mapped(bh_result, unfm_ptr, inode);
761 		pathrelse(&path);
762 		retval = 0;
763 		if (!dangle && th)
764 			retval = reiserfs_end_persistent_transaction(th);
765 
766 		reiserfs_write_unlock(inode->i_sb);
767 
768 		/* the item was found, so new blocks were not added to the file
769 		 ** there is no need to make sure the inode is updated with this
770 		 ** transaction
771 		 */
772 		return retval;
773 	}
774 
775 	if (!th) {
776 		pathrelse(&path);
777 		goto start_trans;
778 	}
779 
780 	/* desired position is not found or is in the direct item. We have
781 	   to append file with holes up to 'block'-th block converting
782 	   direct items to indirect one if necessary */
783 	done = 0;
784 	do {
785 		if (is_statdata_le_ih(ih)) {
786 			__le32 unp = 0;
787 			struct cpu_key tmp_key;
788 
789 			/* indirect item has to be inserted */
790 			make_le_item_head(&tmp_ih, &key, version, 1,
791 					  TYPE_INDIRECT, UNFM_P_SIZE,
792 					  0 /* free_space */ );
793 
794 			if (cpu_key_k_offset(&key) == 1) {
795 				/* we are going to add 'block'-th block to the file. Use
796 				   allocated block for that */
797 				unp = cpu_to_le32(allocated_block_nr);
798 				set_block_dev_mapped(bh_result,
799 						     allocated_block_nr, inode);
800 				set_buffer_new(bh_result);
801 				done = 1;
802 			}
803 			tmp_key = key;	// ;)
804 			set_cpu_key_k_offset(&tmp_key, 1);
805 			PATH_LAST_POSITION(&path)++;
806 
807 			retval =
808 			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
809 						 inode, (char *)&unp);
810 			if (retval) {
811 				reiserfs_free_block(th, inode,
812 						    allocated_block_nr, 1);
813 				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
814 			}
815 			//mark_tail_converted (inode);
816 		} else if (is_direct_le_ih(ih)) {
817 			/* direct item has to be converted */
818 			loff_t tail_offset;
819 
820 			tail_offset =
821 			    ((le_ih_k_offset(ih) -
822 			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
823 			if (tail_offset == cpu_key_k_offset(&key)) {
824 				/* direct item we just found fits into block we have
825 				   to map. Convert it into unformatted node: use
826 				   bh_result for the conversion */
827 				set_block_dev_mapped(bh_result,
828 						     allocated_block_nr, inode);
829 				unbh = bh_result;
830 				done = 1;
831 			} else {
832 				/* we have to padd file tail stored in direct item(s)
833 				   up to block size and convert it to unformatted
834 				   node. FIXME: this should also get into page cache */
835 
836 				pathrelse(&path);
837 				/*
838 				 * ugly, but we can only end the transaction if
839 				 * we aren't nested
840 				 */
841 				BUG_ON(!th->t_refcount);
842 				if (th->t_refcount == 1) {
843 					retval =
844 					    reiserfs_end_persistent_transaction
845 					    (th);
846 					th = NULL;
847 					if (retval)
848 						goto failure;
849 				}
850 
851 				retval =
852 				    convert_tail_for_hole(inode, bh_result,
853 							  tail_offset);
854 				if (retval) {
855 					if (retval != -ENOSPC)
856 						reiserfs_warning(inode->i_sb,
857 								 "clm-6004: convert tail failed inode %lu, error %d",
858 								 inode->i_ino,
859 								 retval);
860 					if (allocated_block_nr) {
861 						/* the bitmap, the super, and the stat data == 3 */
862 						if (!th)
863 							th = reiserfs_persistent_transaction(inode->i_sb, 3);
864 						if (th)
865 							reiserfs_free_block(th,
866 									    inode,
867 									    allocated_block_nr,
868 									    1);
869 					}
870 					goto failure;
871 				}
872 				goto research;
873 			}
874 			retval =
875 			    direct2indirect(th, inode, &path, unbh,
876 					    tail_offset);
877 			if (retval) {
878 				reiserfs_unmap_buffer(unbh);
879 				reiserfs_free_block(th, inode,
880 						    allocated_block_nr, 1);
881 				goto failure;
882 			}
883 			/* it is important the set_buffer_uptodate is done after
884 			 ** the direct2indirect.  The buffer might contain valid
885 			 ** data newer than the data on disk (read by readpage, changed,
886 			 ** and then sent here by writepage).  direct2indirect needs
887 			 ** to know if unbh was already up to date, so it can decide
888 			 ** if the data in unbh needs to be replaced with data from
889 			 ** the disk
890 			 */
891 			set_buffer_uptodate(unbh);
892 
893 			/* unbh->b_page == NULL in case of DIRECT_IO request, this means
894 			   buffer will disappear shortly, so it should not be added to
895 			 */
896 			if (unbh->b_page) {
897 				/* we've converted the tail, so we must
898 				 ** flush unbh before the transaction commits
899 				 */
900 				reiserfs_add_tail_list(inode, unbh);
901 
902 				/* mark it dirty now to prevent commit_write from adding
903 				 ** this buffer to the inode's dirty buffer list
904 				 */
905 				/*
906 				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
907 				 * It's still atomic, but it sets the page dirty too,
908 				 * which makes it eligible for writeback at any time by the
909 				 * VM (which was also the case with __mark_buffer_dirty())
910 				 */
911 				mark_buffer_dirty(unbh);
912 			}
913 		} else {
914 			/* append indirect item with holes if needed, when appending
915 			   pointer to 'block'-th block use block, which is already
916 			   allocated */
917 			struct cpu_key tmp_key;
918 			unp_t unf_single = 0;	// We use this in case we need to allocate only
919 			// one block which is a fastpath
920 			unp_t *un;
921 			__u64 max_to_insert =
922 			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
923 			    UNFM_P_SIZE;
924 			__u64 blocks_needed;
925 
926 			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
927 			       "vs-804: invalid position for append");
928 			/* indirect item has to be appended, set up key of that position */
929 			make_cpu_key(&tmp_key, inode,
930 				     le_key_k_offset(version,
931 						     &(ih->ih_key)) +
932 				     op_bytes_number(ih,
933 						     inode->i_sb->s_blocksize),
934 				     //pos_in_item * inode->i_sb->s_blocksize,
935 				     TYPE_INDIRECT, 3);	// key type is unimportant
936 
937 			blocks_needed =
938 			    1 +
939 			    ((cpu_key_k_offset(&key) -
940 			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
941 			     s_blocksize_bits);
942 			RFALSE(blocks_needed < 0, "green-805: invalid offset");
943 
944 			if (blocks_needed == 1) {
945 				un = &unf_single;
946 			} else {
947 				un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
948 				if (!un) {
949 					un = &unf_single;
950 					blocks_needed = 1;
951 					max_to_insert = 0;
952 				} else
953 					memset(un, 0,
954 					       UNFM_P_SIZE * min(blocks_needed,
955 								 max_to_insert));
956 			}
957 			if (blocks_needed <= max_to_insert) {
958 				/* we are going to add target block to the file. Use allocated
959 				   block for that */
960 				un[blocks_needed - 1] =
961 				    cpu_to_le32(allocated_block_nr);
962 				set_block_dev_mapped(bh_result,
963 						     allocated_block_nr, inode);
964 				set_buffer_new(bh_result);
965 				done = 1;
966 			} else {
967 				/* paste hole to the indirect item */
968 				/* If kmalloc failed, max_to_insert becomes zero and it means we
969 				   only have space for one block */
970 				blocks_needed =
971 				    max_to_insert ? max_to_insert : 1;
972 			}
973 			retval =
974 			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
975 						     (char *)un,
976 						     UNFM_P_SIZE *
977 						     blocks_needed);
978 
979 			if (blocks_needed != 1)
980 				kfree(un);
981 
982 			if (retval) {
983 				reiserfs_free_block(th, inode,
984 						    allocated_block_nr, 1);
985 				goto failure;
986 			}
987 			if (!done) {
988 				/* We need to mark new file size in case this function will be
989 				   interrupted/aborted later on. And we may do this only for
990 				   holes. */
991 				inode->i_size +=
992 				    inode->i_sb->s_blocksize * blocks_needed;
993 			}
994 		}
995 
996 		if (done == 1)
997 			break;
998 
999 		/* this loop could log more blocks than we had originally asked
1000 		 ** for.  So, we have to allow the transaction to end if it is
1001 		 ** too big or too full.  Update the inode so things are
1002 		 ** consistent if we crash before the function returns
1003 		 **
1004 		 ** release the path so that anybody waiting on the path before
1005 		 ** ending their transaction will be able to continue.
1006 		 */
1007 		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1008 			retval = restart_transaction(th, inode, &path);
1009 			if (retval)
1010 				goto failure;
1011 		}
1012 		/* inserting indirect pointers for a hole can take a
1013 		 ** long time.  reschedule if needed
1014 		 */
1015 		cond_resched();
1016 
1017 		retval = search_for_position_by_key(inode->i_sb, &key, &path);
1018 		if (retval == IO_ERROR) {
1019 			retval = -EIO;
1020 			goto failure;
1021 		}
1022 		if (retval == POSITION_FOUND) {
1023 			reiserfs_warning(inode->i_sb,
1024 					 "vs-825: reiserfs_get_block: "
1025 					 "%K should not be found", &key);
1026 			retval = -EEXIST;
1027 			if (allocated_block_nr)
1028 				reiserfs_free_block(th, inode,
1029 						    allocated_block_nr, 1);
1030 			pathrelse(&path);
1031 			goto failure;
1032 		}
1033 		bh = get_last_bh(&path);
1034 		ih = get_ih(&path);
1035 		item = get_item(&path);
1036 		pos_in_item = path.pos_in_item;
1037 	} while (1);
1038 
1039 	retval = 0;
1040 
1041       failure:
1042 	if (th && (!dangle || (retval && !th->t_trans_id))) {
1043 		int err;
1044 		if (th->t_trans_id)
1045 			reiserfs_update_sd(th, inode);
1046 		err = reiserfs_end_persistent_transaction(th);
1047 		if (err)
1048 			retval = err;
1049 	}
1050 
1051 	reiserfs_write_unlock(inode->i_sb);
1052 	reiserfs_check_path(&path);
1053 	return retval;
1054 }
1055 
1056 static int
1057 reiserfs_readpages(struct file *file, struct address_space *mapping,
1058 		   struct list_head *pages, unsigned nr_pages)
1059 {
1060 	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
1061 }
1062 
1063 /* Compute real number of used bytes by file
1064  * Following three functions can go away when we'll have enough space in stat item
1065  */
1066 static int real_space_diff(struct inode *inode, int sd_size)
1067 {
1068 	int bytes;
1069 	loff_t blocksize = inode->i_sb->s_blocksize;
1070 
1071 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1072 		return sd_size;
1073 
1074 	/* End of file is also in full block with indirect reference, so round
1075 	 ** up to the next block.
1076 	 **
1077 	 ** there is just no way to know if the tail is actually packed
1078 	 ** on the file, so we have to assume it isn't.  When we pack the
1079 	 ** tail, we add 4 bytes to pretend there really is an unformatted
1080 	 ** node pointer
1081 	 */
1082 	bytes =
1083 	    ((inode->i_size +
1084 	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1085 	    sd_size;
1086 	return bytes;
1087 }
1088 
1089 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1090 					int sd_size)
1091 {
1092 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1093 		return inode->i_size +
1094 		    (loff_t) (real_space_diff(inode, sd_size));
1095 	}
1096 	return ((loff_t) real_space_diff(inode, sd_size)) +
1097 	    (((loff_t) blocks) << 9);
1098 }
1099 
1100 /* Compute number of blocks used by file in ReiserFS counting */
1101 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1102 {
1103 	loff_t bytes = inode_get_bytes(inode);
1104 	loff_t real_space = real_space_diff(inode, sd_size);
1105 
1106 	/* keeps fsck and non-quota versions of reiserfs happy */
1107 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1108 		bytes += (loff_t) 511;
1109 	}
1110 
1111 	/* files from before the quota patch might i_blocks such that
1112 	 ** bytes < real_space.  Deal with that here to prevent it from
1113 	 ** going negative.
1114 	 */
1115 	if (bytes < real_space)
1116 		return 0;
1117 	return (bytes - real_space) >> 9;
1118 }
1119 
1120 //
1121 // BAD: new directories have stat data of new type and all other items
1122 // of old type. Version stored in the inode says about body items, so
1123 // in update_stat_data we can not rely on inode, but have to check
1124 // item version directly
1125 //
1126 
/*
 * Fill the in-core inode from the stat data item that 'path' points at.
 * Called by reiserfs_read_locked_inode.
 *
 * Copies the item key into the inode's private key, resets the
 * reiserfs-private fields, decodes either the v1 or the v2 stat data
 * layout (chosen by stat_data_v1(ih)), releases the path, and finally
 * installs the inode/file/address-space operations matching i_mode.
 */
static void init_inode(struct inode *inode, struct path *path)
{
	struct buffer_head *bh;
	struct item_head *ih;
	__u32 rdev;		/* set in both branches below, consumed only for special files */
	//int version = ITEM_VERSION_1;

	bh = PATH_PLAST_BUFFER(path);
	ih = PATH_PITEM_HEAD(path);

	copy_key(INODE_PKEY(inode), &(ih->ih_key));
	inode->i_blksize = reiserfs_default_io_size;

	/* start from a clean reiserfs-private state */
	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	REISERFS_I(inode)->i_acl_access = NULL;
	REISERFS_I(inode)->i_acl_default = NULL;
	init_rwsem(&REISERFS_I(inode)->xattr_sem);

	if (stat_data_v1(ih)) {
		/*
		 * Old (v1) stat data.  sd points at the item body inside bh;
		 * NOTE(review): presumably only valid while the path is
		 * held, which is why everything is read before pathrelse().
		 */
		struct stat_data_v1 *sd =
		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
		unsigned long blocks;

		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		set_inode_sd_version(inode, STAT_DATA_V1);
		inode->i_mode = sd_v1_mode(sd);
		inode->i_nlink = sd_v1_nlink(sd);
		inode->i_uid = sd_v1_uid(sd);
		inode->i_gid = sd_v1_gid(sd);
		inode->i_size = sd_v1_size(sd);
		inode->i_atime.tv_sec = sd_v1_atime(sd);
		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
		/* only whole seconds are read from the stat data */
		inode->i_atime.tv_nsec = 0;
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;

		inode->i_blocks = sd_v1_blocks(sd);
		/* v1 objects take their generation from the key's dir id */
		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		/* upper bound: i_size in 512-byte sectors, rounded up to whole fs blocks */
		blocks = (inode->i_size + 511) >> 9;
		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
		if (inode->i_blocks > blocks) {
			// there was a bug in <=3.5.23 when i_blocks could take negative
			// values. Starting from 3.5.17 this value could even be stored in
			// stat data. For such files we set i_blocks based on file
			// size. Just 2 notes: this can be wrong for sparse files. On-disk value will be
			// only updated if file's inode will ever change
			inode->i_blocks = blocks;
		}

		rdev = sd_v1_rdev(sd);
		REISERFS_I(inode)->i_first_direct_byte =
		    sd_v1_first_direct_byte(sd);
		/* an early bug in the quota code can give us an odd number for the
		 ** block count.  This is incorrect, fix it here.
		 */
		if (inode->i_blocks & 1) {
			inode->i_blocks++;
		}
		/* must run after the i_blocks fix-ups above, which it consumes */
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V1_SIZE));
		/* nopack is initially zero for v1 objects. For v2 objects,
		   nopack is initialised from sd_attrs */
		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	} else {
		// new stat data found, but object may have old items
		// (directories and symlinks)
		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);

		inode->i_mode = sd_v2_mode(sd);
		inode->i_nlink = sd_v2_nlink(sd);
		inode->i_uid = sd_v2_uid(sd);
		inode->i_size = sd_v2_size(sd);
		inode->i_gid = sd_v2_gid(sd);
		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
		inode->i_atime.tv_sec = sd_v2_atime(sd);
		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
		/* only whole seconds are read from the stat data */
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;
		inode->i_atime.tv_nsec = 0;
		inode->i_blocks = sd_v2_blocks(sd);
		rdev = sd_v2_rdev(sd);
		/*
		 * Char/block devices take the generation from the key's dir
		 * id; every other type reads it from the stat data.
		 */
		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
			inode->i_generation =
			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		else
			inode->i_generation = sd_v2_generation(sd);

		/* directories and symlinks keep old-format (3.5) item keys */
		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		else
			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
		REISERFS_I(inode)->i_first_direct_byte = 0;
		set_inode_sd_version(inode, STAT_DATA_V2);
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V2_SIZE));
		/* read persistent inode attributes from sd and initialise
		   generic inode flags from them */
		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
	}

	/* all stat data fields have been copied; the path is no longer needed */
	pathrelse(path);
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &reiserfs_file_inode_operations;
		inode->i_fop = &reiserfs_file_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &reiserfs_dir_inode_operations;
		inode->i_fop = &reiserfs_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		inode->i_op = &reiserfs_symlink_inode_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else {
		/* every other file type: no blocks, device number decoded from rdev */
		inode->i_blocks = 0;
		inode->i_op = &reiserfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
	}
}
1254 
1255 // update new stat data with inode fields
1256 static void inode2sd(void *sd, struct inode *inode, loff_t size)
1257 {
1258 	struct stat_data *sd_v2 = (struct stat_data *)sd;
1259 	__u16 flags;
1260 
1261 	set_sd_v2_mode(sd_v2, inode->i_mode);
1262 	set_sd_v2_nlink(sd_v2, inode->i_nlink);
1263 	set_sd_v2_uid(sd_v2, inode->i_uid);
1264 	set_sd_v2_size(sd_v2, size);
1265 	set_sd_v2_gid(sd_v2, inode->i_gid);
1266 	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1267 	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1268 	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1269 	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1270 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1271 		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1272 	else
1273 		set_sd_v2_generation(sd_v2, inode->i_generation);
1274 	flags = REISERFS_I(inode)->i_attrs;
1275 	i_attrs_to_sd_attrs(inode, &flags);
1276 	set_sd_v2_attrs(sd_v2, flags);
1277 }
1278 
1279 // used to copy inode's fields to old stat data
1280 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1281 {
1282 	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1283 
1284 	set_sd_v1_mode(sd_v1, inode->i_mode);
1285 	set_sd_v1_uid(sd_v1, inode->i_uid);
1286 	set_sd_v1_gid(sd_v1, inode->i_gid);
1287 	set_sd_v1_nlink(sd_v1, inode->i_nlink);
1288 	set_sd_v1_size(sd_v1, size);
1289 	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1290 	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1291 	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1292 
1293 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1294 		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1295 	else
1296 		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1297 
1298 	// Sigh. i_first_direct_byte is back
1299 	set_sd_v1_first_direct_byte(sd_v1,
1300 				    REISERFS_I(inode)->i_first_direct_byte);
1301 }
1302 
1303 /* NOTE, you must prepare the buffer head before sending it here,
1304 ** and then log it after the call
1305 */
1306 static void update_stat_data(struct path *path, struct inode *inode,
1307 			     loff_t size)
1308 {
1309 	struct buffer_head *bh;
1310 	struct item_head *ih;
1311 
1312 	bh = PATH_PLAST_BUFFER(path);
1313 	ih = PATH_PITEM_HEAD(path);
1314 
1315 	if (!is_statdata_le_ih(ih))
1316 		reiserfs_panic(inode->i_sb,
1317 			       "vs-13065: update_stat_data: key %k, found item %h",
1318 			       INODE_PKEY(inode), ih);
1319 
1320 	if (stat_data_v1(ih)) {
1321 		// path points to old stat data
1322 		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1323 	} else {
1324 		inode2sd(B_I_PITEM(bh, ih), inode, size);
1325 	}
1326 
1327 	return;
1328 }
1329 
1330 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1331 			     struct inode *inode, loff_t size)
1332 {
1333 	struct cpu_key key;
1334 	INITIALIZE_PATH(path);
1335 	struct buffer_head *bh;
1336 	int fs_gen;
1337 	struct item_head *ih, tmp_ih;
1338 	int retval;
1339 
1340 	BUG_ON(!th->t_trans_id);
1341 
1342 	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant
1343 
1344 	for (;;) {
1345 		int pos;
1346 		/* look for the object's stat data */
1347 		retval = search_item(inode->i_sb, &key, &path);
1348 		if (retval == IO_ERROR) {
1349 			reiserfs_warning(inode->i_sb,
1350 					 "vs-13050: reiserfs_update_sd: "
1351 					 "i/o failure occurred trying to update %K stat data",
1352 					 &key);
1353 			return;
1354 		}
1355 		if (retval == ITEM_NOT_FOUND) {
1356 			pos = PATH_LAST_POSITION(&path);
1357 			pathrelse(&path);
1358 			if (inode->i_nlink == 0) {
1359 				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
1360 				return;
1361 			}
1362 			reiserfs_warning(inode->i_sb,
1363 					 "vs-13060: reiserfs_update_sd: "
1364 					 "stat data of object %k (nlink == %d) not found (pos %d)",
1365 					 INODE_PKEY(inode), inode->i_nlink,
1366 					 pos);
1367 			reiserfs_check_path(&path);
1368 			return;
1369 		}
1370 
1371 		/* sigh, prepare_for_journal might schedule.  When it schedules the
1372 		 ** FS might change.  We have to detect that, and loop back to the
1373 		 ** search if the stat data item has moved
1374 		 */
1375 		bh = get_last_bh(&path);
1376 		ih = get_ih(&path);
1377 		copy_item_head(&tmp_ih, ih);
1378 		fs_gen = get_generation(inode->i_sb);
1379 		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
1380 		if (fs_changed(fs_gen, inode->i_sb)
1381 		    && item_moved(&tmp_ih, &path)) {
1382 			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
1383 			continue;	/* Stat_data item has been moved after scheduling. */
1384 		}
1385 		break;
1386 	}
1387 	update_stat_data(&path, inode, size);
1388 	journal_mark_dirty(th, th->t_super, bh);
1389 	pathrelse(&path);
1390 	return;
1391 }
1392 
1393 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1394 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1395 ** and clear the key in the private portion of the inode, otherwise a
1396 ** corresponding iput might try to delete whatever object the inode last
1397 ** represented.
1398 */
1399 static void reiserfs_make_bad_inode(struct inode *inode)
1400 {
1401 	memset(INODE_PKEY(inode), 0, KEY_SIZE);
1402 	make_bad_inode(inode);
1403 }
1404 
1405 //
1406 // initially this function was derived from minix or ext2's analog and
1407 // evolved as the prototype did
1408 //
1409 
1410 int reiserfs_init_locked_inode(struct inode *inode, void *p)
1411 {
1412 	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1413 	inode->i_ino = args->objectid;
1414 	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1415 	return 0;
1416 }
1417 
1418 /* looks for stat data in the tree, and fills up the fields of in-core
1419    inode stat data fields */
1420 void reiserfs_read_locked_inode(struct inode *inode,
1421 				struct reiserfs_iget_args *args)
1422 {
1423 	INITIALIZE_PATH(path_to_sd);
1424 	struct cpu_key key;
1425 	unsigned long dirino;
1426 	int retval;
1427 
1428 	dirino = args->dirid;
1429 
1430 	/* set version 1, version 2 could be used too, because stat data
1431 	   key is the same in both versions */
1432 	key.version = KEY_FORMAT_3_5;
1433 	key.on_disk_key.k_dir_id = dirino;
1434 	key.on_disk_key.k_objectid = inode->i_ino;
1435 	key.on_disk_key.k_offset = 0;
1436 	key.on_disk_key.k_type = 0;
1437 
1438 	/* look for the object's stat data */
1439 	retval = search_item(inode->i_sb, &key, &path_to_sd);
1440 	if (retval == IO_ERROR) {
1441 		reiserfs_warning(inode->i_sb,
1442 				 "vs-13070: reiserfs_read_locked_inode: "
1443 				 "i/o failure occurred trying to find stat data of %K",
1444 				 &key);
1445 		reiserfs_make_bad_inode(inode);
1446 		return;
1447 	}
1448 	if (retval != ITEM_FOUND) {
1449 		/* a stale NFS handle can trigger this without it being an error */
1450 		pathrelse(&path_to_sd);
1451 		reiserfs_make_bad_inode(inode);
1452 		inode->i_nlink = 0;
1453 		return;
1454 	}
1455 
1456 	init_inode(inode, &path_to_sd);
1457 
1458 	/* It is possible that knfsd is trying to access inode of a file
1459 	   that is being removed from the disk by some other thread. As we
1460 	   update sd on unlink all that is required is to check for nlink
1461 	   here. This bug was first found by Sizif when debugging
1462 	   SquidNG/Butterfly, forgotten, and found again after Philippe
1463 	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1464 
1465 	   More logical fix would require changes in fs/inode.c:iput() to
1466 	   remove inode from hash-table _after_ fs cleaned disk stuff up and
1467 	   in iget() to return NULL if I_FREEING inode is found in
1468 	   hash-table. */
1469 	/* Currently there is one place where it's ok to meet inode with
1470 	   nlink==0: processing of open-unlinked and half-truncated files
1471 	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
1472 	if ((inode->i_nlink == 0) &&
1473 	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
1474 		reiserfs_warning(inode->i_sb,
1475 				 "vs-13075: reiserfs_read_locked_inode: "
1476 				 "dead inode read from disk %K. "
1477 				 "This is likely to be race with knfsd. Ignore",
1478 				 &key);
1479 		reiserfs_make_bad_inode(inode);
1480 	}
1481 
1482 	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */
1483 
1484 }
1485 
1486 /**
1487  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1488  *
1489  * @inode:    inode from hash table to check
1490  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1491  *
1492  * This function is called by iget5_locked() to distinguish reiserfs inodes
1493  * having the same inode numbers. Such inodes can only exist due to some
1494  * error condition. One of them should be bad. Inodes with identical
1495  * inode numbers (objectids) are distinguished by parent directory ids.
1496  *
1497  */
1498 int reiserfs_find_actor(struct inode *inode, void *opaque)
1499 {
1500 	struct reiserfs_iget_args *args;
1501 
1502 	args = opaque;
1503 	/* args is already in CPU order */
1504 	return (inode->i_ino == args->objectid) &&
1505 	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1506 }
1507 
1508 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1509 {
1510 	struct inode *inode;
1511 	struct reiserfs_iget_args args;
1512 
1513 	args.objectid = key->on_disk_key.k_objectid;
1514 	args.dirid = key->on_disk_key.k_dir_id;
1515 	inode = iget5_locked(s, key->on_disk_key.k_objectid,
1516 			     reiserfs_find_actor, reiserfs_init_locked_inode,
1517 			     (void *)(&args));
1518 	if (!inode)
1519 		return ERR_PTR(-ENOMEM);
1520 
1521 	if (inode->i_state & I_NEW) {
1522 		reiserfs_read_locked_inode(inode, &args);
1523 		unlock_new_inode(inode);
1524 	}
1525 
1526 	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
1527 		/* either due to i/o error or a stale NFS handle */
1528 		iput(inode);
1529 		inode = NULL;
1530 	}
1531 	return inode;
1532 }
1533 
1534 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1535 {
1536 	__u32 *data = vobjp;
1537 	struct cpu_key key;
1538 	struct dentry *result;
1539 	struct inode *inode;
1540 
1541 	key.on_disk_key.k_objectid = data[0];
1542 	key.on_disk_key.k_dir_id = data[1];
1543 	reiserfs_write_lock(sb);
1544 	inode = reiserfs_iget(sb, &key);
1545 	if (inode && !IS_ERR(inode) && data[2] != 0 &&
1546 	    data[2] != inode->i_generation) {
1547 		iput(inode);
1548 		inode = NULL;
1549 	}
1550 	reiserfs_write_unlock(sb);
1551 	if (!inode)
1552 		inode = ERR_PTR(-ESTALE);
1553 	if (IS_ERR(inode))
1554 		return ERR_PTR(PTR_ERR(inode));
1555 	result = d_alloc_anon(inode);
1556 	if (!result) {
1557 		iput(inode);
1558 		return ERR_PTR(-ENOMEM);
1559 	}
1560 	return result;
1561 }
1562 
1563 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
1564 				  int len, int fhtype,
1565 				  int (*acceptable) (void *contect,
1566 						     struct dentry * de),
1567 				  void *context)
1568 {
1569 	__u32 obj[3], parent[3];
1570 
1571 	/* fhtype happens to reflect the number of u32s encoded.
1572 	 * due to a bug in earlier code, fhtype might indicate there
1573 	 * are more u32s then actually fitted.
1574 	 * so if fhtype seems to be more than len, reduce fhtype.
1575 	 * Valid types are:
1576 	 *   2 - objectid + dir_id - legacy support
1577 	 *   3 - objectid + dir_id + generation
1578 	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1579 	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
1580 	 *   6 - as above plus generation of directory
1581 	 * 6 does not fit in NFSv2 handles
1582 	 */
1583 	if (fhtype > len) {
1584 		if (fhtype != 6 || len != 5)
1585 			reiserfs_warning(sb,
1586 					 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1587 					 fhtype, len);
1588 		fhtype = 5;
1589 	}
1590 
1591 	obj[0] = data[0];
1592 	obj[1] = data[1];
1593 	if (fhtype == 3 || fhtype >= 5)
1594 		obj[2] = data[2];
1595 	else
1596 		obj[2] = 0;	/* generation number */
1597 
1598 	if (fhtype >= 4) {
1599 		parent[0] = data[fhtype >= 5 ? 3 : 2];
1600 		parent[1] = data[fhtype >= 5 ? 4 : 3];
1601 		if (fhtype == 6)
1602 			parent[2] = data[5];
1603 		else
1604 			parent[2] = 0;
1605 	}
1606 	return sb->s_export_op->find_exported_dentry(sb, obj,
1607 						     fhtype < 4 ? NULL : parent,
1608 						     acceptable, context);
1609 }
1610 
1611 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1612 		       int need_parent)
1613 {
1614 	struct inode *inode = dentry->d_inode;
1615 	int maxlen = *lenp;
1616 
1617 	if (maxlen < 3)
1618 		return 255;
1619 
1620 	data[0] = inode->i_ino;
1621 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1622 	data[2] = inode->i_generation;
1623 	*lenp = 3;
1624 	/* no room for directory info? return what we've stored so far */
1625 	if (maxlen < 5 || !need_parent)
1626 		return 3;
1627 
1628 	spin_lock(&dentry->d_lock);
1629 	inode = dentry->d_parent->d_inode;
1630 	data[3] = inode->i_ino;
1631 	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1632 	*lenp = 5;
1633 	if (maxlen >= 6) {
1634 		data[5] = inode->i_generation;
1635 		*lenp = 6;
1636 	}
1637 	spin_unlock(&dentry->d_lock);
1638 	return *lenp;
1639 }
1640 
1641 /* looks for stat data, then copies fields to it, marks the buffer
1642    containing stat data as dirty */
1643 /* reiserfs inodes are never really dirty, since the dirty inode call
1644 ** always logs them.  This call allows the VFS inode marking routines
1645 ** to properly mark inodes for datasync and such, but only actually
1646 ** does something when called for a synchronous update.
1647 */
1648 int reiserfs_write_inode(struct inode *inode, int do_sync)
1649 {
1650 	struct reiserfs_transaction_handle th;
1651 	int jbegin_count = 1;
1652 
1653 	if (inode->i_sb->s_flags & MS_RDONLY)
1654 		return -EROFS;
1655 	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
1656 	 ** these cases are just when the system needs ram, not when the
1657 	 ** inode needs to reach disk for safety, and they can safely be
1658 	 ** ignored because the altered inode has already been logged.
1659 	 */
1660 	if (do_sync && !(current->flags & PF_MEMALLOC)) {
1661 		reiserfs_write_lock(inode->i_sb);
1662 		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1663 			reiserfs_update_sd(&th, inode);
1664 			journal_end_sync(&th, inode->i_sb, jbegin_count);
1665 		}
1666 		reiserfs_write_unlock(inode->i_sb);
1667 	}
1668 	return 0;
1669 }
1670 
1671 /* stat data of new object is inserted already, this inserts the item
1672    containing "." and ".." entries */
1673 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1674 				  struct inode *inode,
1675 				  struct item_head *ih, struct path *path,
1676 				  struct inode *dir)
1677 {
1678 	struct super_block *sb = th->t_super;
1679 	char empty_dir[EMPTY_DIR_SIZE];
1680 	char *body = empty_dir;
1681 	struct cpu_key key;
1682 	int retval;
1683 
1684 	BUG_ON(!th->t_trans_id);
1685 
1686 	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1687 		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1688 		      TYPE_DIRENTRY, 3 /*key length */ );
1689 
1690 	/* compose item head for new item. Directories consist of items of
1691 	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1692 	   is done by reiserfs_new_inode */
1693 	if (old_format_only(sb)) {
1694 		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1695 				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1696 
1697 		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1698 				       ih->ih_key.k_objectid,
1699 				       INODE_PKEY(dir)->k_dir_id,
1700 				       INODE_PKEY(dir)->k_objectid);
1701 	} else {
1702 		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1703 				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1704 
1705 		make_empty_dir_item(body, ih->ih_key.k_dir_id,
1706 				    ih->ih_key.k_objectid,
1707 				    INODE_PKEY(dir)->k_dir_id,
1708 				    INODE_PKEY(dir)->k_objectid);
1709 	}
1710 
1711 	/* look for place in the tree for new item */
1712 	retval = search_item(sb, &key, path);
1713 	if (retval == IO_ERROR) {
1714 		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
1715 				 "i/o failure occurred creating new directory");
1716 		return -EIO;
1717 	}
1718 	if (retval == ITEM_FOUND) {
1719 		pathrelse(path);
1720 		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
1721 				 "object with this key exists (%k)",
1722 				 &(ih->ih_key));
1723 		return -EEXIST;
1724 	}
1725 
1726 	/* insert item, that is empty directory item */
1727 	return reiserfs_insert_item(th, path, &key, ih, inode, body);
1728 }
1729 
1730 /* stat data of object has been inserted, this inserts the item
1731    containing the body of symlink */
1732 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
1733 				struct item_head *ih,
1734 				struct path *path, const char *symname,
1735 				int item_len)
1736 {
1737 	struct super_block *sb = th->t_super;
1738 	struct cpu_key key;
1739 	int retval;
1740 
1741 	BUG_ON(!th->t_trans_id);
1742 
1743 	_make_cpu_key(&key, KEY_FORMAT_3_5,
1744 		      le32_to_cpu(ih->ih_key.k_dir_id),
1745 		      le32_to_cpu(ih->ih_key.k_objectid),
1746 		      1, TYPE_DIRECT, 3 /*key length */ );
1747 
1748 	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1749 			  0 /*free_space */ );
1750 
1751 	/* look for place in the tree for new item */
1752 	retval = search_item(sb, &key, path);
1753 	if (retval == IO_ERROR) {
1754 		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
1755 				 "i/o failure occurred creating new symlink");
1756 		return -EIO;
1757 	}
1758 	if (retval == ITEM_FOUND) {
1759 		pathrelse(path);
1760 		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
1761 				 "object with this key exists (%k)",
1762 				 &(ih->ih_key));
1763 		return -EEXIST;
1764 	}
1765 
1766 	/* insert item, that is body of symlink */
1767 	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1768 }
1769 
1770 /* inserts the stat data into the tree, and then calls
1771    reiserfs_new_directory (to insert ".", ".." item if new object is
1772    directory) or reiserfs_new_symlink (to insert symlink body if new
1773    object is symlink) or nothing (if new object is regular file)
1774 
1775    NOTE! uid and gid must already be set in the inode.  If we return
1776    non-zero due to an error, we have to drop the quota previously allocated
1777    for the fresh inode.  This can only be done outside a transaction, so
1778    if we return non-zero, we also end the transaction.  */
1779 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 		       struct inode *dir, int mode, const char *symname,
1781 		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1782 		          strlen (symname) for symlinks) */
1783 		       loff_t i_size, struct dentry *dentry,
1784 		       struct inode *inode)
1785 {
1786 	struct super_block *sb;
1787 	INITIALIZE_PATH(path_to_key);
1788 	struct cpu_key key;
1789 	struct item_head ih;
1790 	struct stat_data sd;
1791 	int retval;
1792 	int err;
1793 
1794 	BUG_ON(!th->t_trans_id);
1795 
1796 	if (DQUOT_ALLOC_INODE(inode)) {
1797 		err = -EDQUOT;
1798 		goto out_end_trans;
1799 	}
1800 	if (!dir || !dir->i_nlink) {
1801 		err = -EPERM;
1802 		goto out_bad_inode;
1803 	}
1804 
1805 	sb = dir->i_sb;
1806 
1807 	/* item head of new item */
1808 	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1809 	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1810 	if (!ih.ih_key.k_objectid) {
1811 		err = -ENOMEM;
1812 		goto out_bad_inode;
1813 	}
1814 	if (old_format_only(sb))
1815 		/* not a perfect generation count, as object ids can be reused, but
1816 		 ** this is as good as reiserfs can do right now.
1817 		 ** note that the private part of inode isn't filled in yet, we have
1818 		 ** to use the directory.
1819 		 */
1820 		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1821 	else
1822 #if defined( USE_INODE_GENERATION_COUNTER )
1823 		inode->i_generation =
1824 		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1825 #else
1826 		inode->i_generation = ++event;
1827 #endif
1828 
1829 	/* fill stat data */
1830 	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1831 
1832 	/* uid and gid must already be set by the caller for quota init */
1833 
1834 	/* symlink cannot be immutable or append only, right? */
1835 	if (S_ISLNK(inode->i_mode))
1836 		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1837 
1838 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1839 	inode->i_size = i_size;
1840 	inode->i_blocks = 0;
1841 	inode->i_bytes = 0;
1842 	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1843 	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1844 
1845 	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1846 	REISERFS_I(inode)->i_flags = 0;
1847 	REISERFS_I(inode)->i_prealloc_block = 0;
1848 	REISERFS_I(inode)->i_prealloc_count = 0;
1849 	REISERFS_I(inode)->i_trans_id = 0;
1850 	REISERFS_I(inode)->i_jl = NULL;
1851 	REISERFS_I(inode)->i_attrs =
1852 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1853 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1854 	REISERFS_I(inode)->i_acl_access = NULL;
1855 	REISERFS_I(inode)->i_acl_default = NULL;
1856 	init_rwsem(&REISERFS_I(inode)->xattr_sem);
1857 
1858 	if (old_format_only(sb))
1859 		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1860 				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1861 	else
1862 		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1863 				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1864 
1865 	/* key to search for correct place for new stat data */
1866 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1867 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1868 		      TYPE_STAT_DATA, 3 /*key length */ );
1869 
1870 	/* find proper place for inserting of stat data */
1871 	retval = search_item(sb, &key, &path_to_key);
1872 	if (retval == IO_ERROR) {
1873 		err = -EIO;
1874 		goto out_bad_inode;
1875 	}
1876 	if (retval == ITEM_FOUND) {
1877 		pathrelse(&path_to_key);
1878 		err = -EEXIST;
1879 		goto out_bad_inode;
1880 	}
1881 	if (old_format_only(sb)) {
1882 		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1883 			pathrelse(&path_to_key);
1884 			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
1885 			err = -EINVAL;
1886 			goto out_bad_inode;
1887 		}
1888 		inode2sd_v1(&sd, inode, inode->i_size);
1889 	} else {
1890 		inode2sd(&sd, inode, inode->i_size);
1891 	}
1892 	// these do not go to on-disk stat data
1893 	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1894 	inode->i_blksize = reiserfs_default_io_size;
1895 
1896 	// store in in-core inode the key of stat data and version all
1897 	// object items will have (directory items will have old offset
1898 	// format, other new objects will consist of new items)
1899 	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1900 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1901 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1902 	else
1903 		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1904 	if (old_format_only(sb))
1905 		set_inode_sd_version(inode, STAT_DATA_V1);
1906 	else
1907 		set_inode_sd_version(inode, STAT_DATA_V2);
1908 
1909 	/* insert the stat data into the tree */
1910 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1911 	if (REISERFS_I(dir)->new_packing_locality)
1912 		th->displace_new_blocks = 1;
1913 #endif
1914 	retval =
1915 	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1916 				 (char *)(&sd));
1917 	if (retval) {
1918 		err = retval;
1919 		reiserfs_check_path(&path_to_key);
1920 		goto out_bad_inode;
1921 	}
1922 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1923 	if (!th->displace_new_blocks)
1924 		REISERFS_I(dir)->new_packing_locality = 0;
1925 #endif
1926 	if (S_ISDIR(mode)) {
1927 		/* insert item with "." and ".." */
1928 		retval =
1929 		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1930 	}
1931 
1932 	if (S_ISLNK(mode)) {
1933 		/* insert body of symlink */
1934 		if (!old_format_only(sb))
1935 			i_size = ROUND_UP(i_size);
1936 		retval =
1937 		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1938 					 i_size);
1939 	}
1940 	if (retval) {
1941 		err = retval;
1942 		reiserfs_check_path(&path_to_key);
1943 		journal_end(th, th->t_super, th->t_blocks_allocated);
1944 		goto out_inserted_sd;
1945 	}
1946 
1947 	/* XXX CHECK THIS */
1948 	if (reiserfs_posixacl(inode->i_sb)) {
1949 		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
1950 		if (retval) {
1951 			err = retval;
1952 			reiserfs_check_path(&path_to_key);
1953 			journal_end(th, th->t_super, th->t_blocks_allocated);
1954 			goto out_inserted_sd;
1955 		}
1956 	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
1957 		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
1958 				 "but vfs thinks they are!");
1959 	} else if (is_reiserfs_priv_object(dir)) {
1960 		reiserfs_mark_inode_private(inode);
1961 	}
1962 
1963 	insert_inode_hash(inode);
1964 	reiserfs_update_sd(th, inode);
1965 	reiserfs_check_path(&path_to_key);
1966 
1967 	return 0;
1968 
1969 /* it looks like you can easily compress these two goto targets into
1970  * one.  Keeping it like this doesn't actually hurt anything, and they
1971  * are place holders for what the quota code actually needs.
1972  */
1973       out_bad_inode:
1974 	/* Invalidate the object, nothing was inserted yet */
1975 	INODE_PKEY(inode)->k_objectid = 0;
1976 
1977 	/* Quota change must be inside a transaction for journaling */
1978 	DQUOT_FREE_INODE(inode);
1979 
1980       out_end_trans:
1981 	journal_end(th, th->t_super, th->t_blocks_allocated);
1982 	/* Drop can be outside and it needs more credits so it's better to have it outside */
1983 	DQUOT_DROP(inode);
1984 	inode->i_flags |= S_NOQUOTA;
1985 	make_bad_inode(inode);
1986 
1987       out_inserted_sd:
1988 	inode->i_nlink = 0;
1989 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
1990 
1991 	/* If we were inheriting an ACL, we need to release the lock so that
1992 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
1993 	 * code really needs to be reworked, but this will take care of it
1994 	 * for now. -jeffm */
1995 	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
1996 		reiserfs_write_unlock_xattrs(dir->i_sb);
1997 		iput(inode);
1998 		reiserfs_write_lock_xattrs(dir->i_sb);
1999 	} else
2000 		iput(inode);
2001 	return err;
2002 }
2003 
2004 /*
2005 ** finds the tail page in the page cache,
2006 ** reads the last block in.
2007 **
2008 ** On success, page_result is set to a locked, pinned page, and bh_result
2009 ** is set to an up to date buffer for the last block in the file.  returns 0.
2010 **
2011 ** tail conversion is not done, so bh_result might not be valid for writing
2012 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
2013 ** trying to write the block.
2014 **
2015 ** on failure, nonzero is returned, page_result and bh_result are untouched.
2016 */
2017 static int grab_tail_page(struct inode *p_s_inode,
2018 			  struct page **page_result,
2019 			  struct buffer_head **bh_result)
2020 {
2021 
2022 	/* we want the page with the last byte in the file,
2023 	 ** not the page that will hold the next byte for appending
2024 	 */
2025 	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
2026 	unsigned long pos = 0;
2027 	unsigned long start = 0;
2028 	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
2029 	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
2030 	struct buffer_head *bh;
2031 	struct buffer_head *head;
2032 	struct page *page;
2033 	int error;
2034 
2035 	/* we know that we are only called with inode->i_size > 0.
2036 	 ** we also know that a file tail can never be as big as a block
2037 	 ** If i_size % blocksize == 0, our file is currently block aligned
2038 	 ** and it won't need converting or zeroing after a truncate.
2039 	 */
2040 	if ((offset & (blocksize - 1)) == 0) {
2041 		return -ENOENT;
2042 	}
2043 	page = grab_cache_page(p_s_inode->i_mapping, index);
2044 	error = -ENOMEM;
2045 	if (!page) {
2046 		goto out;
2047 	}
2048 	/* start within the page of the last block in the file */
2049 	start = (offset / blocksize) * blocksize;
2050 
2051 	error = block_prepare_write(page, start, offset,
2052 				    reiserfs_get_block_create_0);
2053 	if (error)
2054 		goto unlock;
2055 
2056 	head = page_buffers(page);
2057 	bh = head;
2058 	do {
2059 		if (pos >= start) {
2060 			break;
2061 		}
2062 		bh = bh->b_this_page;
2063 		pos += blocksize;
2064 	} while (bh != head);
2065 
2066 	if (!buffer_uptodate(bh)) {
2067 		/* note, this should never happen, prepare_write should
2068 		 ** be taking care of this for us.  If the buffer isn't up to date,
2069 		 ** I've screwed up the code to find the buffer, or the code to
2070 		 ** call prepare_write
2071 		 */
2072 		reiserfs_warning(p_s_inode->i_sb,
2073 				 "clm-6000: error reading block %lu on dev %s",
2074 				 bh->b_blocknr,
2075 				 reiserfs_bdevname(p_s_inode->i_sb));
2076 		error = -EIO;
2077 		goto unlock;
2078 	}
2079 	*bh_result = bh;
2080 	*page_result = page;
2081 
2082       out:
2083 	return error;
2084 
2085       unlock:
2086 	unlock_page(page);
2087 	page_cache_release(page);
2088 	return error;
2089 }
2090 
2091 /*
2092 ** vfs version of truncate file.  Must NOT be called with
2093 ** a transaction already started.
2094 **
2095 ** some code taken from block_truncate_page
2096 */
2097 int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2098 {
2099 	struct reiserfs_transaction_handle th;
2100 	/* we want the offset for the first byte after the end of the file */
2101 	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
2102 	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
2103 	unsigned length;
2104 	struct page *page = NULL;
2105 	int error;
2106 	struct buffer_head *bh = NULL;
2107 	int err2;
2108 
2109 	reiserfs_write_lock(p_s_inode->i_sb);
2110 
2111 	if (p_s_inode->i_size > 0) {
2112 		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
2113 			// -ENOENT means we truncated past the end of the file,
2114 			// and get_block_create_0 could not find a block to read in,
2115 			// which is ok.
2116 			if (error != -ENOENT)
2117 				reiserfs_warning(p_s_inode->i_sb,
2118 						 "clm-6001: grab_tail_page failed %d",
2119 						 error);
2120 			page = NULL;
2121 			bh = NULL;
2122 		}
2123 	}
2124 
2125 	/* so, if page != NULL, we have a buffer head for the offset at
2126 	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2127 	 ** then we have an unformatted node.  Otherwise, we have a direct item,
2128 	 ** and no zeroing is required on disk.  We zero after the truncate,
2129 	 ** because the truncate might pack the item anyway
2130 	 ** (it will unmap bh if it packs).
2131 	 */
2132 	/* it is enough to reserve space in transaction for 2 balancings:
2133 	   one for "save" link adding and another for the first
2134 	   cut_from_item. 1 is for update_sd */
2135 	error = journal_begin(&th, p_s_inode->i_sb,
2136 			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
2137 	if (error)
2138 		goto out;
2139 	reiserfs_update_inode_transaction(p_s_inode);
2140 	if (update_timestamps)
2141 		/* we are doing real truncate: if the system crashes before the last
2142 		   transaction of truncating gets committed - on reboot the file
2143 		   either appears truncated properly or not truncated at all */
2144 		add_save_link(&th, p_s_inode, 1);
2145 	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
2146 	error =
2147 	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2148 	if (error)
2149 		goto out;
2150 
2151 	/* check reiserfs_do_truncate after ending the transaction */
2152 	if (err2) {
2153 		error = err2;
2154   		goto out;
2155 	}
2156 
2157 	if (update_timestamps) {
2158 		error = remove_save_link(p_s_inode, 1 /* truncate */ );
2159 		if (error)
2160 			goto out;
2161 	}
2162 
2163 	if (page) {
2164 		length = offset & (blocksize - 1);
2165 		/* if we are not on a block boundary */
2166 		if (length) {
2167 			char *kaddr;
2168 
2169 			length = blocksize - length;
2170 			kaddr = kmap_atomic(page, KM_USER0);
2171 			memset(kaddr + offset, 0, length);
2172 			flush_dcache_page(page);
2173 			kunmap_atomic(kaddr, KM_USER0);
2174 			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2175 				mark_buffer_dirty(bh);
2176 			}
2177 		}
2178 		unlock_page(page);
2179 		page_cache_release(page);
2180 	}
2181 
2182 	reiserfs_write_unlock(p_s_inode->i_sb);
2183 	return 0;
2184       out:
2185 	if (page) {
2186 		unlock_page(page);
2187 		page_cache_release(page);
2188 	}
2189 	reiserfs_write_unlock(p_s_inode->i_sb);
2190 	return error;
2191 }
2192 
/*
 * map_block_for_writepage - work out where one block of a page lives on disk
 * @inode:     file the page belongs to
 * @bh_result: buffer head in the page for the block; must be uptodate
 * @block:     file-relative block number covered by bh_result
 *
 * Searches the tree for the item holding the block:
 *  - indirect item with a nonzero block pointer: bh_result is mapped to
 *    that block;
 *  - indirect item with a zero pointer (a hole), or position not found:
 *    reiserfs_get_block() is called with GET_BLOCK_CREATE to fill it in;
 *  - direct item: a transaction is started, the page data is copied into
 *    the item inside the tree buffer and logged, and bh_result is mapped
 *    to block 0.
 *
 * Returns 0 on success, -EIO if bh_result is not uptodate or the item is
 * neither direct nor indirect, or an error from journal_begin,
 * journal_end or reiserfs_get_block.
 */
2193 static int map_block_for_writepage(struct inode *inode,
2194 				   struct buffer_head *bh_result,
2195 				   unsigned long block)
2196 {
2197 	struct reiserfs_transaction_handle th;
2198 	int fs_gen;
2199 	struct item_head tmp_ih;
2200 	struct item_head *ih;
2201 	struct buffer_head *bh;
2202 	__le32 *item;
2203 	struct cpu_key key;
2204 	INITIALIZE_PATH(path);
2205 	int pos_in_item;
2206 	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
	/* key offset searched for: byte position of the block start, plus 1 */
2207 	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
2208 	int retval;
2209 	int use_get_block = 0;
2210 	int bytes_copied = 0;
2211 	int copy_size;
2212 	int trans_running = 0;
2213 
2214 	/* catch places below that try to log something without starting a trans */
2215 	th.t_trans_id = 0;
2216 
2217 	if (!buffer_uptodate(bh_result)) {
2218 		return -EIO;
2219 	}
2220 
	/* mapped for the page_address() use in the direct item branch;
	 * released by the kunmap() after the get_block fallback below */
2221 	kmap(bh_result->b_page);
2222       start_over:
2223 	reiserfs_write_lock(inode->i_sb);
2224 	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
2225 
2226       research:
2227 	retval = search_for_position_by_key(inode->i_sb, &key, &path);
2228 	if (retval != POSITION_FOUND) {
		/* nothing in the tree at this offset, let get_block create it */
2229 		use_get_block = 1;
2230 		goto out;
2231 	}
2232 
2233 	bh = get_last_bh(&path);
2234 	ih = get_ih(&path);
2235 	item = get_item(&path);
2236 	pos_in_item = path.pos_in_item;
2237 
2238 	/* we've found an unformatted node */
2239 	if (indirect_item_found(retval, ih)) {
2240 		if (bytes_copied > 0) {
2241 			reiserfs_warning(inode->i_sb,
2242 					 "clm-6002: bytes_copied %d",
2243 					 bytes_copied);
2244 		}
2245 		if (!get_block_num(item, pos_in_item)) {
2246 			/* crap, we are writing to a hole */
2247 			use_get_block = 1;
2248 			goto out;
2249 		}
2250 		set_block_dev_mapped(bh_result,
2251 				     get_block_num(item, pos_in_item), inode);
2252 	} else if (is_direct_le_ih(ih)) {
2253 		char *p;
		/* p points at this block's data inside the page */
2254 		p = page_address(bh_result->b_page);
2255 		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
2256 		copy_size = ih_item_len(ih) - pos_in_item;
2257 
		/* remember generation and item head so a tree change during
		 * the calls below can be detected */
2258 		fs_gen = get_generation(inode->i_sb);
2259 		copy_item_head(&tmp_ih, ih);
2260 
2261 		if (!trans_running) {
2262 			/* vs-3050 is gone, no need to drop the path */
2263 			retval = journal_begin(&th, inode->i_sb, jbegin_count);
2264 			if (retval)
2265 				goto out;
2266 			reiserfs_update_inode_transaction(inode);
2267 			trans_running = 1;
2268 			if (fs_changed(fs_gen, inode->i_sb)
2269 			    && item_moved(&tmp_ih, &path)) {
2270 				reiserfs_restore_prepared_buffer(inode->i_sb,
2271 								 bh);
2272 				goto research;
2273 			}
2274 		}
2275 
2276 		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
2277 
2278 		if (fs_changed(fs_gen, inode->i_sb)
2279 		    && item_moved(&tmp_ih, &path)) {
2280 			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
2281 			goto research;
2282 		}
2283 
		/* copy page data into the direct item held in the tree buffer */
2284 		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
2285 		       copy_size);
2286 
2287 		journal_mark_dirty(&th, inode->i_sb, bh);
2288 		bytes_copied += copy_size;
		/* block number 0 marks the buffer as backed by a direct item */
2289 		set_block_dev_mapped(bh_result, 0, inode);
2290 
2291 		/* are there still bytes left? */
2292 		if (bytes_copied < bh_result->b_size &&
2293 		    (byte_offset + bytes_copied) < inode->i_size) {
2294 			set_cpu_key_k_offset(&key,
2295 					     cpu_key_k_offset(&key) +
2296 					     copy_size);
2297 			goto research;
2298 		}
2299 	} else {
2300 		reiserfs_warning(inode->i_sb,
2301 				 "clm-6003: bad item inode %lu, device %s",
2302 				 inode->i_ino, reiserfs_bdevname(inode->i_sb));
2303 		retval = -EIO;
2304 		goto out;
2305 	}
2306 	retval = 0;
2307 
2308       out:
2309 	pathrelse(&path);
2310 	if (trans_running) {
2311 		int err = journal_end(&th, inode->i_sb, jbegin_count);
2312 		if (err)
2313 			retval = err;
2314 		trans_running = 0;
2315 	}
2316 	reiserfs_write_unlock(inode->i_sb);
2317 
2318 	/* this is where we fill in holes in the file. */
2319 	if (use_get_block) {
		/* note: this overwrites whatever retval the search left behind */
2320 		retval = reiserfs_get_block(inode, block, bh_result,
2321 					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
2322 					    | GET_BLOCK_NO_DANGLE);
2323 		if (!retval) {
2324 			if (!buffer_mapped(bh_result)
2325 			    || bh_result->b_blocknr == 0) {
2326 				/* get_block failed to find a mapped unformatted node. */
2327 				use_get_block = 0;
2328 				goto start_over;
2329 			}
2330 		}
2331 	}
2332 	kunmap(bh_result->b_page);
2333 
2334 	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2335 		/* we've copied data from the page into the direct item, so the
2336 		 * buffer in the page is now clean, mark it to reflect that.
2337 		 */
2338 		lock_buffer(bh_result);
2339 		clear_buffer_dirty(bh_result);
2340 		unlock_buffer(bh_result);
2341 	}
2342 	return retval;
2343 }
2344 
2345 /*
2346  * mason@suse.com: updated in 2.5.54 to follow the same general io
2347  * start/recovery path as __block_write_full_page, along with special
2348  * code to handle reiserfs tails.
2349  */
/*
 * reiserfs_write_full_page - write out one locked page of a file
 * @page: page to write
 * @wbc:  writeback control; sync_mode and nonblocking decide whether busy
 *        buffers are waited for or the page is redirtied
 *
 * Steps, in order:
 *  1. attach buffers to the page if it has none;
 *  2. on the page containing EOF, zero everything past i_size (a page
 *     entirely past EOF is just unlocked and 0 is returned);
 *  3. map every buffer that is unmapped or points at a direct item
 *     (block 0) via map_block_for_writepage();
 *  4. if PageChecked was set on entry, log the mapped buffers in a
 *     transaction; otherwise lock them and flag the dirty ones for
 *     async write;
 *  5. mark the page under writeback, unlock it and submit the buffers.
 *
 * Returns 0 on success or the error that sent us to the fail path.
 */
2350 static int reiserfs_write_full_page(struct page *page,
2351 				    struct writeback_control *wbc)
2352 {
2353 	struct inode *inode = page->mapping->host;
2354 	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2355 	int error = 0;
2356 	unsigned long block;
2357 	struct buffer_head *head, *bh;
2358 	int partial = 0;
2359 	int nr = 0;
	/* sampled once up front; the flag itself is cleared further down */
2360 	int checked = PageChecked(page);
2361 	struct reiserfs_transaction_handle th;
2362 	struct super_block *s = inode->i_sb;
2363 	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2364 	th.t_trans_id = 0;
2365 
2366 	/* The page dirty bit is cleared before writepage is called, which
2367 	 * means we have to tell create_empty_buffers to make dirty buffers
2368 	 * The page really should be up to date at this point, so tossing
2369 	 * in the BH_Uptodate is just a sanity check.
2370 	 */
2371 	if (!page_has_buffers(page)) {
2372 		create_empty_buffers(page, s->s_blocksize,
2373 				     (1 << BH_Dirty) | (1 << BH_Uptodate));
2374 	}
2375 	head = page_buffers(page);
2376 
2377 	/* last page in the file, zero out any contents past the
2378 	 ** last byte in the file
2379 	 */
2380 	if (page->index >= end_index) {
2381 		char *kaddr;
2382 		unsigned last_offset;
2383 
2384 		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
2385 		/* no file contents in this page */
2386 		if (page->index >= end_index + 1 || !last_offset) {
2387 			unlock_page(page);
2388 			return 0;
2389 		}
2390 		kaddr = kmap_atomic(page, KM_USER0);
2391 		memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
2392 		flush_dcache_page(page);
2393 		kunmap_atomic(kaddr, KM_USER0);
2394 	}
2395 	bh = head;
	/* file-relative number of the first block in this page */
2396 	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
2397 	/* first map all the buffers, logging any direct items we find */
2398 	do {
2399 		if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2400 						      (buffer_mapped(bh)
2401 						       && bh->b_blocknr ==
2402 						       0))) {
2403 			/* not mapped yet, or it points to a direct item, search
2404 			 * the btree for the mapping info, and log any direct
2405 			 * items found
2406 			 */
2407 			if ((error = map_block_for_writepage(inode, bh, block))) {
2408 				goto fail;
2409 			}
2410 		}
2411 		bh = bh->b_this_page;
2412 		block++;
2413 	} while (bh != head);
2414 
2415 	/*
2416 	 * we start the transaction after map_block_for_writepage,
2417 	 * because it can create holes in the file (an unbounded operation).
2418 	 * starting it here, we can make a reliable estimate for how many
2419 	 * blocks we're going to log
2420 	 */
2421 	if (checked) {
2422 		ClearPageChecked(page);
2423 		reiserfs_write_lock(s);
2424 		error = journal_begin(&th, s, bh_per_page + 1);
2425 		if (error) {
2426 			reiserfs_write_unlock(s);
2427 			goto fail;
2428 		}
2429 		reiserfs_update_inode_transaction(inode);
2430 	}
2431 	/* now go through and lock any dirty buffers on the page */
	/* every buffer gets a reference here (also on the "continue"
	 * paths); the submit loop below drops it again with put_bh() */
2432 	do {
2433 		get_bh(bh);
2434 		if (!buffer_mapped(bh))
2435 			continue;
2436 		if (buffer_mapped(bh) && bh->b_blocknr == 0)
2437 			continue;
2438 
2439 		if (checked) {
			/* logged through the journal instead of being
			 * flagged for async write */
2440 			reiserfs_prepare_for_journal(s, bh, 1);
2441 			journal_mark_dirty(&th, s, bh);
2442 			continue;
2443 		}
2444 		/* from this point on, we know the buffer is mapped to a
2445 		 * real block and not a direct item
2446 		 */
2447 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2448 			lock_buffer(bh);
2449 		} else {
			/* nonblocking writeback: don't wait for a locked
			 * buffer, redirty the page and move on */
2450 			if (test_set_buffer_locked(bh)) {
2451 				redirty_page_for_writepage(wbc, page);
2452 				continue;
2453 			}
2454 		}
2455 		if (test_clear_buffer_dirty(bh)) {
2456 			mark_buffer_async_write(bh);
2457 		} else {
2458 			unlock_buffer(bh);
2459 		}
2460 	} while ((bh = bh->b_this_page) != head);
2461 
2462 	if (checked) {
2463 		error = journal_end(&th, s, bh_per_page + 1);
2464 		reiserfs_write_unlock(s);
2465 		if (error)
2466 			goto fail;
2467 	}
2468 	BUG_ON(PageWriteback(page));
2469 	set_page_writeback(page);
2470 	unlock_page(page);
2471 
2472 	/*
2473 	 * since any buffer might be the only dirty buffer on the page,
2474 	 * the first submit_bh can bring the page out of writeback.
2475 	 * be careful with the buffers.
2476 	 */
2477 	do {
2478 		struct buffer_head *next = bh->b_this_page;
2479 		if (buffer_async_write(bh)) {
2480 			submit_bh(WRITE, bh);
2481 			nr++;
2482 		}
2483 		put_bh(bh);
2484 		bh = next;
2485 	} while (bh != head);
2486 
2487 	error = 0;
2488       done:
2489 	if (nr == 0) {
2490 		/*
2491 		 * if this page only had a direct item, it is very possible for
2492 		 * no io to be required without there being an error.  Or,
2493 		 * someone else could have locked them and sent them down the
2494 		 * pipe without locking the page
2495 		 */
2496 		bh = head;
2497 		do {
2498 			if (!buffer_uptodate(bh)) {
2499 				partial = 1;
2500 				break;
2501 			}
2502 			bh = bh->b_this_page;
2503 		} while (bh != head);
2504 		if (!partial)
2505 			SetPageUptodate(page);
2506 		end_page_writeback(page);
2507 	}
2508 	return error;
2509 
2510       fail:
2511 	/* catches various errors, we need to make sure any valid dirty blocks
2512 	 * get to the media.  The page is currently locked and not marked for
2513 	 * writeback
2514 	 */
	/* NOTE(review): when we arrive here from the journal_end failure
	 * above, the locking loop has already done get_bh() on every
	 * buffer; the loop below takes a second reference but only one
	 * put_bh() follows -- confirm whether a reference is leaked on
	 * that path. */
2515 	ClearPageUptodate(page);
2516 	bh = head;
2517 	do {
2518 		get_bh(bh);
2519 		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2520 			lock_buffer(bh);
2521 			mark_buffer_async_write(bh);
2522 		} else {
2523 			/*
2524 			 * clear any dirty bits that might have come from getting
2525 			 * attached to a dirty page
2526 			 */
2527 			clear_buffer_dirty(bh);
2528 		}
2529 		bh = bh->b_this_page;
2530 	} while (bh != head);
2531 	SetPageError(page);
2532 	BUG_ON(PageWriteback(page));
2533 	set_page_writeback(page);
2534 	unlock_page(page);
2535 	do {
2536 		struct buffer_head *next = bh->b_this_page;
2537 		if (buffer_async_write(bh)) {
2538 			clear_buffer_dirty(bh);
2539 			submit_bh(WRITE, bh);
2540 			nr++;
2541 		}
2542 		put_bh(bh);
2543 		bh = next;
2544 	} while (bh != head);
	/* error still holds the failure code; done: only finishes writeback */
2545 	goto done;
2546 }
2547 
2548 static int reiserfs_readpage(struct file *f, struct page *page)
2549 {
2550 	return block_read_full_page(page, reiserfs_get_block);
2551 }
2552 
2553 static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2554 {
2555 	struct inode *inode = page->mapping->host;
2556 	reiserfs_wait_on_write_block(inode->i_sb);
2557 	return reiserfs_write_full_page(page, wbc);
2558 }
2559 
2560 static int reiserfs_prepare_write(struct file *f, struct page *page,
2561 				  unsigned from, unsigned to)
2562 {
2563 	struct inode *inode = page->mapping->host;
2564 	int ret;
2565 	int old_ref = 0;
2566 
2567 	reiserfs_wait_on_write_block(inode->i_sb);
2568 	fix_tail_page_for_writing(page);
2569 	if (reiserfs_transaction_running(inode->i_sb)) {
2570 		struct reiserfs_transaction_handle *th;
2571 		th = (struct reiserfs_transaction_handle *)current->
2572 		    journal_info;
2573 		BUG_ON(!th->t_refcount);
2574 		BUG_ON(!th->t_trans_id);
2575 		old_ref = th->t_refcount;
2576 		th->t_refcount++;
2577 	}
2578 
2579 	ret = block_prepare_write(page, from, to, reiserfs_get_block);
2580 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2581 		struct reiserfs_transaction_handle *th = current->journal_info;
2582 		/* this gets a little ugly.  If reiserfs_get_block returned an
2583 		 * error and left a transacstion running, we've got to close it,
2584 		 * and we've got to free handle if it was a persistent transaction.
2585 		 *
2586 		 * But, if we had nested into an existing transaction, we need
2587 		 * to just drop the ref count on the handle.
2588 		 *
2589 		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2590 		 * and it was a persistent trans.  Otherwise, it was nested above.
2591 		 */
2592 		if (th->t_refcount > old_ref) {
2593 			if (old_ref)
2594 				th->t_refcount--;
2595 			else {
2596 				int err;
2597 				reiserfs_write_lock(inode->i_sb);
2598 				err = reiserfs_end_persistent_transaction(th);
2599 				reiserfs_write_unlock(inode->i_sb);
2600 				if (err)
2601 					ret = err;
2602 			}
2603 		}
2604 	}
2605 	return ret;
2606 
2607 }
2608 
2609 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
2610 {
2611 	return generic_block_bmap(as, block, reiserfs_bmap);
2612 }
2613 
2614 static int reiserfs_commit_write(struct file *f, struct page *page,
2615 				 unsigned from, unsigned to)
2616 {
2617 	struct inode *inode = page->mapping->host;
2618 	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
2619 	int ret = 0;
2620 	int update_sd = 0;
2621 	struct reiserfs_transaction_handle *th = NULL;
2622 
2623 	reiserfs_wait_on_write_block(inode->i_sb);
2624 	if (reiserfs_transaction_running(inode->i_sb)) {
2625 		th = current->journal_info;
2626 	}
2627 	reiserfs_commit_page(inode, page, from, to);
2628 
2629 	/* generic_commit_write does this for us, but does not update the
2630 	 ** transaction tracking stuff when the size changes.  So, we have
2631 	 ** to do the i_size updates here.
2632 	 */
2633 	if (pos > inode->i_size) {
2634 		struct reiserfs_transaction_handle myth;
2635 		reiserfs_write_lock(inode->i_sb);
2636 		/* If the file have grown beyond the border where it
2637 		   can have a tail, unmark it as needing a tail
2638 		   packing */
2639 		if ((have_large_tails(inode->i_sb)
2640 		     && inode->i_size > i_block_size(inode) * 4)
2641 		    || (have_small_tails(inode->i_sb)
2642 			&& inode->i_size > i_block_size(inode)))
2643 			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2644 
2645 		ret = journal_begin(&myth, inode->i_sb, 1);
2646 		if (ret) {
2647 			reiserfs_write_unlock(inode->i_sb);
2648 			goto journal_error;
2649 		}
2650 		reiserfs_update_inode_transaction(inode);
2651 		inode->i_size = pos;
2652 		/*
2653 		 * this will just nest into our transaction.  It's important
2654 		 * to use mark_inode_dirty so the inode gets pushed around on the
2655 		 * dirty lists, and so that O_SYNC works as expected
2656 		 */
2657 		mark_inode_dirty(inode);
2658 		reiserfs_update_sd(&myth, inode);
2659 		update_sd = 1;
2660 		ret = journal_end(&myth, inode->i_sb, 1);
2661 		reiserfs_write_unlock(inode->i_sb);
2662 		if (ret)
2663 			goto journal_error;
2664 	}
2665 	if (th) {
2666 		reiserfs_write_lock(inode->i_sb);
2667 		if (!update_sd)
2668 			mark_inode_dirty(inode);
2669 		ret = reiserfs_end_persistent_transaction(th);
2670 		reiserfs_write_unlock(inode->i_sb);
2671 		if (ret)
2672 			goto out;
2673 	}
2674 
2675       out:
2676 	return ret;
2677 
2678       journal_error:
2679 	if (th) {
2680 		reiserfs_write_lock(inode->i_sb);
2681 		if (!update_sd)
2682 			reiserfs_update_sd(th, inode);
2683 		ret = reiserfs_end_persistent_transaction(th);
2684 		reiserfs_write_unlock(inode->i_sb);
2685 	}
2686 
2687 	return ret;
2688 }
2689 
2690 void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
2691 {
2692 	if (reiserfs_attrs(inode->i_sb)) {
2693 		if (sd_attrs & REISERFS_SYNC_FL)
2694 			inode->i_flags |= S_SYNC;
2695 		else
2696 			inode->i_flags &= ~S_SYNC;
2697 		if (sd_attrs & REISERFS_IMMUTABLE_FL)
2698 			inode->i_flags |= S_IMMUTABLE;
2699 		else
2700 			inode->i_flags &= ~S_IMMUTABLE;
2701 		if (sd_attrs & REISERFS_APPEND_FL)
2702 			inode->i_flags |= S_APPEND;
2703 		else
2704 			inode->i_flags &= ~S_APPEND;
2705 		if (sd_attrs & REISERFS_NOATIME_FL)
2706 			inode->i_flags |= S_NOATIME;
2707 		else
2708 			inode->i_flags &= ~S_NOATIME;
2709 		if (sd_attrs & REISERFS_NOTAIL_FL)
2710 			REISERFS_I(inode)->i_flags |= i_nopack_mask;
2711 		else
2712 			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2713 	}
2714 }
2715 
2716 void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
2717 {
2718 	if (reiserfs_attrs(inode->i_sb)) {
2719 		if (inode->i_flags & S_IMMUTABLE)
2720 			*sd_attrs |= REISERFS_IMMUTABLE_FL;
2721 		else
2722 			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2723 		if (inode->i_flags & S_SYNC)
2724 			*sd_attrs |= REISERFS_SYNC_FL;
2725 		else
2726 			*sd_attrs &= ~REISERFS_SYNC_FL;
2727 		if (inode->i_flags & S_NOATIME)
2728 			*sd_attrs |= REISERFS_NOATIME_FL;
2729 		else
2730 			*sd_attrs &= ~REISERFS_NOATIME_FL;
2731 		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
2732 			*sd_attrs |= REISERFS_NOTAIL_FL;
2733 		else
2734 			*sd_attrs &= ~REISERFS_NOTAIL_FL;
2735 	}
2736 }
2737 
2738 /* decide if this buffer needs to stay around for data logging or ordered
2739 ** write purposes
2740 */
2741 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2742 {
2743 	int ret = 1;
2744 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2745 
2746 	spin_lock(&j->j_dirty_buffers_lock);
2747 	if (!buffer_mapped(bh)) {
2748 		goto free_jh;
2749 	}
2750 	/* the page is locked, and the only places that log a data buffer
2751 	 * also lock the page.
2752 	 */
2753 	if (reiserfs_file_data_log(inode)) {
2754 		/*
2755 		 * very conservative, leave the buffer pinned if
2756 		 * anyone might need it.
2757 		 */
2758 		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2759 			ret = 0;
2760 		}
2761 	} else if (buffer_dirty(bh) || buffer_locked(bh)) {
2762 		struct reiserfs_journal_list *jl;
2763 		struct reiserfs_jh *jh = bh->b_private;
2764 
2765 		/* why is this safe?
2766 		 * reiserfs_setattr updates i_size in the on disk
2767 		 * stat data before allowing vmtruncate to be called.
2768 		 *
2769 		 * If buffer was put onto the ordered list for this
2770 		 * transaction, we know for sure either this transaction
2771 		 * or an older one already has updated i_size on disk,
2772 		 * and this ordered data won't be referenced in the file
2773 		 * if we crash.
2774 		 *
2775 		 * if the buffer was put onto the ordered list for an older
2776 		 * transaction, we need to leave it around
2777 		 */
2778 		if (jh && (jl = jh->jl)
2779 		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2780 			ret = 0;
2781 	}
2782       free_jh:
2783 	if (ret && bh->b_private) {
2784 		reiserfs_free_jh(bh);
2785 	}
2786 	spin_unlock(&j->j_dirty_buffers_lock);
2787 	return ret;
2788 }
2789 
2790 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2791 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2792 {
2793 	struct buffer_head *head, *bh, *next;
2794 	struct inode *inode = page->mapping->host;
2795 	unsigned int curr_off = 0;
2796 	int ret = 1;
2797 
2798 	BUG_ON(!PageLocked(page));
2799 
2800 	if (offset == 0)
2801 		ClearPageChecked(page);
2802 
2803 	if (!page_has_buffers(page))
2804 		goto out;
2805 
2806 	head = page_buffers(page);
2807 	bh = head;
2808 	do {
2809 		unsigned int next_off = curr_off + bh->b_size;
2810 		next = bh->b_this_page;
2811 
2812 		/*
2813 		 * is this block fully invalidated?
2814 		 */
2815 		if (offset <= curr_off) {
2816 			if (invalidatepage_can_drop(inode, bh))
2817 				reiserfs_unmap_buffer(bh);
2818 			else
2819 				ret = 0;
2820 		}
2821 		curr_off = next_off;
2822 		bh = next;
2823 	} while (bh != head);
2824 
2825 	/*
2826 	 * We release buffers only if the entire page is being invalidated.
2827 	 * The get_block cached value has been unconditionally invalidated,
2828 	 * so real IO is not possible anymore.
2829 	 */
2830 	if (!offset && ret)
2831 		ret = try_to_release_page(page, 0);
2832       out:
2833 	return ret;
2834 }
2835 
2836 static int reiserfs_set_page_dirty(struct page *page)
2837 {
2838 	struct inode *inode = page->mapping->host;
2839 	if (reiserfs_file_data_log(inode)) {
2840 		SetPageChecked(page);
2841 		return __set_page_dirty_nobuffers(page);
2842 	}
2843 	return __set_page_dirty_buffers(page);
2844 }
2845 
2846 /*
2847  * Returns 1 if the page's buffers were dropped.  The page is locked.
2848  *
2849  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2850  * in the buffers at page_buffers(page).
2851  *
2852  * even in -o notail mode, we can't be sure an old mount without -o notail
2853  * didn't create files with tails.
2854  */
2855 static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
2856 {
2857 	struct inode *inode = page->mapping->host;
2858 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2859 	struct buffer_head *head;
2860 	struct buffer_head *bh;
2861 	int ret = 1;
2862 
2863 	WARN_ON(PageChecked(page));
2864 	spin_lock(&j->j_dirty_buffers_lock);
2865 	head = page_buffers(page);
2866 	bh = head;
2867 	do {
2868 		if (bh->b_private) {
2869 			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2870 				reiserfs_free_jh(bh);
2871 			} else {
2872 				ret = 0;
2873 				break;
2874 			}
2875 		}
2876 		bh = bh->b_this_page;
2877 	} while (bh != head);
2878 	if (ret)
2879 		ret = try_to_free_buffers(page);
2880 	spin_unlock(&j->j_dirty_buffers_lock);
2881 	return ret;
2882 }
2883 
2884 /* We thank Mingming Cao for helping us understand in great detail what
2885    to do in this section of the code. */
2886 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2887 				  const struct iovec *iov, loff_t offset,
2888 				  unsigned long nr_segs)
2889 {
2890 	struct file *file = iocb->ki_filp;
2891 	struct inode *inode = file->f_mapping->host;
2892 
2893 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2894 				  offset, nr_segs,
2895 				  reiserfs_get_blocks_direct_io, NULL);
2896 }
2897 
2898 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
2899 {
2900 	struct inode *inode = dentry->d_inode;
2901 	int error;
2902 	unsigned int ia_valid = attr->ia_valid;
2903 	reiserfs_write_lock(inode->i_sb);
2904 	if (attr->ia_valid & ATTR_SIZE) {
2905 		/* version 2 items will be caught by the s_maxbytes check
2906 		 ** done for us in vmtruncate
2907 		 */
2908 		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2909 		    attr->ia_size > MAX_NON_LFS) {
2910 			error = -EFBIG;
2911 			goto out;
2912 		}
2913 		/* fill in hole pointers in the expanding truncate case. */
2914 		if (attr->ia_size > inode->i_size) {
2915 			error = generic_cont_expand(inode, attr->ia_size);
2916 			if (REISERFS_I(inode)->i_prealloc_count > 0) {
2917 				int err;
2918 				struct reiserfs_transaction_handle th;
2919 				/* we're changing at most 2 bitmaps, inode + super */
2920 				err = journal_begin(&th, inode->i_sb, 4);
2921 				if (!err) {
2922 					reiserfs_discard_prealloc(&th, inode);
2923 					err = journal_end(&th, inode->i_sb, 4);
2924 				}
2925 				if (err)
2926 					error = err;
2927 			}
2928 			if (error)
2929 				goto out;
2930 		}
2931 	}
2932 
2933 	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2934 	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2935 	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
2936 		/* stat data of format v3.5 has 16 bit uid and gid */
2937 		error = -EINVAL;
2938 		goto out;
2939 	}
2940 
2941 	error = inode_change_ok(inode, attr);
2942 	if (!error) {
2943 		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2944 		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2945 			error = reiserfs_chown_xattrs(inode, attr);
2946 
2947 			if (!error) {
2948 				struct reiserfs_transaction_handle th;
2949 				int jbegin_count =
2950 				    2 *
2951 				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
2952 				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
2953 				    2;
2954 
2955 				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
2956 				error =
2957 				    journal_begin(&th, inode->i_sb,
2958 						  jbegin_count);
2959 				if (error)
2960 					goto out;
2961 				error =
2962 				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2963 				if (error) {
2964 					journal_end(&th, inode->i_sb,
2965 						    jbegin_count);
2966 					goto out;
2967 				}
2968 				/* Update corresponding info in inode so that everything is in
2969 				 * one transaction */
2970 				if (attr->ia_valid & ATTR_UID)
2971 					inode->i_uid = attr->ia_uid;
2972 				if (attr->ia_valid & ATTR_GID)
2973 					inode->i_gid = attr->ia_gid;
2974 				mark_inode_dirty(inode);
2975 				error =
2976 				    journal_end(&th, inode->i_sb, jbegin_count);
2977 			}
2978 		}
2979 		if (!error)
2980 			error = inode_setattr(inode, attr);
2981 	}
2982 
2983 	if (!error && reiserfs_posixacl(inode->i_sb)) {
2984 		if (attr->ia_valid & ATTR_MODE)
2985 			error = reiserfs_acl_chmod(inode);
2986 	}
2987 
2988       out:
2989 	reiserfs_write_unlock(inode->i_sb);
2990 	return error;
2991 }
2992 
2993 struct address_space_operations reiserfs_address_space_operations = {
2994 	.writepage = reiserfs_writepage,
2995 	.readpage = reiserfs_readpage,
2996 	.readpages = reiserfs_readpages,
2997 	.releasepage = reiserfs_releasepage,
2998 	.invalidatepage = reiserfs_invalidatepage,
2999 	.sync_page = block_sync_page,
3000 	.prepare_write = reiserfs_prepare_write,
3001 	.commit_write = reiserfs_commit_write,
3002 	.bmap = reiserfs_aop_bmap,
3003 	.direct_IO = reiserfs_direct_IO,
3004 	.set_page_dirty = reiserfs_set_page_dirty,
3005 };
3006