xref: /openbmc/linux/fs/reiserfs/inode.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4 
5 #include <linux/config.h>
6 #include <linux/time.h>
7 #include <linux/fs.h>
8 #include <linux/reiserfs_fs.h>
9 #include <linux/reiserfs_acl.h>
10 #include <linux/reiserfs_xattr.h>
11 #include <linux/smp_lock.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <asm/uaccess.h>
15 #include <asm/unaligned.h>
16 #include <linux/buffer_head.h>
17 #include <linux/mpage.h>
18 #include <linux/writeback.h>
19 #include <linux/quotaops.h>
20 
21 extern int reiserfs_default_io_size;	/* default io size defined in super.c */
22 
23 static int reiserfs_commit_write(struct file *f, struct page *page,
24 				 unsigned from, unsigned to);
25 static int reiserfs_prepare_write(struct file *f, struct page *page,
26 				  unsigned from, unsigned to);
27 
28 void reiserfs_delete_inode(struct inode *inode)
29 {
30 	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 	int jbegin_count =
32 	    JOURNAL_PER_BALANCE_CNT * 2 +
33 	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
34 	struct reiserfs_transaction_handle th;
35 
36 	truncate_inode_pages(&inode->i_data, 0);
37 
38 	reiserfs_write_lock(inode->i_sb);
39 
40 	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
42 		down(&inode->i_sem);
43 
44 		reiserfs_delete_xattrs(inode);
45 
46 		if (journal_begin(&th, inode->i_sb, jbegin_count)) {
47 			up(&inode->i_sem);
48 			goto out;
49 		}
50 		reiserfs_update_inode_transaction(inode);
51 
52 		if (reiserfs_delete_object(&th, inode)) {
53 			up(&inode->i_sem);
54 			goto out;
55 		}
56 
57 		/* Do quota update inside a transaction for journaled quotas. We must do that
58 		 * after delete_object so that quota updates go into the same transaction as
59 		 * stat data deletion */
60 		DQUOT_FREE_INODE(inode);
61 
62 		if (journal_end(&th, inode->i_sb, jbegin_count)) {
63 			up(&inode->i_sem);
64 			goto out;
65 		}
66 
67 		up(&inode->i_sem);
68 
69 		/* all items of file are deleted, so we can remove "save" link */
70 		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
71 								 * about an error here */
72 	} else {
73 		/* no object items are in the tree */
74 		;
75 	}
76       out:
77 	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
78 	inode->i_blocks = 0;
79 	reiserfs_write_unlock(inode->i_sb);
80 }
81 
82 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
83 			  __u32 objectid, loff_t offset, int type, int length)
84 {
85 	key->version = version;
86 
87 	key->on_disk_key.k_dir_id = dirid;
88 	key->on_disk_key.k_objectid = objectid;
89 	set_cpu_key_k_offset(key, offset);
90 	set_cpu_key_k_type(key, type);
91 	key->key_length = length;
92 }
93 
94 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
95    offset and type of key */
96 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
97 		  int type, int length)
98 {
99 	_make_cpu_key(key, get_inode_item_key_version(inode),
100 		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
101 		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
102 		      length);
103 }
104 
105 //
106 // when key is 0, do not set version and short key
107 //
108 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
109 			      int version,
110 			      loff_t offset, int type, int length,
111 			      int entry_count /*or ih_free_space */ )
112 {
113 	if (key) {
114 		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
115 		ih->ih_key.k_objectid =
116 		    cpu_to_le32(key->on_disk_key.k_objectid);
117 	}
118 	put_ih_version(ih, version);
119 	set_le_ih_k_offset(ih, offset);
120 	set_le_ih_k_type(ih, type);
121 	put_ih_item_len(ih, length);
122 	/*    set_ih_free_space (ih, 0); */
123 	// for directory items it is entry count, for directs and stat
124 	// datas - 0xffff, for indirects - 0
125 	put_ih_entry_count(ih, entry_count);
126 }
127 
128 //
129 // FIXME: we might cache recently accessed indirect item
130 
131 // Ugh.  Not too eager for that....
132 //  I cut the code until such time as I see a convincing argument (benchmark).
133 // I don't want a bloated inode struct..., and I don't like code complexity....
134 
135 /* cutting the code is fine, since it really isn't in use yet and is easy
136 ** to add back in.  But, Vladimir has a really good idea here.  Think
137 ** about what happens for reading a file.  For each page,
138 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
139 ** an indirect item.  This indirect item has X number of pointers, where
140 ** X is a big number if we've done the block allocation right.  But,
141 ** we only use one or two of these pointers during each call to readpage,
142 ** needlessly researching again later on.
143 **
144 ** The size of the cache could be dynamic based on the size of the file.
145 **
146 ** I'd also like to see us cache the location the stat data item, since
147 ** we are needlessly researching for that frequently.
148 **
149 ** --chris
150 */
151 
152 /* If this page has a file tail in it, and
153 ** it was read in by get_block_create_0, the page data is valid,
154 ** but tail is still sitting in a direct item, and we can't write to
155 ** it.  So, look through this page, and check all the mapped buffers
156 ** to make sure they have valid block numbers.  Any that don't need
157 ** to be unmapped, so that block_prepare_write will correctly call
158 ** reiserfs_get_block to convert the tail into an unformatted node
159 */
160 static inline void fix_tail_page_for_writing(struct page *page)
161 {
162 	struct buffer_head *head, *next, *bh;
163 
164 	if (page && page_has_buffers(page)) {
165 		head = page_buffers(page);
166 		bh = head;
167 		do {
168 			next = bh->b_this_page;
169 			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
170 				reiserfs_unmap_buffer(bh);
171 			}
172 			bh = next;
173 		} while (bh != head);
174 	}
175 }
176 
177 /* reiserfs_get_block does not need to allocate a block only if it has been
178    done already or non-hole position has been found in the indirect item */
179 static inline int allocation_needed(int retval, b_blocknr_t allocated,
180 				    struct item_head *ih,
181 				    __le32 * item, int pos_in_item)
182 {
183 	if (allocated)
184 		return 0;
185 	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
186 	    get_block_num(item, pos_in_item))
187 		return 0;
188 	return 1;
189 }
190 
191 static inline int indirect_item_found(int retval, struct item_head *ih)
192 {
193 	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
194 }
195 
196 static inline void set_block_dev_mapped(struct buffer_head *bh,
197 					b_blocknr_t block, struct inode *inode)
198 {
199 	map_bh(bh, inode->i_sb, block);
200 }
201 
202 //
203 // files which were created in the earlier version can not be longer,
204 // than 2 gb
205 //
206 static int file_capable(struct inode *inode, long block)
207 {
208 	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is new file.
209 	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is inside of 2gb
210 		return 1;
211 
212 	return 0;
213 }
214 
215 /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
216 				   struct inode *inode, struct path *path)
217 {
218 	struct super_block *s = th->t_super;
219 	int len = th->t_blocks_allocated;
220 	int err;
221 
222 	BUG_ON(!th->t_trans_id);
223 	BUG_ON(!th->t_refcount);
224 
225 	/* we cannot restart while nested */
226 	if (th->t_refcount > 1) {
227 		return 0;
228 	}
229 	pathrelse(path);
230 	reiserfs_update_sd(th, inode);
231 	err = journal_end(th, s, len);
232 	if (!err) {
233 		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
234 		if (!err)
235 			reiserfs_update_inode_transaction(inode);
236 	}
237 	return err;
238 }
239 
240 // Called by get_block when create == 0. Looks up the 'block'-th logical
241 // block of the file and maps bh_result to its block number. When the block
242 // lives in a direct item, it either returns -ENOENT (when called from bmap,
243 // which ignores it) or reads the direct item into its piece of the page (bh_result)
244 
245 // Please improve the english/clarity in the comment above, as it is
246 // hard to understand.
247 
/*
 * Look up the 'block'-th logical block of 'inode' without allocating
 * anything.
 *
 * inode     - file being mapped
 * block     - logical block number within the file
 * bh_result - buffer head that gets mapped on success; for a file tail its
 *             page is also filled with the tail data
 * args      - GET_BLOCK_* flags; only GET_BLOCK_NO_HOLE and
 *             GET_BLOCK_READ_DIRECT are tested here
 *
 * Returns:
 *   0        the block was mapped, or nothing was found and the hole is
 *            tolerated (bh_result is then left unmapped)
 *   -EIO     the tree search reported IO_ERROR
 *   -ENOENT  a hole was hit while GET_BLOCK_NO_HOLE is set and the page of
 *            bh_result is not uptodate, or the data sits in a direct item
 *            and GET_BLOCK_READ_DIRECT is not set
 *
 * A tail that was read from a direct item is reported by mapping bh_result
 * to block 0 and marking it uptodate.
 *
 * Both callers visible in this file take reiserfs_write_lock around the call.
 */
248 static int _get_block_create_0(struct inode *inode, long block,
249 			       struct buffer_head *bh_result, int args)
250 {
251 	INITIALIZE_PATH(path);
252 	struct cpu_key key;
253 	struct buffer_head *bh;
254 	struct item_head *ih, tmp_ih;
255 	int fs_gen;
256 	int blocknr;
257 	char *p = NULL;	/* kmap'ed address in bh_result's page; NULL until kmap is done */
258 	int chars;
259 	int ret;
260 	int result;
261 	int done = 0;	/* set when the copy below is cut short at i_size */
262 	unsigned long offset;
263 
264 	// prepare the key to look for the 'block'-th block of file
	// (the key offset is the byte offset plus 1; the 1 is subtracted again
	// below when the offset inside the page is computed)
265 	make_cpu_key(&key, inode,
266 		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
267 		     3);
268 
269       research:
270 	result = search_for_position_by_key(inode->i_sb, &key, &path);
271 	if (result != POSITION_FOUND) {
272 		pathrelse(&path);
		/* p is only non-NULL when we got here through "goto research" */
273 		if (p)
274 			kunmap(bh_result->b_page);
275 		if (result == IO_ERROR)
276 			return -EIO;
277 		// We do not return -ENOENT if there is a hole but page is uptodate, because it means
278 		// That there is some MMAPED data associated with it that is yet to be written to disk.
279 		if ((args & GET_BLOCK_NO_HOLE)
280 		    && !PageUptodate(bh_result->b_page)) {
281 			return -ENOENT;
282 		}
283 		return 0;
284 	}
285 	//
286 	bh = get_last_bh(&path);
287 	ih = get_ih(&path);
288 	if (is_indirect_le_ih(ih)) {
289 		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
290 
291 		/* FIXME: here we could cache indirect item or part of it in
292 		   the inode to avoid search_by_key in case of subsequent
293 		   access to file */
294 		blocknr = get_block_num(ind_item, path.pos_in_item);
295 		ret = 0;
296 		if (blocknr) {
297 			map_bh(bh_result, inode->i_sb, blocknr);
			/* the last pointer of the indirect item gets the boundary flag */
298 			if (path.pos_in_item ==
299 			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
300 				set_buffer_boundary(bh_result);
301 			}
302 		} else
303 			// We do not return -ENOENT if there is a hole but page is uptodate, because it means
304 			// That there is some MMAPED data associated with it that is yet to  be written to disk.
305 		if ((args & GET_BLOCK_NO_HOLE)
306 			    && !PageUptodate(bh_result->b_page)) {
307 			ret = -ENOENT;
308 		}
309 
310 		pathrelse(&path);
311 		if (p)
312 			kunmap(bh_result->b_page);
313 		return ret;
314 	}
315 	// requested data are in direct item(s)
316 	if (!(args & GET_BLOCK_READ_DIRECT)) {
317 		// we are called by bmap. FIXME: we can not map block of file
318 		// when it is stored in direct item(s)
319 		pathrelse(&path);
320 		if (p)
321 			kunmap(bh_result->b_page);
322 		return -ENOENT;
323 	}
324 
325 	/* if we've got a direct item, and the buffer or page was uptodate,
326 	 ** we don't want to pull data off disk again.  skip to the
327 	 ** end, where we map the buffer and return
328 	 */
329 	if (buffer_uptodate(bh_result)) {
330 		goto finished;
331 	} else
332 		/*
333 		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
334 		 ** pages without any buffers.  If the page is up to date, we don't want
335 		 ** read old data off disk.  Set the up to date bit on the buffer instead
336 		 ** and jump to the end
337 		 */
338 	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
339 		set_buffer_uptodate(bh_result);
340 		goto finished;
341 	}
342 	// read file tail into part of page
343 	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
344 	fs_gen = get_generation(inode->i_sb);
345 	copy_item_head(&tmp_ih, ih);
346 
347 	/* we only want to kmap if we are reading the tail into the page.
348 	 ** this is not the common case, so we don't kmap until we are
349 	 ** sure we need to.  But, this means the item might move if
350 	 ** kmap schedules
351 	 */
352 	if (!p) {
353 		p = (char *)kmap(bh_result->b_page);
354 		if (fs_changed(fs_gen, inode->i_sb)
355 		    && item_moved(&tmp_ih, &path)) {
356 			goto research;
357 		}
358 	}
359 	p += offset;
	/* zero the whole block first; the tail copied below may be shorter */
360 	memset(p, 0, inode->i_sb->s_blocksize);
361 	do {
362 		if (!is_direct_le_ih(ih)) {
363 			BUG();
364 		}
365 		/* make sure we don't read more bytes than actually exist in
366 		 ** the file.  This can happen in odd cases where i_size isn't
367 		 ** correct, and when direct item padding results in a few
368 		 ** extra bytes at the end of the direct item
369 		 */
370 		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
371 			break;
372 		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
373 			chars =
374 			    inode->i_size - (le_ih_k_offset(ih) - 1) -
375 			    path.pos_in_item;
376 			done = 1;
377 		} else {
378 			chars = ih_item_len(ih) - path.pos_in_item;
379 		}
380 		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
381 
382 		if (done)
383 			break;
384 
385 		p += chars;
386 
387 		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
388 			// we done, if read direct item is not the last item of
389 			// node FIXME: we could try to check right delimiting key
390 			// to see whether direct item continues in the right
391 			// neighbor or rely on i_size
392 			break;
393 
394 		// update key to look for the next piece
395 		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
396 		result = search_for_position_by_key(inode->i_sb, &key, &path);
397 		if (result != POSITION_FOUND)
398 			// i/o error most likely
399 			break;
400 		bh = get_last_bh(&path);
401 		ih = get_ih(&path);
402 	} while (1);
403 
404 	flush_dcache_page(bh_result->b_page);
405 	kunmap(bh_result->b_page);
406 
407       finished:
408 	pathrelse(&path);
409 
	/* 'result' still holds the outcome of the most recent tree search */
410 	if (result == IO_ERROR)
411 		return -EIO;
412 
413 	/* this buffer has valid data, but isn't valid for io.  mapping it to
414 	 * block #0 tells the rest of reiserfs it just has a tail in it
415 	 */
416 	map_bh(bh_result, inode->i_sb, 0);
417 	set_buffer_uptodate(bh_result);
418 	return 0;
419 }
420 
421 // this is called to create file map. So, _get_block_create_0 will not
422 // read direct item
423 static int reiserfs_bmap(struct inode *inode, sector_t block,
424 			 struct buffer_head *bh_result, int create)
425 {
426 	if (!file_capable(inode, block))
427 		return -EFBIG;
428 
429 	reiserfs_write_lock(inode->i_sb);
430 	/* do not read the direct item */
431 	_get_block_create_0(inode, block, bh_result, 0);
432 	reiserfs_write_unlock(inode->i_sb);
433 	return 0;
434 }
435 
436 /* special version of get_block that is only used by grab_tail_page right
437 ** now.  It is sent to block_prepare_write, and when you try to get a
438 ** block past the end of the file (or a block from a hole) it returns
439 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
440 ** be able to do i/o on the buffers returned, unless an error value
441 ** is also returned.
442 **
443 ** So, this allows block_prepare_write to be used for reading a single block
444 ** in a page.  Where it does not produce a valid page for holes, or past the
445 ** end of the file.  This turns out to be exactly what we need for reading
446 ** tails for conversion.
447 **
448 ** The point of the wrapper is forcing a certain value for create, even
449 ** though the VFS layer is calling this function with create==1.  If you
450 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
451 ** don't use this function.
452 */
453 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
454 				       struct buffer_head *bh_result,
455 				       int create)
456 {
457 	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
458 }
459 
460 /* This is special helper for reiserfs_get_block in case we are executing
461    direct_IO request. */
462 static int reiserfs_get_blocks_direct_io(struct inode *inode,
463 					 sector_t iblock,
464 					 unsigned long max_blocks,
465 					 struct buffer_head *bh_result,
466 					 int create)
467 {
468 	int ret;
469 
470 	bh_result->b_page = NULL;
471 
472 	/* We set the b_size before reiserfs_get_block call since it is
473 	   referenced in convert_tail_for_hole() that may be called from
474 	   reiserfs_get_block() */
475 	bh_result->b_size = (1 << inode->i_blkbits);
476 
477 	ret = reiserfs_get_block(inode, iblock, bh_result,
478 				 create | GET_BLOCK_NO_DANGLE);
479 	if (ret)
480 		goto out;
481 
482 	/* don't allow direct io onto tail pages */
483 	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
484 		/* make sure future calls to the direct io funcs for this offset
485 		 ** in the file fail by unmapping the buffer
486 		 */
487 		clear_buffer_mapped(bh_result);
488 		ret = -EINVAL;
489 	}
490 	/* Possible unpacked tail. Flush the data before pages have
491 	   disappeared */
492 	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
493 		int err;
494 		lock_kernel();
495 		err = reiserfs_commit_for_inode(inode);
496 		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
497 		unlock_kernel();
498 		if (err < 0)
499 			ret = err;
500 	}
501       out:
502 	return ret;
503 }
504 
505 /*
506 ** helper function for when reiserfs_get_block is called for a hole
507 ** but the file tail is still in a direct item
508 ** bh_result is the buffer head for the hole
509 ** tail_offset is the offset of the start of the tail in the file
510 **
511 ** This calls prepare_write, which will start a new transaction
512 ** you should not be in a transaction, or have any paths held when you
513 ** call this.
514 */
515 static int convert_tail_for_hole(struct inode *inode,
516 				 struct buffer_head *bh_result,
517 				 loff_t tail_offset)
518 {
519 	unsigned long index;
520 	unsigned long tail_end;
521 	unsigned long tail_start;
522 	struct page *tail_page;
523 	struct page *hole_page = bh_result->b_page;
524 	int retval = 0;
525 
526 	if ((tail_offset & (bh_result->b_size - 1)) != 1)
527 		return -EIO;
528 
529 	/* always try to read until the end of the block */
530 	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
531 	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
532 
533 	index = tail_offset >> PAGE_CACHE_SHIFT;
534 	/* hole_page can be zero in case of direct_io, we are sure
535 	   that we cannot get here if we write with O_DIRECT into
536 	   tail page */
537 	if (!hole_page || index != hole_page->index) {
538 		tail_page = grab_cache_page(inode->i_mapping, index);
539 		retval = -ENOMEM;
540 		if (!tail_page) {
541 			goto out;
542 		}
543 	} else {
544 		tail_page = hole_page;
545 	}
546 
547 	/* we don't have to make sure the conversion did not happen while
548 	 ** we were locking the page because anyone that could convert
549 	 ** must first take i_sem.
550 	 **
551 	 ** We must fix the tail page for writing because it might have buffers
552 	 ** that are mapped, but have a block number of 0.  This indicates tail
553 	 ** data that has been read directly into the page, and block_prepare_write
554 	 ** won't trigger a get_block in this case.
555 	 */
556 	fix_tail_page_for_writing(tail_page);
557 	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
558 	if (retval)
559 		goto unlock;
560 
561 	/* tail conversion might change the data in the page */
562 	flush_dcache_page(tail_page);
563 
564 	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
565 
566       unlock:
567 	if (tail_page != hole_page) {
568 		unlock_page(tail_page);
569 		page_cache_release(tail_page);
570 	}
571       out:
572 	return retval;
573 }
574 
575 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
576 				  long block,
577 				  struct inode *inode,
578 				  b_blocknr_t * allocated_block_nr,
579 				  struct path *path, int flags)
580 {
581 	BUG_ON(!th->t_trans_id);
582 
583 #ifdef REISERFS_PREALLOCATE
584 	if (!(flags & GET_BLOCK_NO_ISEM)) {
585 		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
586 						  path, block);
587 	}
588 #endif
589 	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
590 					 block);
591 }
592 
593 int reiserfs_get_block(struct inode *inode, sector_t block,
594 		       struct buffer_head *bh_result, int create)
595 {
596 	int repeat, retval = 0;
597 	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
598 	INITIALIZE_PATH(path);
599 	int pos_in_item;
600 	struct cpu_key key;
601 	struct buffer_head *bh, *unbh = NULL;
602 	struct item_head *ih, tmp_ih;
603 	__le32 *item;
604 	int done;
605 	int fs_gen;
606 	struct reiserfs_transaction_handle *th = NULL;
607 	/* space reserved in transaction batch:
608 	   . 3 balancings in direct->indirect conversion
609 	   . 1 block involved into reiserfs_update_sd()
610 	   XXX in practically impossible worst case direct2indirect()
611 	   can incur (much) more than 3 balancings.
612 	   quota update for user, group */
613 	int jbegin_count =
614 	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
615 	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
616 	int version;
617 	int dangle = 1;
618 	loff_t new_offset =
619 	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
620 
621 	/* bad.... */
622 	reiserfs_write_lock(inode->i_sb);
623 	version = get_inode_item_key_version(inode);
624 
625 	if (block < 0) {
626 		reiserfs_write_unlock(inode->i_sb);
627 		return -EIO;
628 	}
629 
630 	if (!file_capable(inode, block)) {
631 		reiserfs_write_unlock(inode->i_sb);
632 		return -EFBIG;
633 	}
634 
635 	/* if !create, we aren't changing the FS, so we don't need to
636 	 ** log anything, so we don't need to start a transaction
637 	 */
638 	if (!(create & GET_BLOCK_CREATE)) {
639 		int ret;
640 		/* find number of block-th logical block of the file */
641 		ret = _get_block_create_0(inode, block, bh_result,
642 					  create | GET_BLOCK_READ_DIRECT);
643 		reiserfs_write_unlock(inode->i_sb);
644 		return ret;
645 	}
646 	/*
647 	 * if we're already in a transaction, make sure to close
648 	 * any new transactions we start in this func
649 	 */
650 	if ((create & GET_BLOCK_NO_DANGLE) ||
651 	    reiserfs_transaction_running(inode->i_sb))
652 		dangle = 0;
653 
654 	/* If file is of such a size, that it might have a tail and tails are enabled
655 	 ** we should mark it as possibly needing tail packing on close
656 	 */
657 	if ((have_large_tails(inode->i_sb)
658 	     && inode->i_size < i_block_size(inode) * 4)
659 	    || (have_small_tails(inode->i_sb)
660 		&& inode->i_size < i_block_size(inode)))
661 		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
662 
663 	/* set the key of the first byte in the 'block'-th block of file */
664 	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
665 	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
666 	      start_trans:
667 		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
668 		if (!th) {
669 			retval = -ENOMEM;
670 			goto failure;
671 		}
672 		reiserfs_update_inode_transaction(inode);
673 	}
674       research:
675 
676 	retval = search_for_position_by_key(inode->i_sb, &key, &path);
677 	if (retval == IO_ERROR) {
678 		retval = -EIO;
679 		goto failure;
680 	}
681 
682 	bh = get_last_bh(&path);
683 	ih = get_ih(&path);
684 	item = get_item(&path);
685 	pos_in_item = path.pos_in_item;
686 
687 	fs_gen = get_generation(inode->i_sb);
688 	copy_item_head(&tmp_ih, ih);
689 
690 	if (allocation_needed
691 	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
692 		/* we have to allocate block for the unformatted node */
693 		if (!th) {
694 			pathrelse(&path);
695 			goto start_trans;
696 		}
697 
698 		repeat =
699 		    _allocate_block(th, block, inode, &allocated_block_nr,
700 				    &path, create);
701 
702 		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
703 			/* restart the transaction to give the journal a chance to free
704 			 ** some blocks.  releases the path, so we have to go back to
705 			 ** research if we succeed on the second try
706 			 */
707 			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
708 			retval = restart_transaction(th, inode, &path);
709 			if (retval)
710 				goto failure;
711 			repeat =
712 			    _allocate_block(th, block, inode,
713 					    &allocated_block_nr, NULL, create);
714 
715 			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
716 				goto research;
717 			}
718 			if (repeat == QUOTA_EXCEEDED)
719 				retval = -EDQUOT;
720 			else
721 				retval = -ENOSPC;
722 			goto failure;
723 		}
724 
725 		if (fs_changed(fs_gen, inode->i_sb)
726 		    && item_moved(&tmp_ih, &path)) {
727 			goto research;
728 		}
729 	}
730 
731 	if (indirect_item_found(retval, ih)) {
732 		b_blocknr_t unfm_ptr;
733 		/* 'block'-th block is in the file already (there is
734 		   corresponding cell in some indirect item). But it may be
735 		   zero unformatted node pointer (hole) */
736 		unfm_ptr = get_block_num(item, pos_in_item);
737 		if (unfm_ptr == 0) {
738 			/* use allocated block to plug the hole */
739 			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
740 			if (fs_changed(fs_gen, inode->i_sb)
741 			    && item_moved(&tmp_ih, &path)) {
742 				reiserfs_restore_prepared_buffer(inode->i_sb,
743 								 bh);
744 				goto research;
745 			}
746 			set_buffer_new(bh_result);
747 			if (buffer_dirty(bh_result)
748 			    && reiserfs_data_ordered(inode->i_sb))
749 				reiserfs_add_ordered_list(inode, bh_result);
750 			put_block_num(item, pos_in_item, allocated_block_nr);
751 			unfm_ptr = allocated_block_nr;
752 			journal_mark_dirty(th, inode->i_sb, bh);
753 			reiserfs_update_sd(th, inode);
754 		}
755 		set_block_dev_mapped(bh_result, unfm_ptr, inode);
756 		pathrelse(&path);
757 		retval = 0;
758 		if (!dangle && th)
759 			retval = reiserfs_end_persistent_transaction(th);
760 
761 		reiserfs_write_unlock(inode->i_sb);
762 
763 		/* the item was found, so new blocks were not added to the file
764 		 ** there is no need to make sure the inode is updated with this
765 		 ** transaction
766 		 */
767 		return retval;
768 	}
769 
770 	if (!th) {
771 		pathrelse(&path);
772 		goto start_trans;
773 	}
774 
775 	/* desired position is not found or is in the direct item. We have
776 	   to append file with holes up to 'block'-th block converting
777 	   direct items to indirect one if necessary */
778 	done = 0;
779 	do {
780 		if (is_statdata_le_ih(ih)) {
781 			__le32 unp = 0;
782 			struct cpu_key tmp_key;
783 
784 			/* indirect item has to be inserted */
785 			make_le_item_head(&tmp_ih, &key, version, 1,
786 					  TYPE_INDIRECT, UNFM_P_SIZE,
787 					  0 /* free_space */ );
788 
789 			if (cpu_key_k_offset(&key) == 1) {
790 				/* we are going to add 'block'-th block to the file. Use
791 				   allocated block for that */
792 				unp = cpu_to_le32(allocated_block_nr);
793 				set_block_dev_mapped(bh_result,
794 						     allocated_block_nr, inode);
795 				set_buffer_new(bh_result);
796 				done = 1;
797 			}
798 			tmp_key = key;	// ;)
799 			set_cpu_key_k_offset(&tmp_key, 1);
800 			PATH_LAST_POSITION(&path)++;
801 
802 			retval =
803 			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
804 						 inode, (char *)&unp);
805 			if (retval) {
806 				reiserfs_free_block(th, inode,
807 						    allocated_block_nr, 1);
808 				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
809 			}
810 			//mark_tail_converted (inode);
811 		} else if (is_direct_le_ih(ih)) {
812 			/* direct item has to be converted */
813 			loff_t tail_offset;
814 
815 			tail_offset =
816 			    ((le_ih_k_offset(ih) -
817 			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
818 			if (tail_offset == cpu_key_k_offset(&key)) {
819 				/* direct item we just found fits into block we have
820 				   to map. Convert it into unformatted node: use
821 				   bh_result for the conversion */
822 				set_block_dev_mapped(bh_result,
823 						     allocated_block_nr, inode);
824 				unbh = bh_result;
825 				done = 1;
826 			} else {
827 				/* we have to padd file tail stored in direct item(s)
828 				   up to block size and convert it to unformatted
829 				   node. FIXME: this should also get into page cache */
830 
831 				pathrelse(&path);
832 				/*
833 				 * ugly, but we can only end the transaction if
834 				 * we aren't nested
835 				 */
836 				BUG_ON(!th->t_refcount);
837 				if (th->t_refcount == 1) {
838 					retval =
839 					    reiserfs_end_persistent_transaction
840 					    (th);
841 					th = NULL;
842 					if (retval)
843 						goto failure;
844 				}
845 
846 				retval =
847 				    convert_tail_for_hole(inode, bh_result,
848 							  tail_offset);
849 				if (retval) {
850 					if (retval != -ENOSPC)
851 						reiserfs_warning(inode->i_sb,
852 								 "clm-6004: convert tail failed inode %lu, error %d",
853 								 inode->i_ino,
854 								 retval);
855 					if (allocated_block_nr) {
856 						/* the bitmap, the super, and the stat data == 3 */
857 						if (!th)
858 							th = reiserfs_persistent_transaction(inode->i_sb, 3);
859 						if (th)
860 							reiserfs_free_block(th,
861 									    inode,
862 									    allocated_block_nr,
863 									    1);
864 					}
865 					goto failure;
866 				}
867 				goto research;
868 			}
869 			retval =
870 			    direct2indirect(th, inode, &path, unbh,
871 					    tail_offset);
872 			if (retval) {
873 				reiserfs_unmap_buffer(unbh);
874 				reiserfs_free_block(th, inode,
875 						    allocated_block_nr, 1);
876 				goto failure;
877 			}
878 			/* it is important the set_buffer_uptodate is done after
879 			 ** the direct2indirect.  The buffer might contain valid
880 			 ** data newer than the data on disk (read by readpage, changed,
881 			 ** and then sent here by writepage).  direct2indirect needs
882 			 ** to know if unbh was already up to date, so it can decide
883 			 ** if the data in unbh needs to be replaced with data from
884 			 ** the disk
885 			 */
886 			set_buffer_uptodate(unbh);
887 
888 			/* unbh->b_page == NULL in case of DIRECT_IO request, this means
889 			   buffer will disappear shortly, so it should not be added to
890 			 */
891 			if (unbh->b_page) {
892 				/* we've converted the tail, so we must
893 				 ** flush unbh before the transaction commits
894 				 */
895 				reiserfs_add_tail_list(inode, unbh);
896 
897 				/* mark it dirty now to prevent commit_write from adding
898 				 ** this buffer to the inode's dirty buffer list
899 				 */
900 				/*
901 				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
902 				 * It's still atomic, but it sets the page dirty too,
903 				 * which makes it eligible for writeback at any time by the
904 				 * VM (which was also the case with __mark_buffer_dirty())
905 				 */
906 				mark_buffer_dirty(unbh);
907 			}
908 		} else {
909 			/* append indirect item with holes if needed, when appending
910 			   pointer to 'block'-th block use block, which is already
911 			   allocated */
912 			struct cpu_key tmp_key;
913 			unp_t unf_single = 0;	// We use this in case we need to allocate only
914 			// one block which is a fastpath
915 			unp_t *un;
916 			__u64 max_to_insert =
917 			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
918 			    UNFM_P_SIZE;
919 			__u64 blocks_needed;
920 
921 			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
922 			       "vs-804: invalid position for append");
923 			/* indirect item has to be appended, set up key of that position */
924 			make_cpu_key(&tmp_key, inode,
925 				     le_key_k_offset(version,
926 						     &(ih->ih_key)) +
927 				     op_bytes_number(ih,
928 						     inode->i_sb->s_blocksize),
929 				     //pos_in_item * inode->i_sb->s_blocksize,
930 				     TYPE_INDIRECT, 3);	// key type is unimportant
931 
932 			blocks_needed =
933 			    1 +
934 			    ((cpu_key_k_offset(&key) -
935 			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
936 			     s_blocksize_bits);
937 			RFALSE(blocks_needed < 0, "green-805: invalid offset");
938 
939 			if (blocks_needed == 1) {
940 				un = &unf_single;
941 			} else {
942 				un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
943 				if (!un) {
944 					un = &unf_single;
945 					blocks_needed = 1;
946 					max_to_insert = 0;
947 				} else
948 					memset(un, 0,
949 					       UNFM_P_SIZE * min(blocks_needed,
950 								 max_to_insert));
951 			}
952 			if (blocks_needed <= max_to_insert) {
953 				/* we are going to add target block to the file. Use allocated
954 				   block for that */
955 				un[blocks_needed - 1] =
956 				    cpu_to_le32(allocated_block_nr);
957 				set_block_dev_mapped(bh_result,
958 						     allocated_block_nr, inode);
959 				set_buffer_new(bh_result);
960 				done = 1;
961 			} else {
962 				/* paste hole to the indirect item */
963 				/* If kmalloc failed, max_to_insert becomes zero and it means we
964 				   only have space for one block */
965 				blocks_needed =
966 				    max_to_insert ? max_to_insert : 1;
967 			}
968 			retval =
969 			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
970 						     (char *)un,
971 						     UNFM_P_SIZE *
972 						     blocks_needed);
973 
974 			if (blocks_needed != 1)
975 				kfree(un);
976 
977 			if (retval) {
978 				reiserfs_free_block(th, inode,
979 						    allocated_block_nr, 1);
980 				goto failure;
981 			}
982 			if (!done) {
983 				/* We need to mark new file size in case this function will be
984 				   interrupted/aborted later on. And we may do this only for
985 				   holes. */
986 				inode->i_size +=
987 				    inode->i_sb->s_blocksize * blocks_needed;
988 			}
989 		}
990 
991 		if (done == 1)
992 			break;
993 
994 		/* this loop could log more blocks than we had originally asked
995 		 ** for.  So, we have to allow the transaction to end if it is
996 		 ** too big or too full.  Update the inode so things are
997 		 ** consistent if we crash before the function returns
998 		 **
999 		 ** release the path so that anybody waiting on the path before
1000 		 ** ending their transaction will be able to continue.
1001 		 */
1002 		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1003 			retval = restart_transaction(th, inode, &path);
1004 			if (retval)
1005 				goto failure;
1006 		}
1007 		/* inserting indirect pointers for a hole can take a
1008 		 ** long time.  reschedule if needed
1009 		 */
1010 		cond_resched();
1011 
1012 		retval = search_for_position_by_key(inode->i_sb, &key, &path);
1013 		if (retval == IO_ERROR) {
1014 			retval = -EIO;
1015 			goto failure;
1016 		}
1017 		if (retval == POSITION_FOUND) {
1018 			reiserfs_warning(inode->i_sb,
1019 					 "vs-825: reiserfs_get_block: "
1020 					 "%K should not be found", &key);
1021 			retval = -EEXIST;
1022 			if (allocated_block_nr)
1023 				reiserfs_free_block(th, inode,
1024 						    allocated_block_nr, 1);
1025 			pathrelse(&path);
1026 			goto failure;
1027 		}
1028 		bh = get_last_bh(&path);
1029 		ih = get_ih(&path);
1030 		item = get_item(&path);
1031 		pos_in_item = path.pos_in_item;
1032 	} while (1);
1033 
1034 	retval = 0;
1035 
1036       failure:
1037 	if (th && (!dangle || (retval && !th->t_trans_id))) {
1038 		int err;
1039 		if (th->t_trans_id)
1040 			reiserfs_update_sd(th, inode);
1041 		err = reiserfs_end_persistent_transaction(th);
1042 		if (err)
1043 			retval = err;
1044 	}
1045 
1046 	reiserfs_write_unlock(inode->i_sb);
1047 	reiserfs_check_path(&path);
1048 	return retval;
1049 }
1050 
1051 static int
1052 reiserfs_readpages(struct file *file, struct address_space *mapping,
1053 		   struct list_head *pages, unsigned nr_pages)
1054 {
1055 	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
1056 }
1057 
1058 /* Compute real number of used bytes by file
1059  * Following three functions can go away when we'll have enough space in stat item
1060  */
1061 static int real_space_diff(struct inode *inode, int sd_size)
1062 {
1063 	int bytes;
1064 	loff_t blocksize = inode->i_sb->s_blocksize;
1065 
1066 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1067 		return sd_size;
1068 
1069 	/* End of file is also in full block with indirect reference, so round
1070 	 ** up to the next block.
1071 	 **
1072 	 ** there is just no way to know if the tail is actually packed
1073 	 ** on the file, so we have to assume it isn't.  When we pack the
1074 	 ** tail, we add 4 bytes to pretend there really is an unformatted
1075 	 ** node pointer
1076 	 */
1077 	bytes =
1078 	    ((inode->i_size +
1079 	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1080 	    sd_size;
1081 	return bytes;
1082 }
1083 
1084 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1085 					int sd_size)
1086 {
1087 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1088 		return inode->i_size +
1089 		    (loff_t) (real_space_diff(inode, sd_size));
1090 	}
1091 	return ((loff_t) real_space_diff(inode, sd_size)) +
1092 	    (((loff_t) blocks) << 9);
1093 }
1094 
1095 /* Compute number of blocks used by file in ReiserFS counting */
1096 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1097 {
1098 	loff_t bytes = inode_get_bytes(inode);
1099 	loff_t real_space = real_space_diff(inode, sd_size);
1100 
1101 	/* keeps fsck and non-quota versions of reiserfs happy */
1102 	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1103 		bytes += (loff_t) 511;
1104 	}
1105 
1106 	/* files from before the quota patch might i_blocks such that
1107 	 ** bytes < real_space.  Deal with that here to prevent it from
1108 	 ** going negative.
1109 	 */
1110 	if (bytes < real_space)
1111 		return 0;
1112 	return (bytes - real_space) >> 9;
1113 }
1114 
1115 //
1116 // BAD: new directories have stat data of new type and all other items
1117 // of old type. Version stored in the inode says about body items, so
1118 // in update_stat_data we can not rely on inode, but have to check
1119 // item version directly
1120 //
1121 
1122 // called by read_locked_inode
1123 static void init_inode(struct inode *inode, struct path *path)
1124 {
1125 	struct buffer_head *bh;
1126 	struct item_head *ih;
1127 	__u32 rdev;
1128 	//int version = ITEM_VERSION_1;
1129 
1130 	bh = PATH_PLAST_BUFFER(path);
1131 	ih = PATH_PITEM_HEAD(path);
1132 
1133 	copy_key(INODE_PKEY(inode), &(ih->ih_key));
1134 	inode->i_blksize = reiserfs_default_io_size;
1135 
1136 	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1137 	REISERFS_I(inode)->i_flags = 0;
1138 	REISERFS_I(inode)->i_prealloc_block = 0;
1139 	REISERFS_I(inode)->i_prealloc_count = 0;
1140 	REISERFS_I(inode)->i_trans_id = 0;
1141 	REISERFS_I(inode)->i_jl = NULL;
1142 	REISERFS_I(inode)->i_acl_access = NULL;
1143 	REISERFS_I(inode)->i_acl_default = NULL;
1144 	init_rwsem(&REISERFS_I(inode)->xattr_sem);
1145 
1146 	if (stat_data_v1(ih)) {
1147 		struct stat_data_v1 *sd =
1148 		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
1149 		unsigned long blocks;
1150 
1151 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1152 		set_inode_sd_version(inode, STAT_DATA_V1);
1153 		inode->i_mode = sd_v1_mode(sd);
1154 		inode->i_nlink = sd_v1_nlink(sd);
1155 		inode->i_uid = sd_v1_uid(sd);
1156 		inode->i_gid = sd_v1_gid(sd);
1157 		inode->i_size = sd_v1_size(sd);
1158 		inode->i_atime.tv_sec = sd_v1_atime(sd);
1159 		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1160 		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1161 		inode->i_atime.tv_nsec = 0;
1162 		inode->i_ctime.tv_nsec = 0;
1163 		inode->i_mtime.tv_nsec = 0;
1164 
1165 		inode->i_blocks = sd_v1_blocks(sd);
1166 		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1167 		blocks = (inode->i_size + 511) >> 9;
1168 		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
1169 		if (inode->i_blocks > blocks) {
1170 			// there was a bug in <=3.5.23 when i_blocks could take negative
1171 			// values. Starting from 3.5.17 this value could even be stored in
1172 			// stat data. For such files we set i_blocks based on file
1173 			// size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1174 			// only updated if file's inode will ever change
1175 			inode->i_blocks = blocks;
1176 		}
1177 
1178 		rdev = sd_v1_rdev(sd);
1179 		REISERFS_I(inode)->i_first_direct_byte =
1180 		    sd_v1_first_direct_byte(sd);
1181 		/* an early bug in the quota code can give us an odd number for the
1182 		 ** block count.  This is incorrect, fix it here.
1183 		 */
1184 		if (inode->i_blocks & 1) {
1185 			inode->i_blocks++;
1186 		}
1187 		inode_set_bytes(inode,
1188 				to_real_used_space(inode, inode->i_blocks,
1189 						   SD_V1_SIZE));
1190 		/* nopack is initially zero for v1 objects. For v2 objects,
1191 		   nopack is initialised from sd_attrs */
1192 		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1193 	} else {
1194 		// new stat data found, but object may have old items
1195 		// (directories and symlinks)
1196 		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1197 
1198 		inode->i_mode = sd_v2_mode(sd);
1199 		inode->i_nlink = sd_v2_nlink(sd);
1200 		inode->i_uid = sd_v2_uid(sd);
1201 		inode->i_size = sd_v2_size(sd);
1202 		inode->i_gid = sd_v2_gid(sd);
1203 		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1204 		inode->i_atime.tv_sec = sd_v2_atime(sd);
1205 		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
1206 		inode->i_ctime.tv_nsec = 0;
1207 		inode->i_mtime.tv_nsec = 0;
1208 		inode->i_atime.tv_nsec = 0;
1209 		inode->i_blocks = sd_v2_blocks(sd);
1210 		rdev = sd_v2_rdev(sd);
1211 		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1212 			inode->i_generation =
1213 			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1214 		else
1215 			inode->i_generation = sd_v2_generation(sd);
1216 
1217 		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1218 			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1219 		else
1220 			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1221 		REISERFS_I(inode)->i_first_direct_byte = 0;
1222 		set_inode_sd_version(inode, STAT_DATA_V2);
1223 		inode_set_bytes(inode,
1224 				to_real_used_space(inode, inode->i_blocks,
1225 						   SD_V2_SIZE));
1226 		/* read persistent inode attributes from sd and initalise
1227 		   generic inode flags from them */
1228 		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1229 		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
1230 	}
1231 
1232 	pathrelse(path);
1233 	if (S_ISREG(inode->i_mode)) {
1234 		inode->i_op = &reiserfs_file_inode_operations;
1235 		inode->i_fop = &reiserfs_file_operations;
1236 		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1237 	} else if (S_ISDIR(inode->i_mode)) {
1238 		inode->i_op = &reiserfs_dir_inode_operations;
1239 		inode->i_fop = &reiserfs_dir_operations;
1240 	} else if (S_ISLNK(inode->i_mode)) {
1241 		inode->i_op = &reiserfs_symlink_inode_operations;
1242 		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1243 	} else {
1244 		inode->i_blocks = 0;
1245 		inode->i_op = &reiserfs_special_inode_operations;
1246 		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1247 	}
1248 }
1249 
1250 // update new stat data with inode fields
1251 static void inode2sd(void *sd, struct inode *inode, loff_t size)
1252 {
1253 	struct stat_data *sd_v2 = (struct stat_data *)sd;
1254 	__u16 flags;
1255 
1256 	set_sd_v2_mode(sd_v2, inode->i_mode);
1257 	set_sd_v2_nlink(sd_v2, inode->i_nlink);
1258 	set_sd_v2_uid(sd_v2, inode->i_uid);
1259 	set_sd_v2_size(sd_v2, size);
1260 	set_sd_v2_gid(sd_v2, inode->i_gid);
1261 	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1262 	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1263 	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1264 	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1265 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1266 		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1267 	else
1268 		set_sd_v2_generation(sd_v2, inode->i_generation);
1269 	flags = REISERFS_I(inode)->i_attrs;
1270 	i_attrs_to_sd_attrs(inode, &flags);
1271 	set_sd_v2_attrs(sd_v2, flags);
1272 }
1273 
1274 // used to copy inode's fields to old stat data
1275 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1276 {
1277 	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1278 
1279 	set_sd_v1_mode(sd_v1, inode->i_mode);
1280 	set_sd_v1_uid(sd_v1, inode->i_uid);
1281 	set_sd_v1_gid(sd_v1, inode->i_gid);
1282 	set_sd_v1_nlink(sd_v1, inode->i_nlink);
1283 	set_sd_v1_size(sd_v1, size);
1284 	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1285 	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1286 	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1287 
1288 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1289 		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1290 	else
1291 		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1292 
1293 	// Sigh. i_first_direct_byte is back
1294 	set_sd_v1_first_direct_byte(sd_v1,
1295 				    REISERFS_I(inode)->i_first_direct_byte);
1296 }
1297 
1298 /* NOTE, you must prepare the buffer head before sending it here,
1299 ** and then log it after the call
1300 */
1301 static void update_stat_data(struct path *path, struct inode *inode,
1302 			     loff_t size)
1303 {
1304 	struct buffer_head *bh;
1305 	struct item_head *ih;
1306 
1307 	bh = PATH_PLAST_BUFFER(path);
1308 	ih = PATH_PITEM_HEAD(path);
1309 
1310 	if (!is_statdata_le_ih(ih))
1311 		reiserfs_panic(inode->i_sb,
1312 			       "vs-13065: update_stat_data: key %k, found item %h",
1313 			       INODE_PKEY(inode), ih);
1314 
1315 	if (stat_data_v1(ih)) {
1316 		// path points to old stat data
1317 		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1318 	} else {
1319 		inode2sd(B_I_PITEM(bh, ih), inode, size);
1320 	}
1321 
1322 	return;
1323 }
1324 
1325 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1326 			     struct inode *inode, loff_t size)
1327 {
1328 	struct cpu_key key;
1329 	INITIALIZE_PATH(path);
1330 	struct buffer_head *bh;
1331 	int fs_gen;
1332 	struct item_head *ih, tmp_ih;
1333 	int retval;
1334 
1335 	BUG_ON(!th->t_trans_id);
1336 
1337 	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant
1338 
1339 	for (;;) {
1340 		int pos;
1341 		/* look for the object's stat data */
1342 		retval = search_item(inode->i_sb, &key, &path);
1343 		if (retval == IO_ERROR) {
1344 			reiserfs_warning(inode->i_sb,
1345 					 "vs-13050: reiserfs_update_sd: "
1346 					 "i/o failure occurred trying to update %K stat data",
1347 					 &key);
1348 			return;
1349 		}
1350 		if (retval == ITEM_NOT_FOUND) {
1351 			pos = PATH_LAST_POSITION(&path);
1352 			pathrelse(&path);
1353 			if (inode->i_nlink == 0) {
1354 				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
1355 				return;
1356 			}
1357 			reiserfs_warning(inode->i_sb,
1358 					 "vs-13060: reiserfs_update_sd: "
1359 					 "stat data of object %k (nlink == %d) not found (pos %d)",
1360 					 INODE_PKEY(inode), inode->i_nlink,
1361 					 pos);
1362 			reiserfs_check_path(&path);
1363 			return;
1364 		}
1365 
1366 		/* sigh, prepare_for_journal might schedule.  When it schedules the
1367 		 ** FS might change.  We have to detect that, and loop back to the
1368 		 ** search if the stat data item has moved
1369 		 */
1370 		bh = get_last_bh(&path);
1371 		ih = get_ih(&path);
1372 		copy_item_head(&tmp_ih, ih);
1373 		fs_gen = get_generation(inode->i_sb);
1374 		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
1375 		if (fs_changed(fs_gen, inode->i_sb)
1376 		    && item_moved(&tmp_ih, &path)) {
1377 			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
1378 			continue;	/* Stat_data item has been moved after scheduling. */
1379 		}
1380 		break;
1381 	}
1382 	update_stat_data(&path, inode, size);
1383 	journal_mark_dirty(th, th->t_super, bh);
1384 	pathrelse(&path);
1385 	return;
1386 }
1387 
1388 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1389 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1390 ** and clear the key in the private portion of the inode, otherwise a
1391 ** corresponding iput might try to delete whatever object the inode last
1392 ** represented.
1393 */
1394 static void reiserfs_make_bad_inode(struct inode *inode)
1395 {
1396 	memset(INODE_PKEY(inode), 0, KEY_SIZE);
1397 	make_bad_inode(inode);
1398 }
1399 
1400 //
1401 // initially this function was derived from minix or ext2's analog and
1402 // evolved as the prototype did
1403 //
1404 
1405 int reiserfs_init_locked_inode(struct inode *inode, void *p)
1406 {
1407 	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1408 	inode->i_ino = args->objectid;
1409 	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1410 	return 0;
1411 }
1412 
1413 /* looks for stat data in the tree, and fills up the fields of in-core
1414    inode stat data fields */
1415 void reiserfs_read_locked_inode(struct inode *inode,
1416 				struct reiserfs_iget_args *args)
1417 {
1418 	INITIALIZE_PATH(path_to_sd);
1419 	struct cpu_key key;
1420 	unsigned long dirino;
1421 	int retval;
1422 
1423 	dirino = args->dirid;
1424 
1425 	/* set version 1, version 2 could be used too, because stat data
1426 	   key is the same in both versions */
1427 	key.version = KEY_FORMAT_3_5;
1428 	key.on_disk_key.k_dir_id = dirino;
1429 	key.on_disk_key.k_objectid = inode->i_ino;
1430 	key.on_disk_key.k_offset = 0;
1431 	key.on_disk_key.k_type = 0;
1432 
1433 	/* look for the object's stat data */
1434 	retval = search_item(inode->i_sb, &key, &path_to_sd);
1435 	if (retval == IO_ERROR) {
1436 		reiserfs_warning(inode->i_sb,
1437 				 "vs-13070: reiserfs_read_locked_inode: "
1438 				 "i/o failure occurred trying to find stat data of %K",
1439 				 &key);
1440 		reiserfs_make_bad_inode(inode);
1441 		return;
1442 	}
1443 	if (retval != ITEM_FOUND) {
1444 		/* a stale NFS handle can trigger this without it being an error */
1445 		pathrelse(&path_to_sd);
1446 		reiserfs_make_bad_inode(inode);
1447 		inode->i_nlink = 0;
1448 		return;
1449 	}
1450 
1451 	init_inode(inode, &path_to_sd);
1452 
1453 	/* It is possible that knfsd is trying to access inode of a file
1454 	   that is being removed from the disk by some other thread. As we
1455 	   update sd on unlink all that is required is to check for nlink
1456 	   here. This bug was first found by Sizif when debugging
1457 	   SquidNG/Butterfly, forgotten, and found again after Philippe
1458 	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1459 
1460 	   More logical fix would require changes in fs/inode.c:iput() to
1461 	   remove inode from hash-table _after_ fs cleaned disk stuff up and
1462 	   in iget() to return NULL if I_FREEING inode is found in
1463 	   hash-table. */
1464 	/* Currently there is one place where it's ok to meet inode with
1465 	   nlink==0: processing of open-unlinked and half-truncated files
1466 	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
1467 	if ((inode->i_nlink == 0) &&
1468 	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
1469 		reiserfs_warning(inode->i_sb,
1470 				 "vs-13075: reiserfs_read_locked_inode: "
1471 				 "dead inode read from disk %K. "
1472 				 "This is likely to be race with knfsd. Ignore",
1473 				 &key);
1474 		reiserfs_make_bad_inode(inode);
1475 	}
1476 
1477 	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */
1478 
1479 }
1480 
1481 /**
1482  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1483  *
1484  * @inode:    inode from hash table to check
1485  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1486  *
1487  * This function is called by iget5_locked() to distinguish reiserfs inodes
1488  * having the same inode numbers. Such inodes can only exist due to some
1489  * error condition. One of them should be bad. Inodes with identical
1490  * inode numbers (objectids) are distinguished by parent directory ids.
1491  *
1492  */
1493 int reiserfs_find_actor(struct inode *inode, void *opaque)
1494 {
1495 	struct reiserfs_iget_args *args;
1496 
1497 	args = opaque;
1498 	/* args is already in CPU order */
1499 	return (inode->i_ino == args->objectid) &&
1500 	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1501 }
1502 
1503 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1504 {
1505 	struct inode *inode;
1506 	struct reiserfs_iget_args args;
1507 
1508 	args.objectid = key->on_disk_key.k_objectid;
1509 	args.dirid = key->on_disk_key.k_dir_id;
1510 	inode = iget5_locked(s, key->on_disk_key.k_objectid,
1511 			     reiserfs_find_actor, reiserfs_init_locked_inode,
1512 			     (void *)(&args));
1513 	if (!inode)
1514 		return ERR_PTR(-ENOMEM);
1515 
1516 	if (inode->i_state & I_NEW) {
1517 		reiserfs_read_locked_inode(inode, &args);
1518 		unlock_new_inode(inode);
1519 	}
1520 
1521 	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
1522 		/* either due to i/o error or a stale NFS handle */
1523 		iput(inode);
1524 		inode = NULL;
1525 	}
1526 	return inode;
1527 }
1528 
1529 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1530 {
1531 	__u32 *data = vobjp;
1532 	struct cpu_key key;
1533 	struct dentry *result;
1534 	struct inode *inode;
1535 
1536 	key.on_disk_key.k_objectid = data[0];
1537 	key.on_disk_key.k_dir_id = data[1];
1538 	reiserfs_write_lock(sb);
1539 	inode = reiserfs_iget(sb, &key);
1540 	if (inode && !IS_ERR(inode) && data[2] != 0 &&
1541 	    data[2] != inode->i_generation) {
1542 		iput(inode);
1543 		inode = NULL;
1544 	}
1545 	reiserfs_write_unlock(sb);
1546 	if (!inode)
1547 		inode = ERR_PTR(-ESTALE);
1548 	if (IS_ERR(inode))
1549 		return ERR_PTR(PTR_ERR(inode));
1550 	result = d_alloc_anon(inode);
1551 	if (!result) {
1552 		iput(inode);
1553 		return ERR_PTR(-ENOMEM);
1554 	}
1555 	return result;
1556 }
1557 
1558 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
1559 				  int len, int fhtype,
1560 				  int (*acceptable) (void *contect,
1561 						     struct dentry * de),
1562 				  void *context)
1563 {
1564 	__u32 obj[3], parent[3];
1565 
1566 	/* fhtype happens to reflect the number of u32s encoded.
1567 	 * due to a bug in earlier code, fhtype might indicate there
1568 	 * are more u32s then actually fitted.
1569 	 * so if fhtype seems to be more than len, reduce fhtype.
1570 	 * Valid types are:
1571 	 *   2 - objectid + dir_id - legacy support
1572 	 *   3 - objectid + dir_id + generation
1573 	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1574 	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
1575 	 *   6 - as above plus generation of directory
1576 	 * 6 does not fit in NFSv2 handles
1577 	 */
1578 	if (fhtype > len) {
1579 		if (fhtype != 6 || len != 5)
1580 			reiserfs_warning(sb,
1581 					 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1582 					 fhtype, len);
1583 		fhtype = 5;
1584 	}
1585 
1586 	obj[0] = data[0];
1587 	obj[1] = data[1];
1588 	if (fhtype == 3 || fhtype >= 5)
1589 		obj[2] = data[2];
1590 	else
1591 		obj[2] = 0;	/* generation number */
1592 
1593 	if (fhtype >= 4) {
1594 		parent[0] = data[fhtype >= 5 ? 3 : 2];
1595 		parent[1] = data[fhtype >= 5 ? 4 : 3];
1596 		if (fhtype == 6)
1597 			parent[2] = data[5];
1598 		else
1599 			parent[2] = 0;
1600 	}
1601 	return sb->s_export_op->find_exported_dentry(sb, obj,
1602 						     fhtype < 4 ? NULL : parent,
1603 						     acceptable, context);
1604 }
1605 
1606 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1607 		       int need_parent)
1608 {
1609 	struct inode *inode = dentry->d_inode;
1610 	int maxlen = *lenp;
1611 
1612 	if (maxlen < 3)
1613 		return 255;
1614 
1615 	data[0] = inode->i_ino;
1616 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1617 	data[2] = inode->i_generation;
1618 	*lenp = 3;
1619 	/* no room for directory info? return what we've stored so far */
1620 	if (maxlen < 5 || !need_parent)
1621 		return 3;
1622 
1623 	spin_lock(&dentry->d_lock);
1624 	inode = dentry->d_parent->d_inode;
1625 	data[3] = inode->i_ino;
1626 	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1627 	*lenp = 5;
1628 	if (maxlen >= 6) {
1629 		data[5] = inode->i_generation;
1630 		*lenp = 6;
1631 	}
1632 	spin_unlock(&dentry->d_lock);
1633 	return *lenp;
1634 }
1635 
1636 /* looks for stat data, then copies fields to it, marks the buffer
1637    containing stat data as dirty */
1638 /* reiserfs inodes are never really dirty, since the dirty inode call
1639 ** always logs them.  This call allows the VFS inode marking routines
1640 ** to properly mark inodes for datasync and such, but only actually
1641 ** does something when called for a synchronous update.
1642 */
1643 int reiserfs_write_inode(struct inode *inode, int do_sync)
1644 {
1645 	struct reiserfs_transaction_handle th;
1646 	int jbegin_count = 1;
1647 
1648 	if (inode->i_sb->s_flags & MS_RDONLY)
1649 		return -EROFS;
1650 	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
1651 	 ** these cases are just when the system needs ram, not when the
1652 	 ** inode needs to reach disk for safety, and they can safely be
1653 	 ** ignored because the altered inode has already been logged.
1654 	 */
1655 	if (do_sync && !(current->flags & PF_MEMALLOC)) {
1656 		reiserfs_write_lock(inode->i_sb);
1657 		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1658 			reiserfs_update_sd(&th, inode);
1659 			journal_end_sync(&th, inode->i_sb, jbegin_count);
1660 		}
1661 		reiserfs_write_unlock(inode->i_sb);
1662 	}
1663 	return 0;
1664 }
1665 
1666 /* stat data of new object is inserted already, this inserts the item
1667    containing "." and ".." entries */
1668 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1669 				  struct inode *inode,
1670 				  struct item_head *ih, struct path *path,
1671 				  struct inode *dir)
1672 {
1673 	struct super_block *sb = th->t_super;
1674 	char empty_dir[EMPTY_DIR_SIZE];
1675 	char *body = empty_dir;
1676 	struct cpu_key key;
1677 	int retval;
1678 
1679 	BUG_ON(!th->t_trans_id);
1680 
1681 	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1682 		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1683 		      TYPE_DIRENTRY, 3 /*key length */ );
1684 
1685 	/* compose item head for new item. Directories consist of items of
1686 	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1687 	   is done by reiserfs_new_inode */
1688 	if (old_format_only(sb)) {
1689 		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1690 				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1691 
1692 		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1693 				       ih->ih_key.k_objectid,
1694 				       INODE_PKEY(dir)->k_dir_id,
1695 				       INODE_PKEY(dir)->k_objectid);
1696 	} else {
1697 		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1698 				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1699 
1700 		make_empty_dir_item(body, ih->ih_key.k_dir_id,
1701 				    ih->ih_key.k_objectid,
1702 				    INODE_PKEY(dir)->k_dir_id,
1703 				    INODE_PKEY(dir)->k_objectid);
1704 	}
1705 
1706 	/* look for place in the tree for new item */
1707 	retval = search_item(sb, &key, path);
1708 	if (retval == IO_ERROR) {
1709 		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
1710 				 "i/o failure occurred creating new directory");
1711 		return -EIO;
1712 	}
1713 	if (retval == ITEM_FOUND) {
1714 		pathrelse(path);
1715 		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
1716 				 "object with this key exists (%k)",
1717 				 &(ih->ih_key));
1718 		return -EEXIST;
1719 	}
1720 
1721 	/* insert item, that is empty directory item */
1722 	return reiserfs_insert_item(th, path, &key, ih, inode, body);
1723 }
1724 
1725 /* stat data of object has been inserted, this inserts the item
1726    containing the body of symlink */
1727 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
1728 				struct item_head *ih,
1729 				struct path *path, const char *symname,
1730 				int item_len)
1731 {
1732 	struct super_block *sb = th->t_super;
1733 	struct cpu_key key;
1734 	int retval;
1735 
1736 	BUG_ON(!th->t_trans_id);
1737 
1738 	_make_cpu_key(&key, KEY_FORMAT_3_5,
1739 		      le32_to_cpu(ih->ih_key.k_dir_id),
1740 		      le32_to_cpu(ih->ih_key.k_objectid),
1741 		      1, TYPE_DIRECT, 3 /*key length */ );
1742 
1743 	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1744 			  0 /*free_space */ );
1745 
1746 	/* look for place in the tree for new item */
1747 	retval = search_item(sb, &key, path);
1748 	if (retval == IO_ERROR) {
1749 		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
1750 				 "i/o failure occurred creating new symlink");
1751 		return -EIO;
1752 	}
1753 	if (retval == ITEM_FOUND) {
1754 		pathrelse(path);
1755 		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
1756 				 "object with this key exists (%k)",
1757 				 &(ih->ih_key));
1758 		return -EEXIST;
1759 	}
1760 
1761 	/* insert item, that is body of symlink */
1762 	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1763 }
1764 
1765 /* inserts the stat data into the tree, and then calls
1766    reiserfs_new_directory (to insert ".", ".." item if new object is
1767    directory) or reiserfs_new_symlink (to insert symlink body if new
1768    object is symlink) or nothing (if new object is regular file)
1769 
1770    NOTE! uid and gid must already be set in the inode.  If we return
1771    non-zero due to an error, we have to drop the quota previously allocated
1772    for the fresh inode.  This can only be done outside a transaction, so
1773    if we return non-zero, we also end the transaction.  */
1774 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1775 		       struct inode *dir, int mode, const char *symname,
1776 		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1777 		          strlen (symname) for symlinks) */
1778 		       loff_t i_size, struct dentry *dentry,
1779 		       struct inode *inode)
1780 {
1781 	struct super_block *sb;
1782 	INITIALIZE_PATH(path_to_key);
1783 	struct cpu_key key;
1784 	struct item_head ih;
1785 	struct stat_data sd;
1786 	int retval;
1787 	int err;
1788 
1789 	BUG_ON(!th->t_trans_id);
1790 
1791 	if (DQUOT_ALLOC_INODE(inode)) {
1792 		err = -EDQUOT;
1793 		goto out_end_trans;
1794 	}
1795 	if (!dir || !dir->i_nlink) {
1796 		err = -EPERM;
1797 		goto out_bad_inode;
1798 	}
1799 
1800 	sb = dir->i_sb;
1801 
1802 	/* item head of new item */
1803 	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1804 	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1805 	if (!ih.ih_key.k_objectid) {
1806 		err = -ENOMEM;
1807 		goto out_bad_inode;
1808 	}
1809 	if (old_format_only(sb))
1810 		/* not a perfect generation count, as object ids can be reused, but
1811 		 ** this is as good as reiserfs can do right now.
1812 		 ** note that the private part of inode isn't filled in yet, we have
1813 		 ** to use the directory.
1814 		 */
1815 		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1816 	else
1817 #if defined( USE_INODE_GENERATION_COUNTER )
1818 		inode->i_generation =
1819 		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1820 #else
1821 		inode->i_generation = ++event;
1822 #endif
1823 
1824 	/* fill stat data */
1825 	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1826 
1827 	/* uid and gid must already be set by the caller for quota init */
1828 
1829 	/* symlink cannot be immutable or append only, right? */
1830 	if (S_ISLNK(inode->i_mode))
1831 		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1832 
1833 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1834 	inode->i_size = i_size;
1835 	inode->i_blocks = 0;
1836 	inode->i_bytes = 0;
1837 	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1838 	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1839 
1840 	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1841 	REISERFS_I(inode)->i_flags = 0;
1842 	REISERFS_I(inode)->i_prealloc_block = 0;
1843 	REISERFS_I(inode)->i_prealloc_count = 0;
1844 	REISERFS_I(inode)->i_trans_id = 0;
1845 	REISERFS_I(inode)->i_jl = NULL;
1846 	REISERFS_I(inode)->i_attrs =
1847 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1848 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1849 	REISERFS_I(inode)->i_acl_access = NULL;
1850 	REISERFS_I(inode)->i_acl_default = NULL;
1851 	init_rwsem(&REISERFS_I(inode)->xattr_sem);
1852 
1853 	if (old_format_only(sb))
1854 		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1855 				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1856 	else
1857 		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1858 				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1859 
1860 	/* key to search for correct place for new stat data */
1861 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1862 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1863 		      TYPE_STAT_DATA, 3 /*key length */ );
1864 
1865 	/* find proper place for inserting of stat data */
1866 	retval = search_item(sb, &key, &path_to_key);
1867 	if (retval == IO_ERROR) {
1868 		err = -EIO;
1869 		goto out_bad_inode;
1870 	}
1871 	if (retval == ITEM_FOUND) {
1872 		pathrelse(&path_to_key);
1873 		err = -EEXIST;
1874 		goto out_bad_inode;
1875 	}
1876 	if (old_format_only(sb)) {
1877 		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1878 			pathrelse(&path_to_key);
1879 			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
1880 			err = -EINVAL;
1881 			goto out_bad_inode;
1882 		}
1883 		inode2sd_v1(&sd, inode, inode->i_size);
1884 	} else {
1885 		inode2sd(&sd, inode, inode->i_size);
1886 	}
1887 	// these do not go to on-disk stat data
1888 	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1889 	inode->i_blksize = reiserfs_default_io_size;
1890 
1891 	// store in in-core inode the key of stat data and version all
1892 	// object items will have (directory items will have old offset
1893 	// format, other new objects will consist of new items)
1894 	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1895 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1896 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1897 	else
1898 		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1899 	if (old_format_only(sb))
1900 		set_inode_sd_version(inode, STAT_DATA_V1);
1901 	else
1902 		set_inode_sd_version(inode, STAT_DATA_V2);
1903 
1904 	/* insert the stat data into the tree */
1905 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1906 	if (REISERFS_I(dir)->new_packing_locality)
1907 		th->displace_new_blocks = 1;
1908 #endif
1909 	retval =
1910 	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1911 				 (char *)(&sd));
1912 	if (retval) {
1913 		err = retval;
1914 		reiserfs_check_path(&path_to_key);
1915 		goto out_bad_inode;
1916 	}
1917 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1918 	if (!th->displace_new_blocks)
1919 		REISERFS_I(dir)->new_packing_locality = 0;
1920 #endif
1921 	if (S_ISDIR(mode)) {
1922 		/* insert item with "." and ".." */
1923 		retval =
1924 		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1925 	}
1926 
1927 	if (S_ISLNK(mode)) {
1928 		/* insert body of symlink */
1929 		if (!old_format_only(sb))
1930 			i_size = ROUND_UP(i_size);
1931 		retval =
1932 		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1933 					 i_size);
1934 	}
1935 	if (retval) {
1936 		err = retval;
1937 		reiserfs_check_path(&path_to_key);
1938 		journal_end(th, th->t_super, th->t_blocks_allocated);
1939 		goto out_inserted_sd;
1940 	}
1941 
1942 	/* XXX CHECK THIS */
1943 	if (reiserfs_posixacl(inode->i_sb)) {
1944 		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
1945 		if (retval) {
1946 			err = retval;
1947 			reiserfs_check_path(&path_to_key);
1948 			journal_end(th, th->t_super, th->t_blocks_allocated);
1949 			goto out_inserted_sd;
1950 		}
1951 	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
1952 		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
1953 				 "but vfs thinks they are!");
1954 	} else if (is_reiserfs_priv_object(dir)) {
1955 		reiserfs_mark_inode_private(inode);
1956 	}
1957 
1958 	insert_inode_hash(inode);
1959 	reiserfs_update_sd(th, inode);
1960 	reiserfs_check_path(&path_to_key);
1961 
1962 	return 0;
1963 
1964 /* it looks like you can easily compress these two goto targets into
1965  * one.  Keeping it like this doesn't actually hurt anything, and they
1966  * are place holders for what the quota code actually needs.
1967  */
1968       out_bad_inode:
1969 	/* Invalidate the object, nothing was inserted yet */
1970 	INODE_PKEY(inode)->k_objectid = 0;
1971 
1972 	/* Quota change must be inside a transaction for journaling */
1973 	DQUOT_FREE_INODE(inode);
1974 
1975       out_end_trans:
1976 	journal_end(th, th->t_super, th->t_blocks_allocated);
1977 	/* Drop can be outside and it needs more credits so it's better to have it outside */
1978 	DQUOT_DROP(inode);
1979 	inode->i_flags |= S_NOQUOTA;
1980 	make_bad_inode(inode);
1981 
1982       out_inserted_sd:
1983 	inode->i_nlink = 0;
1984 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
1985 
1986 	/* If we were inheriting an ACL, we need to release the lock so that
1987 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
1988 	 * code really needs to be reworked, but this will take care of it
1989 	 * for now. -jeffm */
1990 	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
1991 		reiserfs_write_unlock_xattrs(dir->i_sb);
1992 		iput(inode);
1993 		reiserfs_write_lock_xattrs(dir->i_sb);
1994 	} else
1995 		iput(inode);
1996 	return err;
1997 }
1998 
1999 /*
2000 ** finds the tail page in the page cache,
2001 ** reads the last block in.
2002 **
2003 ** On success, page_result is set to a locked, pinned page, and bh_result
2004 ** is set to an up to date buffer for the last block in the file.  returns 0.
2005 **
2006 ** tail conversion is not done, so bh_result might not be valid for writing
2007 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
2008 ** trying to write the block.
2009 **
2010 ** on failure, nonzero is returned, page_result and bh_result are untouched.
2011 */
2012 static int grab_tail_page(struct inode *p_s_inode,
2013 			  struct page **page_result,
2014 			  struct buffer_head **bh_result)
2015 {
2016 
2017 	/* we want the page with the last byte in the file,
2018 	 ** not the page that will hold the next byte for appending
2019 	 */
2020 	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
2021 	unsigned long pos = 0;
2022 	unsigned long start = 0;
2023 	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
2024 	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
2025 	struct buffer_head *bh;
2026 	struct buffer_head *head;
2027 	struct page *page;
2028 	int error;
2029 
2030 	/* we know that we are only called with inode->i_size > 0.
2031 	 ** we also know that a file tail can never be as big as a block
2032 	 ** If i_size % blocksize == 0, our file is currently block aligned
2033 	 ** and it won't need converting or zeroing after a truncate.
2034 	 */
2035 	if ((offset & (blocksize - 1)) == 0) {
2036 		return -ENOENT;
2037 	}
2038 	page = grab_cache_page(p_s_inode->i_mapping, index);
2039 	error = -ENOMEM;
2040 	if (!page) {
2041 		goto out;
2042 	}
2043 	/* start within the page of the last block in the file */
2044 	start = (offset / blocksize) * blocksize;
2045 
2046 	error = block_prepare_write(page, start, offset,
2047 				    reiserfs_get_block_create_0);
2048 	if (error)
2049 		goto unlock;
2050 
2051 	head = page_buffers(page);
2052 	bh = head;
2053 	do {
2054 		if (pos >= start) {
2055 			break;
2056 		}
2057 		bh = bh->b_this_page;
2058 		pos += blocksize;
2059 	} while (bh != head);
2060 
2061 	if (!buffer_uptodate(bh)) {
2062 		/* note, this should never happen, prepare_write should
2063 		 ** be taking care of this for us.  If the buffer isn't up to date,
2064 		 ** I've screwed up the code to find the buffer, or the code to
2065 		 ** call prepare_write
2066 		 */
2067 		reiserfs_warning(p_s_inode->i_sb,
2068 				 "clm-6000: error reading block %lu on dev %s",
2069 				 bh->b_blocknr,
2070 				 reiserfs_bdevname(p_s_inode->i_sb));
2071 		error = -EIO;
2072 		goto unlock;
2073 	}
2074 	*bh_result = bh;
2075 	*page_result = page;
2076 
2077       out:
2078 	return error;
2079 
2080       unlock:
2081 	unlock_page(page);
2082 	page_cache_release(page);
2083 	return error;
2084 }
2085 
2086 /*
2087 ** vfs version of truncate file.  Must NOT be called with
2088 ** a transaction already started.
2089 **
2090 ** some code taken from block_truncate_page
**
** Holds the reiserfs write lock for the whole call.  When i_size > 0 the
** tail page is grabbed first; reiserfs_do_truncate() then runs inside its
** own transaction, and afterwards the rest of the last block in the page
** cache is zeroed.  When update_timestamps is set, a save link is added
** before the truncate and removed after the transaction has ended.
**
** Returns 0 on success, otherwise the error from journal_begin(),
** reiserfs_do_truncate(), journal_end() or remove_save_link().
2091 */
2092 int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2093 {
2094 	struct reiserfs_transaction_handle th;
2095 	/* we want the offset for the first byte after the end of the file */
2096 	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
2097 	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
2098 	unsigned length;
2099 	struct page *page = NULL;
2100 	int error;
2101 	struct buffer_head *bh = NULL;
2102 
2103 	reiserfs_write_lock(p_s_inode->i_sb);
2104 
2105 	if (p_s_inode->i_size > 0) {
2106 		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
2107 			// -ENOENT means we truncated past the end of the file,
2108 			// and get_block_create_0 could not find a block to read in,
2109 			// which is ok.
2110 			if (error != -ENOENT)
2111 				reiserfs_warning(p_s_inode->i_sb,
2112 						 "clm-6001: grab_tail_page failed %d",
2113 						 error);
			/* not fatal: carry on without a tail page to zero */
2114 			page = NULL;
2115 			bh = NULL;
2116 		}
2117 	}
2118 
2119 	/* so, if page != NULL, we have a buffer head for the offset at
2120 	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2121 	 ** then we have an unformatted node.  Otherwise, we have a direct item,
2122 	 ** and no zeroing is required on disk.  We zero after the truncate,
2123 	 ** because the truncate might pack the item anyway
2124 	 ** (it will unmap bh if it packs).
2125 	 */
2126 	/* it is enough to reserve space in transaction for 2 balancings:
2127 	   one for "save" link adding and another for the first
2128 	   cut_from_item. 1 is for update_sd */
2129 	error = journal_begin(&th, p_s_inode->i_sb,
2130 			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
2131 	if (error)
2132 		goto out;
2133 	reiserfs_update_inode_transaction(p_s_inode);
2134 	if (update_timestamps)
2135 		/* we are doing real truncate: if the system crashes before the last
2136 		   transaction of truncating gets committed - on reboot the file
2137 		   either appears truncated properly or not truncated at all */
2138 		add_save_link(&th, p_s_inode, 1);
2139 	error = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
2140 	if (error)
2141 		goto out;
2142 	error =
2143 	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2144 	if (error)
2145 		goto out;
2146 
2147 	if (update_timestamps) {
2148 		error = remove_save_link(p_s_inode, 1 /* truncate */ );
2149 		if (error)
2150 			goto out;
2151 	}
2152 
2153 	if (page) {
2154 		length = offset & (blocksize - 1);
2155 		/* if we are not on a block boundary */
2156 		if (length) {
2157 			char *kaddr;
2158 
			/* zero from the new end of file to the end of its block */
2159 			length = blocksize - length;
2160 			kaddr = kmap_atomic(page, KM_USER0);
2161 			memset(kaddr + offset, 0, length);
2162 			flush_dcache_page(page);
2163 			kunmap_atomic(kaddr, KM_USER0);
			/* only an unformatted node (mapped, non-zero block) is dirtied */
2164 			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2165 				mark_buffer_dirty(bh);
2166 			}
2167 		}
2168 		unlock_page(page);
2169 		page_cache_release(page);
2170 	}
2171 
2172 	reiserfs_write_unlock(p_s_inode->i_sb);
2173 	return 0;
2174       out:
	/* error exit: release the tail page if one is held, then the write lock */
2175 	if (page) {
2176 		unlock_page(page);
2177 		page_cache_release(page);
2178 	}
2179 	reiserfs_write_unlock(p_s_inode->i_sb);
2180 	return error;
2181 }
2182 
2183 static int map_block_for_writepage(struct inode *inode,
2184 				   struct buffer_head *bh_result,
2185 				   unsigned long block)
2186 {
2187 	struct reiserfs_transaction_handle th;
2188 	int fs_gen;
2189 	struct item_head tmp_ih;
2190 	struct item_head *ih;
2191 	struct buffer_head *bh;
2192 	__le32 *item;
2193 	struct cpu_key key;
2194 	INITIALIZE_PATH(path);
2195 	int pos_in_item;
2196 	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
2197 	loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1;
2198 	int retval;
2199 	int use_get_block = 0;
2200 	int bytes_copied = 0;
2201 	int copy_size;
2202 	int trans_running = 0;
2203 
2204 	/* catch places below that try to log something without starting a trans */
2205 	th.t_trans_id = 0;
2206 
2207 	if (!buffer_uptodate(bh_result)) {
2208 		return -EIO;
2209 	}
2210 
2211 	kmap(bh_result->b_page);
2212       start_over:
2213 	reiserfs_write_lock(inode->i_sb);
2214 	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
2215 
2216       research:
2217 	retval = search_for_position_by_key(inode->i_sb, &key, &path);
2218 	if (retval != POSITION_FOUND) {
2219 		use_get_block = 1;
2220 		goto out;
2221 	}
2222 
2223 	bh = get_last_bh(&path);
2224 	ih = get_ih(&path);
2225 	item = get_item(&path);
2226 	pos_in_item = path.pos_in_item;
2227 
2228 	/* we've found an unformatted node */
2229 	if (indirect_item_found(retval, ih)) {
2230 		if (bytes_copied > 0) {
2231 			reiserfs_warning(inode->i_sb,
2232 					 "clm-6002: bytes_copied %d",
2233 					 bytes_copied);
2234 		}
2235 		if (!get_block_num(item, pos_in_item)) {
2236 			/* crap, we are writing to a hole */
2237 			use_get_block = 1;
2238 			goto out;
2239 		}
2240 		set_block_dev_mapped(bh_result,
2241 				     get_block_num(item, pos_in_item), inode);
2242 	} else if (is_direct_le_ih(ih)) {
2243 		char *p;
2244 		p = page_address(bh_result->b_page);
2245 		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
2246 		copy_size = ih_item_len(ih) - pos_in_item;
2247 
2248 		fs_gen = get_generation(inode->i_sb);
2249 		copy_item_head(&tmp_ih, ih);
2250 
2251 		if (!trans_running) {
2252 			/* vs-3050 is gone, no need to drop the path */
2253 			retval = journal_begin(&th, inode->i_sb, jbegin_count);
2254 			if (retval)
2255 				goto out;
2256 			reiserfs_update_inode_transaction(inode);
2257 			trans_running = 1;
2258 			if (fs_changed(fs_gen, inode->i_sb)
2259 			    && item_moved(&tmp_ih, &path)) {
2260 				reiserfs_restore_prepared_buffer(inode->i_sb,
2261 								 bh);
2262 				goto research;
2263 			}
2264 		}
2265 
2266 		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
2267 
2268 		if (fs_changed(fs_gen, inode->i_sb)
2269 		    && item_moved(&tmp_ih, &path)) {
2270 			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
2271 			goto research;
2272 		}
2273 
2274 		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
2275 		       copy_size);
2276 
2277 		journal_mark_dirty(&th, inode->i_sb, bh);
2278 		bytes_copied += copy_size;
2279 		set_block_dev_mapped(bh_result, 0, inode);
2280 
2281 		/* are there still bytes left? */
2282 		if (bytes_copied < bh_result->b_size &&
2283 		    (byte_offset + bytes_copied) < inode->i_size) {
2284 			set_cpu_key_k_offset(&key,
2285 					     cpu_key_k_offset(&key) +
2286 					     copy_size);
2287 			goto research;
2288 		}
2289 	} else {
2290 		reiserfs_warning(inode->i_sb,
2291 				 "clm-6003: bad item inode %lu, device %s",
2292 				 inode->i_ino, reiserfs_bdevname(inode->i_sb));
2293 		retval = -EIO;
2294 		goto out;
2295 	}
2296 	retval = 0;
2297 
2298       out:
2299 	pathrelse(&path);
2300 	if (trans_running) {
2301 		int err = journal_end(&th, inode->i_sb, jbegin_count);
2302 		if (err)
2303 			retval = err;
2304 		trans_running = 0;
2305 	}
2306 	reiserfs_write_unlock(inode->i_sb);
2307 
2308 	/* this is where we fill in holes in the file. */
2309 	if (use_get_block) {
2310 		retval = reiserfs_get_block(inode, block, bh_result,
2311 					    GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM
2312 					    | GET_BLOCK_NO_DANGLE);
2313 		if (!retval) {
2314 			if (!buffer_mapped(bh_result)
2315 			    || bh_result->b_blocknr == 0) {
2316 				/* get_block failed to find a mapped unformatted node. */
2317 				use_get_block = 0;
2318 				goto start_over;
2319 			}
2320 		}
2321 	}
2322 	kunmap(bh_result->b_page);
2323 
2324 	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2325 		/* we've copied data from the page into the direct item, so the
2326 		 * buffer in the page is now clean, mark it to reflect that.
2327 		 */
2328 		lock_buffer(bh_result);
2329 		clear_buffer_dirty(bh_result);
2330 		unlock_buffer(bh_result);
2331 	}
2332 	return retval;
2333 }
2334 
2335 /*
2336  * mason@suse.com: updated in 2.5.54 to follow the same general io
2337  * start/recovery path as __block_write_full_page, along with special
2338  * code to handle reiserfs tails.
 *
 * Outline:
 *  1. attach dirty, uptodate buffers to the page if it has none;
 *  2. on the last page of the file, zero everything past i_size (a page
 *     that lies entirely beyond i_size is unlocked and 0 is returned);
 *  3. map buffers that are unmapped or mapped to block 0 through
 *     map_block_for_writepage();
 *  4. if the page was PageChecked on entry, log every mapped buffer with
 *     a real block inside one transaction; otherwise lock the dirty
 *     buffers and mark them for async write;
 *  5. mark the page for writeback, unlock it and submit the buffers.
 *
 * Returns 0, or the error that sent us to the fail path.  The fail path
 * still submits buffers that are mapped, dirty and have a non-zero block.
2339  */
2340 static int reiserfs_write_full_page(struct page *page,
2341 				    struct writeback_control *wbc)
2342 {
2343 	struct inode *inode = page->mapping->host;
2344 	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2345 	int error = 0;
2346 	unsigned long block;
2347 	struct buffer_head *head, *bh;
2348 	int partial = 0;
2349 	int nr = 0;
2350 	int checked = PageChecked(page);
2351 	struct reiserfs_transaction_handle th;
2352 	struct super_block *s = inode->i_sb;
2353 	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2354 	th.t_trans_id = 0;
2355 
2356 	/* The page dirty bit is cleared before writepage is called, which
2357 	 * means we have to tell create_empty_buffers to make dirty buffers
2358 	 * The page really should be up to date at this point, so tossing
2359 	 * in the BH_Uptodate is just a sanity check.
2360 	 */
2361 	if (!page_has_buffers(page)) {
2362 		create_empty_buffers(page, s->s_blocksize,
2363 				     (1 << BH_Dirty) | (1 << BH_Uptodate));
2364 	}
2365 	head = page_buffers(page);
2366 
2367 	/* last page in the file, zero out any contents past the
2368 	 ** last byte in the file
2369 	 */
2370 	if (page->index >= end_index) {
2371 		char *kaddr;
2372 		unsigned last_offset;
2373 
2374 		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
2375 		/* no file contents in this page */
2376 		if (page->index >= end_index + 1 || !last_offset) {
2377 			unlock_page(page);
2378 			return 0;
2379 		}
2380 		kaddr = kmap_atomic(page, KM_USER0);
2381 		memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
2382 		flush_dcache_page(page);
2383 		kunmap_atomic(kaddr, KM_USER0);
2384 	}
2385 	bh = head;
	/* logical block number of the first buffer in this page */
2386 	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
2387 	/* first map all the buffers, logging any direct items we find */
2388 	do {
2389 		if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2390 						      (buffer_mapped(bh)
2391 						       && bh->b_blocknr ==
2392 						       0))) {
2393 			/* not mapped yet, or it points to a direct item, search
2394 			 * the btree for the mapping info, and log any direct
2395 			 * items found
2396 			 */
2397 			if ((error = map_block_for_writepage(inode, bh, block))) {
2398 				goto fail;
2399 			}
2400 		}
2401 		bh = bh->b_this_page;
2402 		block++;
2403 	} while (bh != head);
2404 
2405 	/*
2406 	 * we start the transaction after map_block_for_writepage,
2407 	 * because it can create holes in the file (an unbounded operation).
2408 	 * starting it here, we can make a reliable estimate for how many
2409 	 * blocks we're going to log
2410 	 */
2411 	if (checked) {
2412 		ClearPageChecked(page);
2413 		reiserfs_write_lock(s);
2414 		error = journal_begin(&th, s, bh_per_page + 1);
2415 		if (error) {
2416 			reiserfs_write_unlock(s);
2417 			goto fail;
2418 		}
2419 		reiserfs_update_inode_transaction(inode);
2420 	}
2421 	/* now go through and lock any dirty buffers on the page */
	/* every buffer gets a reference here; the submit loop below drops it */
2422 	do {
2423 		get_bh(bh);
2424 		if (!buffer_mapped(bh))
2425 			continue;
2426 		if (buffer_mapped(bh) && bh->b_blocknr == 0)
2427 			continue;
2428 
2429 		if (checked) {
2430 			reiserfs_prepare_for_journal(s, bh, 1);
2431 			journal_mark_dirty(&th, s, bh);
2432 			continue;
2433 		}
2434 		/* from this point on, we know the buffer is mapped to a
2435 		 * real block and not a direct item
2436 		 */
2437 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2438 			lock_buffer(bh);
2439 		} else {
			/* nonblocking writeback: do not wait for a held buffer
			 * lock, redirty the page so it is retried later */
2440 			if (test_set_buffer_locked(bh)) {
2441 				redirty_page_for_writepage(wbc, page);
2442 				continue;
2443 			}
2444 		}
2445 		if (test_clear_buffer_dirty(bh)) {
2446 			mark_buffer_async_write(bh);
2447 		} else {
2448 			unlock_buffer(bh);
2449 		}
2450 	} while ((bh = bh->b_this_page) != head);
2451 
2452 	if (checked) {
2453 		error = journal_end(&th, s, bh_per_page + 1);
2454 		reiserfs_write_unlock(s);
2455 		if (error)
2456 			goto fail;
2457 	}
2458 	BUG_ON(PageWriteback(page));
2459 	set_page_writeback(page);
2460 	unlock_page(page);
2461 
2462 	/*
2463 	 * since any buffer might be the only dirty buffer on the page,
2464 	 * the first submit_bh can bring the page out of writeback.
2465 	 * be careful with the buffers.
2466 	 */
2467 	do {
2468 		struct buffer_head *next = bh->b_this_page;
2469 		if (buffer_async_write(bh)) {
2470 			submit_bh(WRITE, bh);
2471 			nr++;
2472 		}
2473 		put_bh(bh);
2474 		bh = next;
2475 	} while (bh != head);
2476 
2477 	error = 0;
2478       done:
2479 	if (nr == 0) {
2480 		/*
2481 		 * if this page only had a direct item, it is very possible for
2482 		 * no io to be required without there being an error.  Or,
2483 		 * someone else could have locked them and sent them down the
2484 		 * pipe without locking the page
2485 		 */
2486 		bh = head;
2487 		do {
2488 			if (!buffer_uptodate(bh)) {
2489 				partial = 1;
2490 				break;
2491 			}
2492 			bh = bh->b_this_page;
2493 		} while (bh != head);
2494 		if (!partial)
2495 			SetPageUptodate(page);
		/* nothing was submitted, so writeback has to be ended here */
2496 		end_page_writeback(page);
2497 	}
2498 	return error;
2499 
2500       fail:
2501 	/* catches various errors, we need to make sure any valid dirty blocks
2502 	 * get to the media.  The page is currently locked and not marked for
2503 	 * writeback
2504 	 */
2505 	ClearPageUptodate(page);
2506 	bh = head;
2507 	do {
2508 		get_bh(bh);
2509 		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2510 			lock_buffer(bh);
2511 			mark_buffer_async_write(bh);
2512 		} else {
2513 			/*
2514 			 * clear any dirty bits that might have come from getting
2515 			 * attached to a dirty page
2516 			 */
2517 			clear_buffer_dirty(bh);
2518 		}
2519 		bh = bh->b_this_page;
2520 	} while (bh != head);
2521 	SetPageError(page);
2522 	BUG_ON(PageWriteback(page));
2523 	set_page_writeback(page);
2524 	unlock_page(page);
2525 	do {
2526 		struct buffer_head *next = bh->b_this_page;
2527 		if (buffer_async_write(bh)) {
2528 			clear_buffer_dirty(bh);
2529 			submit_bh(WRITE, bh);
2530 			nr++;
2531 		}
2532 		put_bh(bh);
2533 		bh = next;
2534 	} while (bh != head);
	/* 'error' still holds the failure; done ends writeback if nr == 0 */
2535 	goto done;
2536 }
2537 
2538 static int reiserfs_readpage(struct file *f, struct page *page)
2539 {
2540 	return block_read_full_page(page, reiserfs_get_block);
2541 }
2542 
2543 static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2544 {
2545 	struct inode *inode = page->mapping->host;
2546 	reiserfs_wait_on_write_block(inode->i_sb);
2547 	return reiserfs_write_full_page(page, wbc);
2548 }
2549 
/*
 * Prepare the byte range [from, to) of @page for writing by calling
 * block_prepare_write() with reiserfs_get_block.  @f is not used.
 *
 * If a transaction is already running on entry, its handle refcount is
 * bumped before the call.  If block_prepare_write() fails while a
 * transaction is running, the reference taken here is dropped again, or,
 * when no transaction was running on entry, the handle found in
 * current->journal_info is ended.
 *
 * Returns the result of block_prepare_write(), or the error from
 * reiserfs_end_persistent_transaction() on that cleanup path.
 */
2550 static int reiserfs_prepare_write(struct file *f, struct page *page,
2551 				  unsigned from, unsigned to)
2552 {
2553 	struct inode *inode = page->mapping->host;
2554 	int ret;
	/* handle refcount seen on entry; stays 0 if no transaction was running */
2555 	int old_ref = 0;
2556 
2557 	reiserfs_wait_on_write_block(inode->i_sb);
2558 	fix_tail_page_for_writing(page);
2559 	if (reiserfs_transaction_running(inode->i_sb)) {
2560 		struct reiserfs_transaction_handle *th;
2561 		th = (struct reiserfs_transaction_handle *)current->
2562 		    journal_info;
2563 		BUG_ON(!th->t_refcount);
2564 		BUG_ON(!th->t_trans_id);
2565 		old_ref = th->t_refcount;
2566 		th->t_refcount++;
2567 	}
2568 
2569 	ret = block_prepare_write(page, from, to, reiserfs_get_block);
2570 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2571 		struct reiserfs_transaction_handle *th = current->journal_info;
2572 		/* this gets a little ugly.  If reiserfs_get_block returned an
2573 		 * error and left a transaction running, we've got to close it,
2574 		 * and we've got to free handle if it was a persistent transaction.
2575 		 *
2576 		 * But, if we had nested into an existing transaction, we need
2577 		 * to just drop the ref count on the handle.
2578 		 *
2579 		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2580 		 * and it was a persistent trans.  Otherwise, it was nested above.
2581 		 */
2582 		if (th->t_refcount > old_ref) {
2583 			if (old_ref)
2584 				th->t_refcount--;
2585 			else {
2586 				int err;
2587 				reiserfs_write_lock(inode->i_sb);
2588 				err = reiserfs_end_persistent_transaction(th);
2589 				reiserfs_write_unlock(inode->i_sb);
2590 				if (err)
2591 					ret = err;
2592 			}
2593 		}
2594 	}
2595 	return ret;
2596 
2597 }
2598 
2599 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
2600 {
2601 	return generic_block_bmap(as, block, reiserfs_bmap);
2602 }
2603 
/*
 * Finish a write to the byte range [from, to) of @page.  @f is not used.
 *
 * The range is committed with reiserfs_commit_page().  If the write ends
 * beyond the current i_size, i_size is raised to the new end position and
 * the stat data is updated inside a one-block transaction.  Finally, if a
 * transaction was running on entry, the handle taken from
 * current->journal_info is ended with reiserfs_end_persistent_transaction().
 *
 * Returns 0 on success or the error from the journal calls.
 */
2604 static int reiserfs_commit_write(struct file *f, struct page *page,
2605 				 unsigned from, unsigned to)
2606 {
2607 	struct inode *inode = page->mapping->host;
	/* file position just past the last byte written */
2608 	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
2609 	int ret = 0;
2610 	int update_sd = 0;
2611 	struct reiserfs_transaction_handle *th = NULL;
2612 
2613 	reiserfs_wait_on_write_block(inode->i_sb);
2614 	if (reiserfs_transaction_running(inode->i_sb)) {
2615 		th = current->journal_info;
2616 	}
2617 	reiserfs_commit_page(inode, page, from, to);
2618 
2619 	/* generic_commit_write does this for us, but does not update the
2620 	 ** transaction tracking stuff when the size changes.  So, we have
2621 	 ** to do the i_size updates here.
2622 	 */
2623 	if (pos > inode->i_size) {
2624 		struct reiserfs_transaction_handle myth;
2625 		reiserfs_write_lock(inode->i_sb);
2626 		/* If the file have grown beyond the border where it
2627 		   can have a tail, unmark it as needing a tail
2628 		   packing */
2629 		if ((have_large_tails(inode->i_sb)
2630 		     && inode->i_size > i_block_size(inode) * 4)
2631 		    || (have_small_tails(inode->i_sb)
2632 			&& inode->i_size > i_block_size(inode)))
2633 			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2634 
2635 		ret = journal_begin(&myth, inode->i_sb, 1);
2636 		if (ret) {
2637 			reiserfs_write_unlock(inode->i_sb);
2638 			goto journal_error;
2639 		}
2640 		reiserfs_update_inode_transaction(inode);
2641 		inode->i_size = pos;
2642 		/*
2643 		 * this will just nest into our transaction.  It's important
2644 		 * to use mark_inode_dirty so the inode gets pushed around on the
2645 		 * dirty lists, and so that O_SYNC works as expected
2646 		 */
2647 		mark_inode_dirty(inode);
2648 		reiserfs_update_sd(&myth, inode);
2649 		update_sd = 1;
2650 		ret = journal_end(&myth, inode->i_sb, 1);
2651 		reiserfs_write_unlock(inode->i_sb);
2652 		if (ret)
2653 			goto journal_error;
2654 	}
2655 	if (th) {
2656 		reiserfs_write_lock(inode->i_sb);
		/* the size-update branch above already dirtied the inode */
2657 		if (!update_sd)
2658 			mark_inode_dirty(inode);
2659 		ret = reiserfs_end_persistent_transaction(th);
2660 		reiserfs_write_unlock(inode->i_sb);
2661 		if (ret)
2662 			goto out;
2663 	}
2664 
2665       out:
2666 	return ret;
2667 
2668       journal_error:
	/* the handle found on entry must still be ended; note that this
	 * overwrites ret with the result of ending it */
2669 	if (th) {
2670 		reiserfs_write_lock(inode->i_sb);
2671 		if (!update_sd)
2672 			reiserfs_update_sd(th, inode);
2673 		ret = reiserfs_end_persistent_transaction(th);
2674 		reiserfs_write_unlock(inode->i_sb);
2675 	}
2676 
2677 	return ret;
2678 }
2679 
2680 void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
2681 {
2682 	if (reiserfs_attrs(inode->i_sb)) {
2683 		if (sd_attrs & REISERFS_SYNC_FL)
2684 			inode->i_flags |= S_SYNC;
2685 		else
2686 			inode->i_flags &= ~S_SYNC;
2687 		if (sd_attrs & REISERFS_IMMUTABLE_FL)
2688 			inode->i_flags |= S_IMMUTABLE;
2689 		else
2690 			inode->i_flags &= ~S_IMMUTABLE;
2691 		if (sd_attrs & REISERFS_APPEND_FL)
2692 			inode->i_flags |= S_APPEND;
2693 		else
2694 			inode->i_flags &= ~S_APPEND;
2695 		if (sd_attrs & REISERFS_NOATIME_FL)
2696 			inode->i_flags |= S_NOATIME;
2697 		else
2698 			inode->i_flags &= ~S_NOATIME;
2699 		if (sd_attrs & REISERFS_NOTAIL_FL)
2700 			REISERFS_I(inode)->i_flags |= i_nopack_mask;
2701 		else
2702 			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2703 	}
2704 }
2705 
2706 void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
2707 {
2708 	if (reiserfs_attrs(inode->i_sb)) {
2709 		if (inode->i_flags & S_IMMUTABLE)
2710 			*sd_attrs |= REISERFS_IMMUTABLE_FL;
2711 		else
2712 			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2713 		if (inode->i_flags & S_SYNC)
2714 			*sd_attrs |= REISERFS_SYNC_FL;
2715 		else
2716 			*sd_attrs &= ~REISERFS_SYNC_FL;
2717 		if (inode->i_flags & S_NOATIME)
2718 			*sd_attrs |= REISERFS_NOATIME_FL;
2719 		else
2720 			*sd_attrs &= ~REISERFS_NOATIME_FL;
2721 		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
2722 			*sd_attrs |= REISERFS_NOTAIL_FL;
2723 		else
2724 			*sd_attrs &= ~REISERFS_NOTAIL_FL;
2725 	}
2726 }
2727 
2728 /* decide if this buffer needs to stay around for data logging or ordered
2729 ** write purposes
2730 */
2731 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2732 {
2733 	int ret = 1;
2734 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2735 
2736 	spin_lock(&j->j_dirty_buffers_lock);
2737 	if (!buffer_mapped(bh)) {
2738 		goto free_jh;
2739 	}
2740 	/* the page is locked, and the only places that log a data buffer
2741 	 * also lock the page.
2742 	 */
2743 	if (reiserfs_file_data_log(inode)) {
2744 		/*
2745 		 * very conservative, leave the buffer pinned if
2746 		 * anyone might need it.
2747 		 */
2748 		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2749 			ret = 0;
2750 		}
2751 	} else if (buffer_dirty(bh) || buffer_locked(bh)) {
2752 		struct reiserfs_journal_list *jl;
2753 		struct reiserfs_jh *jh = bh->b_private;
2754 
2755 		/* why is this safe?
2756 		 * reiserfs_setattr updates i_size in the on disk
2757 		 * stat data before allowing vmtruncate to be called.
2758 		 *
2759 		 * If buffer was put onto the ordered list for this
2760 		 * transaction, we know for sure either this transaction
2761 		 * or an older one already has updated i_size on disk,
2762 		 * and this ordered data won't be referenced in the file
2763 		 * if we crash.
2764 		 *
2765 		 * if the buffer was put onto the ordered list for an older
2766 		 * transaction, we need to leave it around
2767 		 */
2768 		if (jh && (jl = jh->jl)
2769 		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2770 			ret = 0;
2771 	}
2772       free_jh:
2773 	if (ret && bh->b_private) {
2774 		reiserfs_free_jh(bh);
2775 	}
2776 	spin_unlock(&j->j_dirty_buffers_lock);
2777 	return ret;
2778 }
2779 
2780 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2781 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2782 {
2783 	struct buffer_head *head, *bh, *next;
2784 	struct inode *inode = page->mapping->host;
2785 	unsigned int curr_off = 0;
2786 	int ret = 1;
2787 
2788 	BUG_ON(!PageLocked(page));
2789 
2790 	if (offset == 0)
2791 		ClearPageChecked(page);
2792 
2793 	if (!page_has_buffers(page))
2794 		goto out;
2795 
2796 	head = page_buffers(page);
2797 	bh = head;
2798 	do {
2799 		unsigned int next_off = curr_off + bh->b_size;
2800 		next = bh->b_this_page;
2801 
2802 		/*
2803 		 * is this block fully invalidated?
2804 		 */
2805 		if (offset <= curr_off) {
2806 			if (invalidatepage_can_drop(inode, bh))
2807 				reiserfs_unmap_buffer(bh);
2808 			else
2809 				ret = 0;
2810 		}
2811 		curr_off = next_off;
2812 		bh = next;
2813 	} while (bh != head);
2814 
2815 	/*
2816 	 * We release buffers only if the entire page is being invalidated.
2817 	 * The get_block cached value has been unconditionally invalidated,
2818 	 * so real IO is not possible anymore.
2819 	 */
2820 	if (!offset && ret)
2821 		ret = try_to_release_page(page, 0);
2822       out:
2823 	return ret;
2824 }
2825 
2826 static int reiserfs_set_page_dirty(struct page *page)
2827 {
2828 	struct inode *inode = page->mapping->host;
2829 	if (reiserfs_file_data_log(inode)) {
2830 		SetPageChecked(page);
2831 		return __set_page_dirty_nobuffers(page);
2832 	}
2833 	return __set_page_dirty_buffers(page);
2834 }
2835 
2836 /*
2837  * Returns 1 if the page's buffers were dropped.  The page is locked.
2838  *
2839  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2840  * in the buffers at page_buffers(page).
2841  *
2842  * even in -o notail mode, we can't be sure an old mount without -o notail
2843  * didn't create files with tails.
2844  */
2845 static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
2846 {
2847 	struct inode *inode = page->mapping->host;
2848 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2849 	struct buffer_head *head;
2850 	struct buffer_head *bh;
2851 	int ret = 1;
2852 
2853 	WARN_ON(PageChecked(page));
2854 	spin_lock(&j->j_dirty_buffers_lock);
2855 	head = page_buffers(page);
2856 	bh = head;
2857 	do {
2858 		if (bh->b_private) {
2859 			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2860 				reiserfs_free_jh(bh);
2861 			} else {
2862 				ret = 0;
2863 				break;
2864 			}
2865 		}
2866 		bh = bh->b_this_page;
2867 	} while (bh != head);
2868 	if (ret)
2869 		ret = try_to_free_buffers(page);
2870 	spin_unlock(&j->j_dirty_buffers_lock);
2871 	return ret;
2872 }
2873 
2874 /* We thank Mingming Cao for helping us understand in great detail what
2875    to do in this section of the code. */
2876 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2877 				  const struct iovec *iov, loff_t offset,
2878 				  unsigned long nr_segs)
2879 {
2880 	struct file *file = iocb->ki_filp;
2881 	struct inode *inode = file->f_mapping->host;
2882 
2883 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2884 				  offset, nr_segs,
2885 				  reiserfs_get_blocks_direct_io, NULL);
2886 }
2887 
2888 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
2889 {
2890 	struct inode *inode = dentry->d_inode;
2891 	int error;
2892 	unsigned int ia_valid = attr->ia_valid;
2893 	reiserfs_write_lock(inode->i_sb);
2894 	if (attr->ia_valid & ATTR_SIZE) {
2895 		/* version 2 items will be caught by the s_maxbytes check
2896 		 ** done for us in vmtruncate
2897 		 */
2898 		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2899 		    attr->ia_size > MAX_NON_LFS) {
2900 			error = -EFBIG;
2901 			goto out;
2902 		}
2903 		/* fill in hole pointers in the expanding truncate case. */
2904 		if (attr->ia_size > inode->i_size) {
2905 			error = generic_cont_expand(inode, attr->ia_size);
2906 			if (REISERFS_I(inode)->i_prealloc_count > 0) {
2907 				int err;
2908 				struct reiserfs_transaction_handle th;
2909 				/* we're changing at most 2 bitmaps, inode + super */
2910 				err = journal_begin(&th, inode->i_sb, 4);
2911 				if (!err) {
2912 					reiserfs_discard_prealloc(&th, inode);
2913 					err = journal_end(&th, inode->i_sb, 4);
2914 				}
2915 				if (err)
2916 					error = err;
2917 			}
2918 			if (error)
2919 				goto out;
2920 		}
2921 	}
2922 
2923 	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2924 	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2925 	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
2926 		/* stat data of format v3.5 has 16 bit uid and gid */
2927 		error = -EINVAL;
2928 		goto out;
2929 	}
2930 
2931 	error = inode_change_ok(inode, attr);
2932 	if (!error) {
2933 		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2934 		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2935 			error = reiserfs_chown_xattrs(inode, attr);
2936 
2937 			if (!error) {
2938 				struct reiserfs_transaction_handle th;
2939 				int jbegin_count =
2940 				    2 *
2941 				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
2942 				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
2943 				    2;
2944 
2945 				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
2946 				error =
2947 				    journal_begin(&th, inode->i_sb,
2948 						  jbegin_count);
2949 				if (error)
2950 					goto out;
2951 				error =
2952 				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2953 				if (error) {
2954 					journal_end(&th, inode->i_sb,
2955 						    jbegin_count);
2956 					goto out;
2957 				}
2958 				/* Update corresponding info in inode so that everything is in
2959 				 * one transaction */
2960 				if (attr->ia_valid & ATTR_UID)
2961 					inode->i_uid = attr->ia_uid;
2962 				if (attr->ia_valid & ATTR_GID)
2963 					inode->i_gid = attr->ia_gid;
2964 				mark_inode_dirty(inode);
2965 				error =
2966 				    journal_end(&th, inode->i_sb, jbegin_count);
2967 			}
2968 		}
2969 		if (!error)
2970 			error = inode_setattr(inode, attr);
2971 	}
2972 
2973 	if (!error && reiserfs_posixacl(inode->i_sb)) {
2974 		if (attr->ia_valid & ATTR_MODE)
2975 			error = reiserfs_acl_chmod(inode);
2976 	}
2977 
2978       out:
2979 	reiserfs_write_unlock(inode->i_sb);
2980 	return error;
2981 }
2982 
2983 struct address_space_operations reiserfs_address_space_operations = {
2984 	.writepage = reiserfs_writepage,
2985 	.readpage = reiserfs_readpage,
2986 	.readpages = reiserfs_readpages,
2987 	.releasepage = reiserfs_releasepage,
2988 	.invalidatepage = reiserfs_invalidatepage,
2989 	.sync_page = block_sync_page,
2990 	.prepare_write = reiserfs_prepare_write,
2991 	.commit_write = reiserfs_commit_write,
2992 	.bmap = reiserfs_aop_bmap,
2993 	.direct_IO = reiserfs_direct_IO,
2994 	.set_page_dirty = reiserfs_set_page_dirty,
2995 };
2996