xref: /openbmc/linux/fs/reiserfs/file.c (revision 64c70b1c)
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4 
5 #include <linux/time.h>
6 #include <linux/reiserfs_fs.h>
7 #include <linux/reiserfs_acl.h>
8 #include <linux/reiserfs_xattr.h>
9 #include <asm/uaccess.h>
10 #include <linux/pagemap.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/blkdev.h>
14 #include <linux/buffer_head.h>
15 #include <linux/quotaops.h>
16 
17 /*
18 ** We pack the tails of files on file close, not at the time they are written.
19 ** This implies an unnecessary copy of the tail and an unnecessary indirect item
20 ** insertion/balancing, for files that are written in one write.
21 ** It avoids unnecessary tail packings (balances) for files that are written in
22 ** multiple writes and are small enough to have tails.
23 **
24 ** file_release is called by the VFS layer when the file is closed.  If
25 ** this is the last open file descriptor, and the file is
26 ** small enough to have a tail, and the tail is currently in an
27 ** unformatted node, the tail is converted back into a direct item.
28 **
29 ** We use reiserfs_truncate_file to pack the tail, since it already has
30 ** all the conditions coded.
31 */
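/*
 * Illustrative example (assuming a 4k blocksize): a 600-byte file written
 * with six 100-byte write(2) calls. Packing the tail after every write
 * would cost five needless balances; deferring the work to release packs
 * it exactly once. The fast path in reiserfs_file_release() below encodes
 * this as, roughly:
 *
 *	if (last open descriptor && (i_flags & i_pack_on_close_mask) &&
 *	    tail_has_to_be_packed(inode))
 *		err = reiserfs_truncate_file(inode, 0);	// packs the tail
 */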
32 static int reiserfs_file_release(struct inode *inode, struct file *filp)
33 {
34 
35 	struct reiserfs_transaction_handle th;
36 	int err;
37 	int jbegin_failure = 0;
38 
39 	BUG_ON(!S_ISREG(inode->i_mode));
40 
41 	/* fast out for when nothing needs to be done */
42 	if ((atomic_read(&inode->i_count) > 1 ||
43 	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
44 	     !tail_has_to_be_packed(inode)) &&
45 	    REISERFS_I(inode)->i_prealloc_count <= 0) {
46 		return 0;
47 	}
48 
49 	mutex_lock(&inode->i_mutex);
50 
51 	mutex_lock(&(REISERFS_I(inode)->i_mmap));
52 	if (REISERFS_I(inode)->i_flags & i_ever_mapped)
53 		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
54 
55 	reiserfs_write_lock(inode->i_sb);
56 	/* freeing preallocation only involves relogging blocks that
57 	 * are already in the current transaction.  preallocation gets
58 	 * freed at the end of each transaction, so it is impossible for
59 	 * us to log any additional blocks (including quota blocks)
60 	 */
61 	err = journal_begin(&th, inode->i_sb, 1);
62 	if (err) {
63 		/* uh oh, we can't allow the inode to go away while there
64 		 * are still preallocated blocks pending.  Try to join the
65 		 * aborted transaction
66 		 */
67 		jbegin_failure = err;
68 		err = journal_join_abort(&th, inode->i_sb, 1);
69 
70 		if (err) {
71 			/* hmpf, our choices here aren't good.  We can pin the inode
72 			 * which will disallow unmount from ever happening, we can
73 			 * do nothing, which will corrupt random memory on unmount,
74 			 * or we can forcibly remove the file from the preallocation
75 			 * list, which will leak blocks on disk.  Let's pin the inode
76 			 * and let the admin know what is going on.
77 			 */
78 			igrab(inode);
79 			reiserfs_warning(inode->i_sb,
80 					 "pinning inode %lu because the "
81 					 "preallocation can't be freed",
82 					 inode->i_ino);
83 			goto out;
84 		}
85 	}
86 	reiserfs_update_inode_transaction(inode);
87 
88 #ifdef REISERFS_PREALLOCATE
89 	reiserfs_discard_prealloc(&th, inode);
90 #endif
91 	err = journal_end(&th, inode->i_sb, 1);
92 
93 	/* copy back the error code from journal_begin */
94 	if (!err)
95 		err = jbegin_failure;
96 
97 	if (!err && atomic_read(&inode->i_count) <= 1 &&
98 	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
99 	    tail_has_to_be_packed(inode)) {
100 		/* if regular file is released by last holder and it has been
101 		   appended (we append by unformatted node only) or its direct
102 		   item(s) had to be converted, then it may have to be
103 		   indirect2direct converted */
104 		err = reiserfs_truncate_file(inode, 0);
105 	}
106       out:
107 	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
108 	mutex_unlock(&inode->i_mutex);
109 	reiserfs_write_unlock(inode->i_sb);
110 	return err;
111 }
112 
113 static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
114 {
115 	struct inode *inode;
116 
117 	inode = file->f_path.dentry->d_inode;
118 	mutex_lock(&(REISERFS_I(inode)->i_mmap));
119 	REISERFS_I(inode)->i_flags |= i_ever_mapped;
120 	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
121 
122 	return generic_file_mmap(file, vma);
123 }
124 
125 static void reiserfs_vfs_truncate_file(struct inode *inode)
126 {
127 	reiserfs_truncate_file(inode, 1);
128 }
129 
130 /* Sync a reiserfs file. */
131 
132 /*
133  * FIXME: sync_mapping_buffers() never has anything to sync.  Can
134  * be removed...
135  */
136 
137 static int reiserfs_sync_file(struct file *p_s_filp,
138 			      struct dentry *p_s_dentry, int datasync)
139 {
140 	struct inode *p_s_inode = p_s_dentry->d_inode;
141 	int n_err;
142 	int barrier_done;
143 
144 	BUG_ON(!S_ISREG(p_s_inode->i_mode));
145 	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
146 	reiserfs_write_lock(p_s_inode->i_sb);
147 	barrier_done = reiserfs_commit_for_inode(p_s_inode);
148 	reiserfs_write_unlock(p_s_inode->i_sb);
149 	if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
150 		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
151 	if (barrier_done < 0)
152 		return barrier_done;
153 	return (n_err < 0) ? -EIO : 0;
154 }
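/*
 * Illustrative summary of the flush logic in reiserfs_sync_file() above,
 * under the stated reading of reiserfs_commit_for_inode()'s return value:
 *
 *	barrier_done == 1 -> the journal commit already issued a barrier,
 *			     so no explicit cache flush is needed
 *	otherwise	  -> if the mount uses barrier flushing, force the
 *			     drive cache out with blkdev_issue_flush();
 *			     a negative barrier_done is then returned as
 *			     the error code
 */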
155 
156 /* I really do not want to play with memory shortage right now, so
157    to simplify the code, we are not going to write more than this many pages at
158    a time. This should still considerably improve performance compared to the
159    4k-at-a-time case. This is 32 pages of 4k size. */
160 #define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
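/*
 * Worked example (assuming PAGE_CACHE_SIZE == 4096):
 *
 *	REISERFS_WRITE_PAGES_AT_A_TIME == (128 * 1024) / 4096 == 32 pages
 *
 * The outer parentheses in the definition matter for a macro: without
 * them, an expression such as n / REISERFS_WRITE_PAGES_AT_A_TIME would
 * expand to n / (128 * 1024) / PAGE_CACHE_SIZE and silently evaluate to
 * the wrong value.
 */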
161 
162 /* Allocates blocks for a file to fulfil write request.
163    Maps all unmapped but prepared pages from the list.
164    Updates metadata with newly allocated blocknumbers as needed */
165 static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
166 					       loff_t pos,	/* Writing position */
167 					       int num_pages,	/* number of pages the write is
168 								   going to touch */
169 					       int write_bytes,	/* amount of bytes to write */
170 					       struct page **prepared_pages,	/* array of
171 										   prepared pages
172 										 */
173 					       int blocks_to_allocate	/* Number of blocks we
174 									   need to allocate to
175 									   fit the data into the file
176 									 */
177     )
178 {
179 	struct cpu_key key;	// cpu key of item that we are going to deal with
180 	struct item_head *ih;	// pointer to item head that we are going to deal with
181 	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
182 	__le32 *item;		// pointer to item we are going to deal with
183 	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
184 	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
185 	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
186 	size_t res;		// return value of various functions that we call.
187 	int curr_block;		// current block used to keep track of unmapped blocks.
188 	int i;			// loop counter
189 	int itempos;		// position in item
190 	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
191 	// first page
192 	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
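	/*
	 * Worked example for from/to (illustrative, PAGE_CACHE_SIZE == 4096):
	 * for pos == 5000 and write_bytes == 6000 the write spans two pages:
	 *
	 *	from == 5000 & 4095 == 904	(offset of the first touched
	 *					 byte in the first page)
	 *	to == ((5000 + 6000 - 1) & 4095) + 1 == 2808	(one past the
	 *					 last touched byte, last page)
	 */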
193 	__u64 hole_size;	// number of blocks for a file hole, if it needs to be created.
194 	int modifying_this_item = 0;	// Flag for items traversal code to keep track
195 	// of the fact that we already prepared
196 	// current block for journal
197 	int will_prealloc = 0;
198 	RFALSE(!blocks_to_allocate,
199 	       "green-9004: tried to allocate zero blocks?");
200 
201 	/* only preallocate if this is a small write */
202 	if (REISERFS_I(inode)->i_prealloc_count ||
203 	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
204 	     blocks_to_allocate <
205 	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
206 		will_prealloc =
207 		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
208 
209 	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
210 				   sizeof(b_blocknr_t), GFP_NOFS);
211 	if (!allocated_blocks)
212 		return -ENOMEM;
213 
214 	/* First we compose a key to point at the writing position; we want to do
215 	   that outside of any locking region. */
216 	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
217 
218 	/* If we came here, it means we absolutely need to open a transaction,
219 	   since we need to allocate some blocks */
220 	reiserfs_write_lock(inode->i_sb);	// Journaling stuff and we need that.
221 	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I knew if this number is enough
222 	if (res)
223 		goto error_exit;
224 	reiserfs_update_inode_transaction(inode);
225 
226 	/* Look for the in-tree position of our write, need path for block allocator */
227 	res = search_for_position_by_key(inode->i_sb, &key, &path);
228 	if (res == IO_ERROR) {
229 		res = -EIO;
230 		goto error_exit;
231 	}
232 
233 	/* Allocate blocks */
234 	/* First fill in "hint" structure for block allocator */
235 	hint.th = th;		// transaction handle.
236 	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
237 	hint.inode = inode;	// Inode is needed by block allocator too.
238 	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
239 	hint.key = key.on_disk_key;	// on disk key of file.
240 	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
241 	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
242 	hint.preallocate = will_prealloc;
243 
244 	/* Call block allocator to allocate blocks */
245 	res =
246 	    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
247 				       blocks_to_allocate, blocks_to_allocate);
248 	if (res != CARRY_ON) {
249 		if (res == NO_DISK_SPACE) {
250 			/* We flush the transaction in case of no space. This way some
251 			   blocks might become free */
252 			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
253 			res = restart_transaction(th, inode, &path);
254 			if (res)
255 				goto error_exit;
256 
257 			/* We might have scheduled, so search again */
258 			res =
259 			    search_for_position_by_key(inode->i_sb, &key,
260 						       &path);
261 			if (res == IO_ERROR) {
262 				res = -EIO;
263 				goto error_exit;
264 			}
265 
266 			/* update changed info for hint structure. */
267 			res =
268 			    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
269 						       blocks_to_allocate,
270 						       blocks_to_allocate);
271 			if (res != CARRY_ON) {
272 				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
273 				pathrelse(&path);
274 				goto error_exit;
275 			}
276 		} else {
277 			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
278 			pathrelse(&path);
279 			goto error_exit;
280 		}
281 	}
282 #ifdef __BIG_ENDIAN
283 	// Too bad, I have not found any way to convert a given region from
284 	// cpu format to little endian format
285 	{
286 		int i;
287 		for (i = 0; i < blocks_to_allocate; i++)
288 			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
289 	}
290 #endif
291 
292 	/* Block allocation may well have scheduled and the tree may have changed,
293 	   let's search the tree again */
294 	/* find where in the tree our write should go */
295 	res = search_for_position_by_key(inode->i_sb, &key, &path);
296 	if (res == IO_ERROR) {
297 		res = -EIO;
298 		goto error_exit_free_blocks;
299 	}
300 
301 	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
302 	ih = get_ih(&path);	// Get a pointer to last item head in path.
303 	item = get_item(&path);	// Get a pointer to last item in path
304 
305 	/* Let's see what we have found */
306 	if (res != POSITION_FOUND) {	/* position not found, this means that we
307 					   might need to append the file with holes
308 					   first */
309 		// Since we are writing past the file's end, we need to find out if
310 		// there is a hole that needs to be inserted before our writing
311 		// position, and how many blocks it is going to cover (we need to
312 		//  populate pointers to file blocks representing the hole with zeros)
313 
314 		{
315 			int item_offset = 1;
316 			/*
317 			 * if ih is stat data, its offset is 0 and we don't want to
318 			 * add 1 to pos in the hole_size calculation
319 			 */
320 			if (is_statdata_le_ih(ih))
321 				item_offset = 0;
322 			hole_size = (pos + item_offset -
323 				     (le_key_k_offset
324 				      (get_inode_item_key_version(inode),
325 				       &(ih->ih_key)) + op_bytes_number(ih,
326 									inode->
327 									i_sb->
328 									s_blocksize)))
329 			    >> inode->i_sb->s_blocksize_bits;
330 		}
331 
332 		if (hole_size > 0) {
333 			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
334 			/* area filled with zeroes, to supply as a list of zero blocknumbers.
335 			   We allocate it outside of the loop in case the loop spins for
336 			   several iterations. */
337 			char *zeros = kzalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
338 			if (!zeros) {
339 				res = -ENOMEM;
340 				goto error_exit_free_blocks;
341 			}
342 			do {
343 				to_paste =
344 				    min_t(__u64, hole_size,
345 					  MAX_ITEM_LEN(inode->i_sb->
346 						       s_blocksize) /
347 					  UNFM_P_SIZE);
348 				if (is_indirect_le_ih(ih)) {
349 					/* Ok, there is existing indirect item already. Need to append it */
350 					/* Calculate position past inserted item */
351 					make_cpu_key(&key, inode,
352 						     le_key_k_offset
353 						     (get_inode_item_key_version
354 						      (inode),
355 						      &(ih->ih_key)) +
356 						     op_bytes_number(ih,
357 								     inode->
358 								     i_sb->
359 								     s_blocksize),
360 						     TYPE_INDIRECT, 3);
361 					res =
362 					    reiserfs_paste_into_item(th, &path,
363 								     &key,
364 								     inode,
365 								     (char *)
366 								     zeros,
367 								     UNFM_P_SIZE
368 								     *
369 								     to_paste);
370 					if (res) {
371 						kfree(zeros);
372 						goto error_exit_free_blocks;
373 					}
374 				} else if (is_statdata_le_ih(ih)) {
375 					/* No existing item, create it */
376 					/* item head for new item */
377 					struct item_head ins_ih;
378 
379 					/* create a key for our new item */
380 					make_cpu_key(&key, inode, 1,
381 						     TYPE_INDIRECT, 3);
382 
383 					/* Create new item head for our new item */
384 					make_le_item_head(&ins_ih, &key,
385 							  key.version, 1,
386 							  TYPE_INDIRECT,
387 							  to_paste *
388 							  UNFM_P_SIZE,
389 							  0 /* free space */ );
390 
391 					/* Find where such item should live in the tree */
392 					res =
393 					    search_item(inode->i_sb, &key,
394 							&path);
395 					if (res != ITEM_NOT_FOUND) {
396 						/* item should not exist, otherwise we have error */
397 						if (res != -ENOSPC) {
398 							reiserfs_warning(inode->
399 									 i_sb,
400 									 "green-9008: search_by_key (%K) returned %d",
401 									 &key,
402 									 res);
403 						}
404 						res = -EIO;
405 						kfree(zeros);
406 						goto error_exit_free_blocks;
407 					}
408 					res =
409 					    reiserfs_insert_item(th, &path,
410 								 &key, &ins_ih,
411 								 inode,
412 								 (char *)zeros);
413 				} else {
414 					reiserfs_panic(inode->i_sb,
415 						       "green-9011: Unexpected key type %K\n",
416 						       &key);
417 				}
418 				if (res) {
419 					kfree(zeros);
420 					goto error_exit_free_blocks;
421 				}
422 				/* Now we want to check if transaction is too full, and if it is
423 				   we restart it. This will also free the path. */
424 				if (journal_transaction_should_end
425 				    (th, th->t_blocks_allocated)) {
426 					inode->i_size = cpu_key_k_offset(&key) +
427 						(to_paste << inode->i_blkbits);
428 					res =
429 					    restart_transaction(th, inode,
430 								&path);
431 					if (res) {
432 						pathrelse(&path);
433 						kfree(zeros);
434 						goto error_exit;
435 					}
436 				}
437 
438 				/* Well, need to recalculate path and stuff */
439 				set_cpu_key_k_offset(&key,
440 						     cpu_key_k_offset(&key) +
441 						     (to_paste << inode->
442 						      i_blkbits));
443 				res =
444 				    search_for_position_by_key(inode->i_sb,
445 							       &key, &path);
446 				if (res == IO_ERROR) {
447 					res = -EIO;
448 					kfree(zeros);
449 					goto error_exit_free_blocks;
450 				}
451 				bh = get_last_bh(&path);
452 				ih = get_ih(&path);
453 				item = get_item(&path);
454 				hole_size -= to_paste;
455 			} while (hole_size);
456 			kfree(zeros);
457 		}
458 	}
459 	// Go through existing indirect items first
460 	// replacing all zeroes with blocknumbers from the list.
461 	// Note that if no corresponding item was found by the previous search,
462 	// it means there is no existing in-tree representation for the file area
463 	// we are going to overwrite, so there is nothing to scan through for holes.
464 	for (curr_block = 0, itempos = path.pos_in_item;
465 	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
466 	      retry:
467 
468 		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
469 			/* We ran out of data in this indirect item; let's look for
470 			   another one. */
471 			/* First if we are already modifying current item, log it */
472 			if (modifying_this_item) {
473 				journal_mark_dirty(th, inode->i_sb, bh);
474 				modifying_this_item = 0;
475 			}
476 			/* Then set the key to look for a new indirect item (offset of old
477 			   item is added to old item length) */
478 			set_cpu_key_k_offset(&key,
479 					     le_key_k_offset
480 					     (get_inode_item_key_version(inode),
481 					      &(ih->ih_key)) +
482 					     op_bytes_number(ih,
483 							     inode->i_sb->
484 							     s_blocksize));
485 			/* Search for the position of the new key in the tree. */
486 			res =
487 			    search_for_position_by_key(inode->i_sb, &key,
488 						       &path);
489 			if (res == IO_ERROR) {
490 				res = -EIO;
491 				goto error_exit_free_blocks;
492 			}
493 			bh = get_last_bh(&path);
494 			ih = get_ih(&path);
495 			item = get_item(&path);
496 			itempos = path.pos_in_item;
497 			continue;	// loop to check all kinds of conditions and so on.
498 		}
499 		/* Ok, we have the correct position in the item now, so let's see if it is
500 		   representing a file hole (blocknumber is zero) and fill it if needed */
501 		if (!item[itempos]) {
502 			/* Ok, a hole. Now we need to check if we already prepared this
503 			   block to be journaled */
504 			while (!modifying_this_item) {	// loop until we succeed
505 				/* Well, this item is not journaled yet, so we must prepare
506 				   it for journal first, before we can change it */
507 				struct item_head tmp_ih;	// We copy item head of found item,
508 				// here to detect if fs changed under
509 				// us while we were preparing for
510 				// journal.
511 				int fs_gen;	// We store fs generation here to find if someone
512 				// changes fs under our feet
513 
514 				copy_item_head(&tmp_ih, ih);	// Remember itemhead
515 				fs_gen = get_generation(inode->i_sb);	// remember fs generation
516 				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
517 				if (fs_changed(fs_gen, inode->i_sb)
518 				    && item_moved(&tmp_ih, &path)) {
519 					// Sigh, fs was changed under us, we need to look for the new
520 					// location of the item we are working with
521 
522 				/* unmark prepared area as journaled and search for its
523 				   new position */
524 					reiserfs_restore_prepared_buffer(inode->
525 									 i_sb,
526 									 bh);
527 					res =
528 					    search_for_position_by_key(inode->
529 								       i_sb,
530 								       &key,
531 								       &path);
532 					if (res == IO_ERROR) {
533 						res = -EIO;
534 						goto error_exit_free_blocks;
535 					}
536 					bh = get_last_bh(&path);
537 					ih = get_ih(&path);
538 					item = get_item(&path);
539 					itempos = path.pos_in_item;
540 					goto retry;
541 				}
542 				modifying_this_item = 1;
543 			}
544 			item[itempos] = allocated_blocks[curr_block];	// Assign new block
545 			curr_block++;
546 		}
547 		itempos++;
548 	}
549 
550 	if (modifying_this_item) {	// We need to log last-accessed block, if it
551 		// was modified, but not logged yet.
552 		journal_mark_dirty(th, inode->i_sb, bh);
553 	}
554 
555 	if (curr_block < blocks_to_allocate) {
556 		// Oh well, we need to append to the indirect item, or to create an
557 		// indirect item if there wasn't one
558 		if (is_indirect_le_ih(ih)) {
559 			// Existing indirect item - append. First calculate key for append
560 			// position. We do not need to recalculate path as it should
561 			// already point to correct place.
562 			make_cpu_key(&key, inode,
563 				     le_key_k_offset(get_inode_item_key_version
564 						     (inode),
565 						     &(ih->ih_key)) +
566 				     op_bytes_number(ih,
567 						     inode->i_sb->s_blocksize),
568 				     TYPE_INDIRECT, 3);
569 			res =
570 			    reiserfs_paste_into_item(th, &path, &key, inode,
571 						     (char *)(allocated_blocks +
572 							      curr_block),
573 						     UNFM_P_SIZE *
574 						     (blocks_to_allocate -
575 						      curr_block));
576 			if (res) {
577 				goto error_exit_free_blocks;
578 			}
579 		} else if (is_statdata_le_ih(ih)) {
580 			// Last found item was statdata. That means we need to create indirect item.
581 			struct item_head ins_ih;	/* itemhead for new item */
582 
583 			/* create a key for our new item */
584 			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
585 			// because that's
586 			// where first
587 			// indirect item
588 			// begins
589 			/* Create new item head for our new item */
590 			make_le_item_head(&ins_ih, &key, key.version, 1,
591 					  TYPE_INDIRECT,
592 					  (blocks_to_allocate -
593 					   curr_block) * UNFM_P_SIZE,
594 					  0 /* free space */ );
595 			/* Find where such item should live in the tree */
596 			res = search_item(inode->i_sb, &key, &path);
597 			if (res != ITEM_NOT_FOUND) {
598 			/* Well, if we have found such an item already, or some error
599 			   occurred, we need to warn the user and return an error */
600 				if (res != -ENOSPC) {
601 					reiserfs_warning(inode->i_sb,
602 							 "green-9009: search_by_key (%K) "
603 							 "returned %d", &key,
604 							 res);
605 				}
606 				res = -EIO;
607 				goto error_exit_free_blocks;
608 			}
609 			/* Insert item into the tree with the data as its body */
610 			res =
611 			    reiserfs_insert_item(th, &path, &key, &ins_ih,
612 						 inode,
613 						 (char *)(allocated_blocks +
614 							  curr_block));
615 		} else {
616 			reiserfs_panic(inode->i_sb,
617 				       "green-9010: unexpected item type for key %K\n",
618 				       &key);
619 		}
620 	}
621 	// the caller is responsible for closing the transaction
622 	// unless we return an error; they are also responsible for logging
623 	// the inode.
624 	//
625 	pathrelse(&path);
626 	/*
627 	 * clean up preallocation from previous writes
628 	 * if this is a partial block write
629 	 */
630 	if (write_bytes & (inode->i_sb->s_blocksize - 1))
631 		reiserfs_discard_prealloc(th, inode);
632 	reiserfs_write_unlock(inode->i_sb);
633 
634 	// go through all the pages/buffers and map the buffers to newly allocated
635 	// blocks (so that system knows where to write these pages later).
636 	curr_block = 0;
637 	for (i = 0; i < num_pages; i++) {
638 		struct page *page = prepared_pages[i];	//current page
639 		struct buffer_head *head = page_buffers(page);	// first buffer for a page
640 		int block_start, block_end;	// in-page offsets for buffers.
641 
642 		if (!page_buffers(page))
643 			reiserfs_panic(inode->i_sb,
644 				       "green-9005: No buffers for prepared page???");
645 
646 		/* For each buffer in page */
647 		for (bh = head, block_start = 0; bh != head || !block_start;
648 		     block_start = block_end, bh = bh->b_this_page) {
649 			if (!bh)
650 				reiserfs_panic(inode->i_sb,
651 					       "green-9006: Allocated but absent buffer for a page?");
652 			block_end = block_start + inode->i_sb->s_blocksize;
653 			if (i == 0 && block_end <= from)
654 				/* if this buffer is before requested data to map, skip it */
655 				continue;
656 			if (i == num_pages - 1 && block_start >= to)
657 				/* If this buffer is after requested data to map, abort
658 				   processing of current page */
659 				break;
660 
661 			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
662 				map_bh(bh, inode->i_sb,
663 				       le32_to_cpu(allocated_blocks
664 						   [curr_block]));
665 				curr_block++;
666 				set_buffer_new(bh);
667 			}
668 		}
669 	}
670 
671 	RFALSE(curr_block > blocks_to_allocate,
672 	       "green-9007: Used too many blocks? weird");
673 
674 	kfree(allocated_blocks);
675 	return 0;
676 
677 // Need to deal with transaction here.
678       error_exit_free_blocks:
679 	pathrelse(&path);
680 	// free blocks
681 	for (i = 0; i < blocks_to_allocate; i++)
682 		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
683 				    1);
684 
685       error_exit:
686 	if (th->t_trans_id) {
687 		int err;
688 		// update any changes we made to blk count
689 		mark_inode_dirty(inode);
690 		err =
691 		    journal_end(th, inode->i_sb,
692 				JOURNAL_PER_BALANCE_CNT * 3 + 1 +
693 				2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
694 		if (err)
695 			res = err;
696 	}
697 	reiserfs_write_unlock(inode->i_sb);
698 	kfree(allocated_blocks);
699 
700 	return res;
701 }
702 
703 /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
704 static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
705 				     size_t num_pages /* number of pages */ )
706 {
707 	int i;			// loop counter
708 
709 	for (i = 0; i < num_pages; i++) {
710 		struct page *page = prepared_pages[i];
711 
712 		try_to_free_buffers(page);
713 		unlock_page(page);
714 		page_cache_release(page);
715 	}
716 }
717 
718 /* This function will copy data from userspace to the specified pages within
719    the supplied byte range */
720 static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
721 						  int num_pages,	/* Number of pages affected */
722 						  int write_bytes,	/* Amount of bytes to write */
723 						  struct page **prepared_pages,	/* pointer to
724 										   array to
725 										   prepared pages
726 										 */
727 						  const char __user * buf	/* Pointer to user-supplied
728 										   data */
729     )
730 {
731 	long page_fault = 0;	// status of copy_from_user.
732 	int i;			// loop counter.
733 	int offset;		// offset in page
734 
735 	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
736 	     i++, offset = 0) {
737 		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
738 		struct page *page = prepared_pages[i];	// Current page we process.
739 
740 		fault_in_pages_readable(buf, count);
741 
742 		/* Copy data from userspace to the current page */
743 		kmap(page);
744 		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
745 		/* Flush processor's dcache for this page */
746 		flush_dcache_page(page);
747 		kunmap(page);
748 		buf += count;
749 		write_bytes -= count;
750 
751 		if (page_fault)
752 			break;	// Was there a fault? abort.
753 	}
754 
755 	return page_fault ? -EFAULT : 0;
756 }
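/*
 * Example trace of the copy loop above (illustrative values,
 * PAGE_CACHE_SIZE == 4096): pos == 5000, write_bytes == 6000,
 * num_pages == 2:
 *
 *	i == 0: offset == 904, count == min(4096 - 904, 6000) == 3192
 *	i == 1: offset == 0,   count == min(4096, 6000 - 3192) == 2808
 *
 * so the copy covers bytes 904..4095 of the first page and 0..2807 of
 * the second.
 */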
757 
758 /* taken fs/buffer.c:__block_commit_write */
759 int reiserfs_commit_page(struct inode *inode, struct page *page,
760 			 unsigned from, unsigned to)
761 {
762 	unsigned block_start, block_end;
763 	int partial = 0;
764 	unsigned blocksize;
765 	struct buffer_head *bh, *head;
766 	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
767 	int new;
768 	int logit = reiserfs_file_data_log(inode);
769 	struct super_block *s = inode->i_sb;
770 	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
771 	struct reiserfs_transaction_handle th;
772 	int ret = 0;
773 
774 	th.t_trans_id = 0;
775 	blocksize = 1 << inode->i_blkbits;
776 
777 	if (logit) {
778 		reiserfs_write_lock(s);
779 		ret = journal_begin(&th, s, bh_per_page + 1);
780 		if (ret)
781 			goto drop_write_lock;
782 		reiserfs_update_inode_transaction(inode);
783 	}
784 	for (bh = head = page_buffers(page), block_start = 0;
785 	     bh != head || !block_start;
786 	     block_start = block_end, bh = bh->b_this_page) {
787 
788 		new = buffer_new(bh);
789 		clear_buffer_new(bh);
790 		block_end = block_start + blocksize;
791 		if (block_end <= from || block_start >= to) {
792 			if (!buffer_uptodate(bh))
793 				partial = 1;
794 		} else {
795 			set_buffer_uptodate(bh);
796 			if (logit) {
797 				reiserfs_prepare_for_journal(s, bh, 1);
798 				journal_mark_dirty(&th, s, bh);
799 			} else if (!buffer_dirty(bh)) {
800 				mark_buffer_dirty(bh);
801 				/* do data=ordered on any page past the end
802 				 * of file and any buffer marked BH_New.
803 				 */
804 				if (reiserfs_data_ordered(inode->i_sb) &&
805 				    (new || page->index >= i_size_index)) {
806 					reiserfs_add_ordered_list(inode, bh);
807 				}
808 			}
809 		}
810 	}
811 	if (logit) {
812 		ret = journal_end(&th, s, bh_per_page + 1);
813 	      drop_write_lock:
814 		reiserfs_write_unlock(s);
815 	}
816 	/*
817 	 * If this is a partial write which happened to make all buffers
818 	 * uptodate then we can optimize away a bogus readpage() for
819 	 * the next read(). Here we 'discover' whether the page went
820 	 * uptodate as a result of this (potentially partial) write.
821 	 */
822 	if (!partial)
823 		SetPageUptodate(page);
824 	return ret;
825 }
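/*
 * Illustrative per-buffer decision summary for reiserfs_commit_page()
 * above: for each buffer inside the [from, to) range,
 *
 *	data journaling on (logit)  -> reiserfs_prepare_for_journal() +
 *				       journal_mark_dirty() inside a
 *				       bh_per_page + 1 transaction
 *	data=ordered		    -> mark_buffer_dirty(); new buffers and
 *				       pages past EOF also go on the
 *				       ordered list
 *	otherwise (writeback-style) -> plain mark_buffer_dirty()
 */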
826 
827 /* Submit pages for write. This was separated from actual file copying
828    because we might want to allocate block numbers in-between.
829    This function assumes that the caller will adjust the file size to the correct value. */
830 static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
831 						 size_t num_pages,	/* Number of pages to write */
832 						 size_t write_bytes,	/* number of bytes to write */
833 						 struct page **prepared_pages	/* list of pages */
834     )
835 {
836 	int status;		// return status of block_commit_write.
837 	int retval = 0;		// Return value we are going to return.
838 	int i;			// loop counter
839 	int offset;		// Writing offset in page.
840 	int orig_write_bytes = write_bytes;
841 	int sd_update = 0;
842 
843 	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
844 	     i++, offset = 0) {
845 		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
846 		struct page *page = prepared_pages[i];	// Current page we process.
847 
848 		status =
849 		    reiserfs_commit_page(inode, page, offset, offset + count);
850 		if (status)
851 			retval = status;	// To not overcomplicate matters we are going to
852 		// submit all the pages even if there was an error;
853 		// we only remember the error status to report it
854 		// on exit.
855 		write_bytes -= count;
856 	}
857 	/* now that we've gotten all the ordered buffers marked dirty,
858 	 * we can safely update i_size and close any running transaction
859 	 */
860 	if (pos + orig_write_bytes > inode->i_size) {
861 		inode->i_size = pos + orig_write_bytes;	// Set new size
862 		/* If the file has grown so much that tail packing is no
863 		 * longer possible, reset "need to pack" flag */
864 		if ((have_large_tails(inode->i_sb) &&
865 		     inode->i_size > i_block_size(inode) * 4) ||
866 		    (have_small_tails(inode->i_sb) &&
867 		     inode->i_size > i_block_size(inode)))
868 			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
869 		else if ((have_large_tails(inode->i_sb) &&
870 			  inode->i_size < i_block_size(inode) * 4) ||
871 			 (have_small_tails(inode->i_sb) &&
872 			  inode->i_size < i_block_size(inode)))
873 			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
874 
875 		if (th->t_trans_id) {
876 			reiserfs_write_lock(inode->i_sb);
877 			// this sets the proper flags for O_SYNC to trigger a commit
878 			mark_inode_dirty(inode);
879 			reiserfs_write_unlock(inode->i_sb);
880 		} else {
881 			reiserfs_write_lock(inode->i_sb);
882 			reiserfs_update_inode_transaction(inode);
883 			mark_inode_dirty(inode);
884 			reiserfs_write_unlock(inode->i_sb);
885 		}
886 
887 		sd_update = 1;
888 	}
889 	if (th->t_trans_id) {
890 		reiserfs_write_lock(inode->i_sb);
891 		if (!sd_update)
892 			mark_inode_dirty(inode);
893 		status = journal_end(th, th->t_super, th->t_blocks_allocated);
894 		if (status)
895 			retval = status;
896 		reiserfs_write_unlock(inode->i_sb);
897 	}
898 	th->t_trans_id = 0;
899 
900 	/*
901 	 * we have to unlock the pages after updating i_size, otherwise
902 	 * we race with writepage
903 	 */
904 	for (i = 0; i < num_pages; i++) {
905 		struct page *page = prepared_pages[i];
906 		unlock_page(page);
907 		mark_page_accessed(page);
908 		page_cache_release(page);
909 	}
910 	return retval;
911 }
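/*
 * Worked example for the pack-on-close toggling above (assuming a 4k
 * blocksize and the "large tails" policy): the cutoff is 4 * 4096 ==
 * 16384 bytes, so growing a file to 20000 bytes clears
 * i_pack_on_close_mask, while a file that stays at 10000 bytes keeps (or
 * regains) it. With "small tails" the cutoff is a single block.
 */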
912 
913 /* Check whether the writing region passed in is going to touch the file's
914    tail (if present), and if it is, convert the tail to an unformatted node */
915 static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
916 					       loff_t pos,	/* Writing position */
917 					       int write_bytes	/* amount of bytes to write */
918     )
919 {
920 	INITIALIZE_PATH(path);	// needed for search_for_position
921 	struct cpu_key key;	// Key that would represent last touched writing byte.
922 	struct item_head *ih;	// item header of found block;
923 	int res;		// Return value of various functions we call.
924 	int cont_expand_offset;	// We will put offset for generic_cont_expand here
925 	// This can be int just because tails are created
926 	// only for small files.
927 
928 /* this embodies a dependency on a particular tail policy */
929 	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
930 		/* such big files do not have tails, so we won't bother
931 		   looking for one; simply return */
932 		return 0;
933 	}
934 
935 	reiserfs_write_lock(inode->i_sb);
936 	/* find the item containing the last byte to be written, or if
937 	 * writing past the end of the file then the last item of the
938 	 * file (and then we check its type). */
939 	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
940 		     3 /*key length */ );
941 	res = search_for_position_by_key(inode->i_sb, &key, &path);
942 	if (res == IO_ERROR) {
943 		reiserfs_write_unlock(inode->i_sb);
944 		return -EIO;
945 	}
946 	ih = get_ih(&path);
947 	res = 0;
948 	if (is_direct_le_ih(ih)) {
949 		/* Ok, closest item is file tail (tails are stored in "direct"
950 		 * items), so we need to unpack it. */
951 		/* To not overcomplicate matters, we just call generic_cont_expand
952 		   which will in turn call other code and finally boil down to
953 		   reiserfs_get_block(), which does the necessary conversion. */
954 		cont_expand_offset =
955 		    le_key_k_offset(get_inode_item_key_version(inode),
956 				    &(ih->ih_key));
957 		pathrelse(&path);
958 		res = generic_cont_expand(inode, cont_expand_offset);
959 	} else
960 		pathrelse(&path);
961 
962 	reiserfs_write_unlock(inode->i_sb);
963 	return res;
964 }
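/*
 * Example (illustrative): on a 4k-blocksize filesystem a 700-byte file is
 * stored as a direct item (a tail). For a write at pos == 650 the search
 * key built above lands in (or just past) that direct item, so the
 * is_direct_le_ih() branch fires: generic_cont_expand() is called and the
 * tail is converted to an unformatted node before the page-based write
 * path touches it.
 */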
965 
966 /* This function locks pages starting from @pos for @inode.
967    @num_pages pages are locked and stored in the
968    @prepared_pages array. Also buffers are allocated for these pages.
969    The first and last pages of the region are read if they are overwritten
970    only partially. If the last page did not exist before the write (file hole
971    or file append), it is zeroed instead.
972    Returns the number of unallocated blocks that should be allocated to cover
973    new file data. */
974 static int reiserfs_prepare_file_region_for_write(struct inode *inode
975 						  /* Inode of the file */ ,
976 						  loff_t pos,	/* position in the file */
977 						  size_t num_pages,	/* number of pages to
978 									   prepare */
979 						  size_t write_bytes,	/* Amount of bytes to be
980 									   overwritten from
981 									   @pos */
982 						  struct page **prepared_pages	/* pointer to array
983 										   where to store
984 										   prepared pages */
985     )
986 {
987 	int res = 0;		// Return values of different functions we call.
988 	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
989 	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
990 	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
991 	/* offset of last modified byte in last
992 	   page */
993 	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
994 	int i;			// Simple counter
995 	int blocks = 0;		/* Return value (blocks that should be allocated) */
996 	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
997 	// of a page.
998 	unsigned block_start, block_end;	// Starting and ending offsets of current
999 	// buffer in the page.
1000 	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for the page, if
1001 	// the page appeared to be not up
1002 	// to date. Note how we have
1003 	// at most 2 buffers; this is
1004 	// because we may at most
1005 	// partially overwrite two
1006 	// buffers for one page: one at the beginning of the write area
1007 	// and one at the end.
1008 	// Everything in the middle gets overwritten totally.
1009 
1010 	struct cpu_key key;	// cpu key of item that we are going to deal with
1011 	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
1012 	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
1013 	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
1014 	__le32 *item = NULL;	// pointer to item we are going to deal with
1015 	int item_pos = -1;	/* Position in indirect item */
1016 
1017 	if (num_pages < 1) {
1018 		reiserfs_warning(inode->i_sb,
1019 				 "green-9001: reiserfs_prepare_file_region_for_write "
1020 				 "called with zero number of pages to process");
1021 		return -EFAULT;
1022 	}
1023 
1024 	/* We have 2 loops for pages. In the first loop we grab and lock the pages, so
1025 	   that nobody touches these until we release the pages. Then
1026 	   we start to deal with mapping buffers to blocks. */
1027 	for (i = 0; i < num_pages; i++) {
1028 		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
1029 		if (!prepared_pages[i]) {
1030 			res = -ENOMEM;
1031 			goto failed_page_grabbing;
1032 		}
1033 		if (!page_has_buffers(prepared_pages[i]))
1034 			create_empty_buffers(prepared_pages[i],
1035 					     inode->i_sb->s_blocksize, 0);
1036 	}
1037 
1038 	/* Let's count amount of blocks for a case where all the blocks
1039 	   overwritten are new (we will substract already allocated blocks later) */
1040 	if (num_pages > 2)
1041 		/* These are full-overwritten pages so we count all the blocks in
1042 		   these pages are counted as needed to be allocated */
1043 		blocks =
1044 		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1045 
1046 	/* count blocks needed for first page (possibly partially written) */
1047 	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */
1048 
1049 	/* Now we account for last page. If last page == first page (we
1050 	   overwrite only one page), we subtract all the blocks past the
1051 	   last writing position in a page out of already calculated number
1052 	   of blocks */
1053 	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
1054 	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
1055 	/* Note how we do not round up here since partial blocks still
1056 	   should be allocated */
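	/*
	 * Worked example of the estimate above (illustrative: 4k pages, 1k
	 * blocksize, i_blkbits == 10): pos == 5000, write_bytes == 6000 gives
	 * num_pages == 2, from == 904, to == 2808, so
	 *
	 *	blocks  = 0					(no fully covered pages)
	 *	blocks += ((4096 - 904) >> 10) + 1 == 4		(first page)
	 *	blocks += (1 << 2) - ((4096 - 2808) >> 10) == 3	(last page)
	 *
	 * i.e. 7 blocks if every block turns out to be new; blocks that are
	 * already mapped are subtracted in the loop below.
	 */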
1057 
1058 	/* Now if all the write area lies past the file end, no point in
1059 	   mapping blocks, since there are none, so we just zero out remaining
1060 	   parts of first and last pages in write area (if needed) */
1061 	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
1062 		if (from != 0)		/* First page needs to be partially zeroed */
1063 			zero_user_page(prepared_pages[0], 0, from, KM_USER0);
1064 
1065 		if (to != PAGE_CACHE_SIZE)	/* Last page needs to be partially zeroed */
1066 			zero_user_page(prepared_pages[num_pages-1], to,
1067 					PAGE_CACHE_SIZE - to, KM_USER0);
1068 
1069 		/* Since all blocks are new - use already calculated value */
1070 		return blocks;
1071 	}
1072 
1073 	/* Well, since we write somewhere into the middle of a file, there is a
1074 	   possibility we are writing over some already allocated blocks, so
1075 	   let's map these blocks and subtract the number of such blocks from the blocks
1076 	   we need to allocate (calculated above) */
1077 	/* Mask write position to start on blocksize, we do it out of the
1078 	   loop for performance reasons */
1079 	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
1080 	/* Set cpu key to the starting position in a file (on left block boundary) */
1081 	make_cpu_key(&key, inode,
1082 		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
1083 		     TYPE_ANY, 3 /*key length */ );
1084 
1085 	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
1086 	for (i = 0; i < num_pages; i++) {
1087 
1088 		head = page_buffers(prepared_pages[i]);
1089 		/* For each buffer in the page */
1090 		for (bh = head, block_start = 0; bh != head || !block_start;
1091 		     block_start = block_end, bh = bh->b_this_page) {
1092 			if (!bh)
1093 				reiserfs_panic(inode->i_sb,
1094 					       "green-9002: Allocated but absent buffer for a page?");
1095 			/* Find where this buffer ends */
1096 			block_end = block_start + inode->i_sb->s_blocksize;
1097 			if (i == 0 && block_end <= from)
1098 				/* if this buffer is before requested data to map, skip it */
1099 				continue;
1100 
1101 			if (i == num_pages - 1 && block_start >= to) {
1102 				/* If this buffer is after requested data to map, abort
1103 				   processing of current page */
1104 				break;
1105 			}
1106 
1107 			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1108 				/* This is an optimisation for the case where the buffer is mapped
1109 				   and has a blocknumber assigned. In case a significant number
1110 				   of such buffers are present, we may avoid some
1111 				   search_by_key calls.
1112 				   Probably it would be possible to move parts of this code
1113 				   out of the BKL, but I'm afraid that would overcomplicate the code
1114 				   without any noticeable benefit.
1115 				 */
1116 				item_pos++;
1117 				/* Update the key */
1118 				set_cpu_key_k_offset(&key,
1119 						     cpu_key_k_offset(&key) +
1120 						     inode->i_sb->s_blocksize);
1121 				blocks--;	// Decrease the amount of blocks that need to be
1122 				// allocated
1123 				continue;	// Go to the next buffer
1124 			}
1125 
1126 			if (!itembuf ||	/* if first iteration */
1127 			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
1128 										   current unformatted_item */
1129 				/* Try to find next item */
1130 				res =
1131 				    search_for_position_by_key(inode->i_sb,
1132 							       &key, &path);
1133 				/* Abort if no more items */
1134 				if (res != POSITION_FOUND) {
1135 					/* make sure later loops don't use this item */
1136 					itembuf = NULL;
1137 					item = NULL;
1138 					break;
1139 				}
1140 
1141 				/* Update information about current indirect item */
1142 				itembuf = get_last_bh(&path);
1143 				ih = get_ih(&path);
1144 				item = get_item(&path);
1145 				item_pos = path.pos_in_item;
1146 
1147 				RFALSE(!is_indirect_le_ih(ih),
1148 				       "green-9003: indirect item expected");
1149 			}
1150 
1151 			/* See if there is some block associated with the file
1152 			   at that position; if so, map the buffer to this block */
1153 			if (get_block_num(item, item_pos)) {
1154 				map_bh(bh, inode->i_sb,
1155 				       get_block_num(item, item_pos));
1156 				blocks--;	// Decrease the amount of blocks that need to be
1157 				// allocated
1158 			}
1159 			item_pos++;
1160 			/* Update the key */
1161 			set_cpu_key_k_offset(&key,
1162 					     cpu_key_k_offset(&key) +
1163 					     inode->i_sb->s_blocksize);
1164 		}
1165 	}
1166 	pathrelse(&path);	// Free the path
1167 	reiserfs_write_unlock(inode->i_sb);
1168 
1169 	/* Now zero out unmapped buffers for the first and last pages of
1170 	   write area or issue read requests if page is mapped. */
1171 	/* First page, see if it is not uptodate */
1172 	if (!PageUptodate(prepared_pages[0])) {
1173 		head = page_buffers(prepared_pages[0]);
1174 
1175 		/* For each buffer in page */
1176 		for (bh = head, block_start = 0; bh != head || !block_start;
1177 		     block_start = block_end, bh = bh->b_this_page) {
1178 
1179 			if (!bh)
1180 				reiserfs_panic(inode->i_sb,
1181 					       "green-9002: Allocated but absent buffer for a page?");
1182 			/* Find where this buffer ends */
1183 			block_end = block_start + inode->i_sb->s_blocksize;
1184 			if (block_end <= from)
1185 				/* if this buffer is before requested data to map, skip it */
1186 				continue;
1187 			if (block_start < from) {	/* Aha, our partial buffer */
1188 				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
1189 								   issue READ request for it to
1190 								   not loose data */
1191 					ll_rw_block(READ, 1, &bh);
1192 					*wait_bh++ = bh;
1193 				} else {	/* Not mapped, zero it */
1194 					zero_user_page(prepared_pages[0],
1195 						       block_start,
1196 						       from - block_start, KM_USER0);
1197 					set_buffer_uptodate(bh);
1198 				}
1199 			}
1200 		}
1201 	}
1202 
1203 	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
1204 	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
1205 	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
1206 	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
1207 		head = page_buffers(prepared_pages[num_pages - 1]);
1208 
1209 		/* for each buffer in page */
1210 		for (bh = head, block_start = 0; bh != head || !block_start;
1211 		     block_start = block_end, bh = bh->b_this_page) {
1212 
1213 			if (!bh)
1214 				reiserfs_panic(inode->i_sb,
1215 					       "green-9002: Allocated but absent buffer for a page?");
1216 			/* Find where this buffer ends */
1217 			block_end = block_start + inode->i_sb->s_blocksize;
1218 			if (block_start >= to)
1219 				/* if this buffer is after requested data to map, skip it */
1220 				break;
1221 			if (block_end > to) {	/* Aha, our partial buffer */
1222 				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
1223 								   issue a READ request for it to
1224 								   not lose data */
1225 					ll_rw_block(READ, 1, &bh);
1226 					*wait_bh++ = bh;
1227 				} else {	/* Not mapped, zero it */
1228 					zero_user_page(prepared_pages[num_pages-1],
1229 							to, block_end - to, KM_USER0);
1230 					set_buffer_uptodate(bh);
1231 				}
1232 			}
1233 		}
1234 	}
1235 
1236 	/* Wait for read requests we made to happen, if necessary */
1237 	while (wait_bh > wait) {
1238 		wait_on_buffer(*--wait_bh);
1239 		if (!buffer_uptodate(*wait_bh)) {
1240 			res = -EIO;
1241 			goto failed_read;
1242 		}
1243 	}
1244 
1245 	return blocks;
1246       failed_page_grabbing:
1247 	num_pages = i;
1248       failed_read:
1249 	reiserfs_unprepare_pages(prepared_pages, num_pages);
1250 	return res;
1251 }
1252 
1253 /* Write @count bytes at position @ppos in a file indicated by @file
1254    from the buffer @buf.
1255 
1256    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1257    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1258    written for (ext2/3).  This is for several reasons:
1259 
1260    * It has no understanding of any filesystem specific optimizations.
1261 
1262    * It enters the filesystem repeatedly for each page that is written.
1263 
1264    * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
1265    * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
1266    * to reiserfs which allows for fewer tree traversals.
1267 
1268    * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1269 
1270    * Asking the block allocation code for blocks one at a time is slightly less efficient.
1271 
1272    All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1273    use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then.  This new code should make
1274    things right finally.
1275 
1276    Future Features: providing search_by_key with hints.
1277 
1278 */
1279 static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
1280 				   const char __user * buf,	/*  pointer to user supplied data
1281 								   (in userspace) */
1282 				   size_t count,	/* amount of bytes to write */
1283 				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
1284 							 * new current position before returning. */
1285 				   )
1286 {
1287 	size_t already_written = 0;	// Number of bytes already written to the file.
1288 	loff_t pos;		// Current position in the file.
1289 	ssize_t res;		// return value of various functions that we call.
1290 	int err = 0;
1291 	struct inode *inode = file->f_path.dentry->d_inode;	// Inode of the file that we are writing to.
1292 	/* To simplify coding at this time, we store
1293 	   locked pages in an array for now */
1294 	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1295 	struct reiserfs_transaction_handle th;
1296 	th.t_trans_id = 0;
1297 
1298 	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
1299 	* lying around (most of the disk, in fact). Despite the filesystem
1300 	* now being a v3.6 format, the old items still can't support large
1301 	* file sizes. Catch this case here, as the rest of the VFS layer is
1302 	* oblivious to the different limitations between old and new items.
1303 	* reiserfs_setattr catches this for truncates. This chunk is lifted
1304 	* from generic_write_checks. */
1305 	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1306 	    *ppos + count > MAX_NON_LFS) {
1307 		if (*ppos >= MAX_NON_LFS) {
1308 			send_sig(SIGXFSZ, current, 0);
1309 			return -EFBIG;
1310 		}
1311 		if (count > MAX_NON_LFS - (unsigned long)*ppos)
1312 			count = MAX_NON_LFS - (unsigned long)*ppos;
1313 	}
1314 
1315 	if (file->f_flags & O_DIRECT)
1316 		return do_sync_write(file, buf, count, ppos);
1317 
1318 	if (unlikely((ssize_t) count < 0))
1319 		return -EINVAL;
1320 
1321 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1322 		return -EFAULT;
1323 
1324 	mutex_lock(&inode->i_mutex);	// locks the entire file for just us
1325 
1326 	pos = *ppos;
1327 
1328 	/* Check if we can write to the specified region of the file, that the file
1329 	   is not overly big, and this kind of stuff. Adjust pos and
1330 	   count, if needed */
1331 	res = generic_write_checks(file, &pos, &count, 0);
1332 	if (res)
1333 		goto out;
1334 
1335 	if (count == 0)
1336 		goto out;
1337 
1338 	res = remove_suid(file->f_path.dentry);
1339 	if (res)
1340 		goto out;
1341 
1342 	file_update_time(file);
1343 
1344 	// Ok, we are done with all the checks.
1345 
1346 	// Now we should start the real work
1347 
1348 	/* If we are going to write past the file's packed tail or if we are going
1349 	   to overwrite part of the tail, we need that tail to be converted into
1350 	   unformatted node */
1351 	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
1352 	if (res)
1353 		goto out;
1354 
1355 	while (count > 0) {
1356 		/* This is the main loop, in which we run until some error occurs
1357 		   or until we write all of the data. */
1358 		size_t num_pages;	/* amount of pages we are going to write this iteration */
1359 		size_t write_bytes;	/* amount of bytes to write during this iteration */
1360 		size_t blocks_to_allocate;	/* how much blocks we need to allocate for this iteration */
1361 
1362 		/* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for the offset of pos within a page */
1363 		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
1364 									   pages */
1365 		    ((count +
1366 		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
1367 		/* convert size to number of
1368 		   pages */
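		/*
		 * Worked example (illustrative, PAGE_CACHE_SIZE == 4096):
		 * pos == 5000, count == 6000:
		 *
		 *	!!((5000 + 6000) & 4095) == 1		(partial last page)
		 *	(6000 + (5000 & 4095)) >> 12 == 1	(whole pages covered)
		 *
		 * so num_pages == 2 for this iteration.
		 */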
1369 		reiserfs_write_lock(inode->i_sb);
1370 		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1371 		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
1372 			/* If we were asked to write more data than we want to or if there
1373 			   is not that much space, then we shorten the amount of data to write
1374 			   for this iteration. */
1375 			num_pages =
1376 			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
1377 				  reiserfs_can_fit_pages(inode->i_sb));
1378 			/* Also we should not forget to set size in bytes accordingly */
1379 			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1380 			    (pos & (PAGE_CACHE_SIZE - 1));
1381 			/* If position is not on the
1382 			   start of the page, we need
1383 			   to subtract the offset
1384 			   within the page */
1385 		} else
1386 			write_bytes = count;
1387 
1388 		/* reserve the blocks to be allocated later, so that later on
1389 		   we still have the space to write the blocks to */
1390 		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1391 						      num_pages <<
1392 						      (PAGE_CACHE_SHIFT -
1393 						       inode->i_blkbits));
1394 		reiserfs_write_unlock(inode->i_sb);
1395 
1396 		if (!num_pages) {	/* If we do not have enough space even for a single page... */
1397 			if (pos >
1398 			    inode->i_size + inode->i_sb->s_blocksize -
1399 			    (pos & (inode->i_sb->s_blocksize - 1))) {
1400 				res = -ENOSPC;
1401 				break;	// In case we are writing past the end of the last file block, break.
1402 			}
1403 			// Otherwise we are possibly overwriting the file, so
1404 			// let's set the write size to be equal to or less than the blocksize.
1405 			// This way we get it correctly for file holes.
1406 			// But overwriting files on absolutely full volumes would not
1407 			// be very efficient. Well, people are not supposed to fill
1408 			// 100% of disk space anyway.
1409 			write_bytes =
1410 			    min_t(size_t, count,
1411 				  inode->i_sb->s_blocksize -
1412 				  (pos & (inode->i_sb->s_blocksize - 1)));
1413 			num_pages = 1;
1414 			// No blocks were claimed before, so do it now.
1415 			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1416 							      1 <<
1417 							      (PAGE_CACHE_SHIFT
1418 							       -
1419 							       inode->
1420 							       i_blkbits));
1421 		}
1422 
1423 		/* Prepare for writing into the region, read in all the
1424 		   partially overwritten pages, if needed. And lock the pages,
1425 		   so that nobody else can access these until we are done.
1426 		   We get number of actual blocks needed as a result. */
1427 		res = reiserfs_prepare_file_region_for_write(inode, pos,
1428 							     num_pages,
1429 							     write_bytes,
1430 							     prepared_pages);
1431 		if (res < 0) {
1432 			reiserfs_release_claimed_blocks(inode->i_sb,
1433 							num_pages <<
1434 							(PAGE_CACHE_SHIFT -
1435 							 inode->i_blkbits));
1436 			break;
1437 		}
1438 
1439 		blocks_to_allocate = res;
1440 
1441 		/* First we correct our estimate of how many blocks we need */
1442 		reiserfs_release_claimed_blocks(inode->i_sb,
1443 						(num_pages <<
1444 						 (PAGE_CACHE_SHIFT -
1445 						  inode->i_sb->
1446 						  s_blocksize_bits)) -
1447 						blocks_to_allocate);
1448 
1449 		if (blocks_to_allocate > 0) {	/* We only allocate blocks if we need to */
1450 			/* Fill in all the possible holes and append the file if needed */
1451 			res =
1452 			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
1453 								num_pages,
1454 								write_bytes,
1455 								prepared_pages,
1456 								blocks_to_allocate);
1457 		}
1458 
1459 		/* well, we have allocated the blocks, so it is time to free
1460 		   the reservation we made earlier. */
1461 		reiserfs_release_claimed_blocks(inode->i_sb,
1462 						blocks_to_allocate);
1463 		if (res) {
1464 			reiserfs_unprepare_pages(prepared_pages, num_pages);
1465 			break;
1466 		}
1467 
1468 /* NOTE that allocating blocks and filling blocks can be done in reverse order
1469    and probably we would do that just to get rid of garbage in files after a
1470    crash */
1471 
1472 		/* Copy data from user-supplied buffer to file's pages */
1473 		res =
1474 		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
1475 							   write_bytes,
1476 							   prepared_pages, buf);
1477 		if (res) {
1478 			reiserfs_unprepare_pages(prepared_pages, num_pages);
1479 			break;
1480 		}
1481 
1482 		/* Send the pages to disk and unlock them. */
1483 		res =
1484 		    reiserfs_submit_file_region_for_write(&th, inode, pos,
1485 							  num_pages,
1486 							  write_bytes,
1487 							  prepared_pages);
1488 		if (res)
1489 			break;
1490 
1491 		already_written += write_bytes;
1492 		buf += write_bytes;
1493 		*ppos = pos += write_bytes;
1494 		count -= write_bytes;
1495 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
1496 	}
1497 
1498 	/* this is only true on error */
1499 	if (th.t_trans_id) {
1500 		reiserfs_write_lock(inode->i_sb);
1501 		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
1502 		reiserfs_write_unlock(inode->i_sb);
1503 		if (err) {
1504 			res = err;
1505 			goto out;
1506 		}
1507 	}
1508 
1509 	if (likely(res >= 0) &&
1510 	    (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))))
1511 		res = generic_osync_inode(inode, file->f_mapping,
1512 		                          OSYNC_METADATA | OSYNC_DATA);
1513 
1514 	mutex_unlock(&inode->i_mutex);
1515 	reiserfs_async_progress_wait(inode->i_sb);
1516 	return (already_written != 0) ? already_written : res;
1517 
1518       out:
1519 	mutex_unlock(&inode->i_mutex);	// unlock the file on exit.
1520 	return res;
1521 }
1522 
1523 const struct file_operations reiserfs_file_operations = {
1524 	.read = do_sync_read,
1525 	.write = reiserfs_file_write,
1526 	.ioctl = reiserfs_ioctl,
1527 #ifdef CONFIG_COMPAT
1528 	.compat_ioctl = reiserfs_compat_ioctl,
1529 #endif
1530 	.mmap = reiserfs_file_mmap,
1531 	.open = generic_file_open,
1532 	.release = reiserfs_file_release,
1533 	.fsync = reiserfs_sync_file,
1534 	.aio_read = generic_file_aio_read,
1535 	.aio_write = generic_file_aio_write,
1536 	.splice_read = generic_file_splice_read,
1537 	.splice_write = generic_file_splice_write,
1538 };
1539 
1540 const struct inode_operations reiserfs_file_inode_operations = {
1541 	.truncate = reiserfs_vfs_truncate_file,
1542 	.setattr = reiserfs_setattr,
1543 	.setxattr = reiserfs_setxattr,
1544 	.getxattr = reiserfs_getxattr,
1545 	.listxattr = reiserfs_listxattr,
1546 	.removexattr = reiserfs_removexattr,
1547 	.permission = reiserfs_permission,
1548 };
1549