/* xref: /openbmc/linux/fs/reiserfs/file.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7) */
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
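/*
** For example, a 300-byte file assembled by several small write()
** calls keeps its tail in an unformatted node while it is open; only
** the final release converts it back into a direct item, so the tree
** is balanced once instead of after every write.
*/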
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{
	struct reiserfs_transaction_handle th;
	int err;
	int jbegin_failure = 0;

	if (!S_ISREG(inode->i_mode))
		BUG();

	/* fast out for when nothing needs to be done */
	if ((atomic_read(&inode->i_count) > 1 ||
	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
	     !tail_has_to_be_packed(inode)) &&
	    REISERFS_I(inode)->i_prealloc_count <= 0) {
		return 0;
	}
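	/* In other words, the fast path above is skipped only when this is
	 * the last holder of a file whose tail is a candidate for packing,
	 * or when preallocated blocks are still pending release. */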

	reiserfs_write_lock(inode->i_sb);
	down(&inode->i_sem);
	/* freeing preallocation only involves relogging blocks that
	 * are already in the current transaction.  preallocation gets
	 * freed at the end of each transaction, so it is impossible for
	 * us to log any additional blocks (including quota blocks)
	 */
	err = journal_begin(&th, inode->i_sb, 1);
	if (err) {
		/* uh oh, we can't allow the inode to go away while there
		 * are still preallocated blocks pending.  Try to join the
		 * aborted transaction
		 */
		jbegin_failure = err;
		err = journal_join_abort(&th, inode->i_sb, 1);

		if (err) {
			/* hmpf, our choices here aren't good.  We can pin the inode
			 * which will disallow unmount from ever happening, we can
			 * do nothing, which will corrupt random memory on unmount,
			 * or we can forcibly remove the file from the preallocation
			 * list, which will leak blocks on disk.  Let's pin the inode
			 * and let the admin know what is going on.
			 */
			igrab(inode);
			reiserfs_warning(inode->i_sb,
					 "pinning inode %lu because the "
					 "preallocation can't be freed",
					 inode->i_ino);
			goto out;
		}
	}
	reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
	reiserfs_discard_prealloc(&th, inode);
#endif
	err = journal_end(&th, inode->i_sb, 1);

	/* copy back the error code from journal_begin */
	if (!err)
		err = jbegin_failure;

	if (!err && atomic_read(&inode->i_count) <= 1 &&
	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	    tail_has_to_be_packed(inode)) {
		/* if the regular file is released by its last holder and it has
		   been appended (we append by unformatted node only) or its
		   direct item(s) had to be converted, then it may have to be
		   indirect2direct converted */
		err = reiserfs_truncate_file(inode, 0);
	}
      out:
	up(&inode->i_sem);
	reiserfs_write_unlock(inode->i_sb);
	return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(struct file *p_s_filp,
			      struct dentry *p_s_dentry, int datasync)
{
	struct inode *p_s_inode = p_s_dentry->d_inode;
	int n_err;
	int barrier_done;

	if (!S_ISREG(p_s_inode->i_mode))
		BUG();
	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
	reiserfs_write_lock(p_s_inode->i_sb);
	barrier_done = reiserfs_commit_for_inode(p_s_inode);
	reiserfs_write_unlock(p_s_inode->i_sb);
	if (barrier_done != 1)
		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
	if (barrier_done < 0)
		return barrier_done;
	return (n_err < 0) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages
   at a time. This still should considerably improve performance compared to
   the 4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)

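/* A quick check of the arithmetic, assuming the common 4k page size:
   (128 * 1024) / 4096 == 32, i.e. each pass of the write loop touches
   at most 32 pages. */
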
/* Allocates blocks for a file to fulfil a write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated block numbers as needed */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
					       loff_t pos,	/* Writing position */
					       int num_pages,	/* number of pages the write is
								   going to touch */
					       int write_bytes,	/* amount of bytes to write */
					       struct page **prepared_pages,	/* array of
										   prepared pages
										 */
					       int blocks_to_allocate	/* Amount of blocks we
									   need to allocate to
									   fit the data into the
									   file
									 */
    )
{
	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih;	// pointer to item head that we are going to deal with
	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
	__le32 *item;		// pointer to item we are going to deal with
	INITIALIZE_PATH(path);	// path to the item that we are going to deal with.
	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated block numbers will be stored.
	reiserfs_blocknr_hint_t hint;	// hint structure for the block allocator.
	int res;		// return value of various functions that we call (int, since it carries negative error codes).
	int curr_block;		// current block used to keep track of unmapped blocks.
	int i;			// loop counter
	int itempos;		// position in item
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
	// the first page
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
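	/* Worked example, assuming 4k pages: for pos == 5000 and
	   write_bytes == 10000, from == 5000 & 4095 == 904 and
	   to == (14999 & 4095) + 1 == 2712, i.e. the last page is
	   modified up to (but not including) byte 2712. */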
	__u64 hole_size;	// amount of blocks for a file hole, if it needs to be created.
	int modifying_this_item = 0;	// Flag for the item traversal code to keep track
	// of the fact that we already prepared the
	// current block for the journal
	int will_prealloc = 0;
	RFALSE(!blocks_to_allocate,
	       "green-9004: tried to allocate zero blocks?");

	/* only preallocate if this is a small write */
	if (REISERFS_I(inode)->i_prealloc_count ||
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
	     blocks_to_allocate <
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
		will_prealloc =
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
				   sizeof(b_blocknr_t), GFP_NOFS);
	if (!allocated_blocks)
		return -ENOMEM;

	/* First we compose a key to point at the writing position; we want to do
	   that outside of any locking region. */
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );

	/* If we came here, it means we absolutely need to open a transaction,
	   since we need to allocate some blocks */
	reiserfs_write_lock(inode->i_sb);	// Journaling stuff, and we need that.
	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I knew if this number is enough
	if (res)
		goto error_exit;
	reiserfs_update_inode_transaction(inode);

	/* Look for the in-tree position of our write; we need the path for the block allocator */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit;
	}

	/* Allocate blocks */
	/* First fill in the "hint" structure for the block allocator */
	hint.th = th;		// transaction handle.
	hint.path = &path;	// Path, so that the block allocator can determine packing locality or whatever it needs to determine.
	hint.inode = inode;	// Inode is needed by the block allocator too.
	hint.search_start = 0;	// We have no hint on where to search for free blocks for the block allocator.
	hint.key = key.on_disk_key;	// on disk key of file.
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file already occupies.
	hint.formatted_node = 0;	// We are allocating blocks for an unformatted node.
	hint.preallocate = will_prealloc;

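	/* Note on hint.block: i_blocks counts 512-byte sectors, so shifting
	   right by (s_blocksize_bits - 9) converts it to filesystem blocks;
	   e.g. for a 4k block size the shift is 12 - 9 == 3. */
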
	/* Call the block allocator to allocate blocks */
	res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
					 blocks_to_allocate, blocks_to_allocate);
	if (res != CARRY_ON) {
		if (res == NO_DISK_SPACE) {
			/* We flush the transaction in case of no space. This way some
			   blocks might become free */
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
			res = restart_transaction(th, inode, &path);
			if (res)
				goto error_exit;

			/* We might have scheduled, so search again */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit;
			}

			/* update changed info for hint structure. */
			res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
							 blocks_to_allocate,
							 blocks_to_allocate);
			if (res != CARRY_ON) {
				res = -ENOSPC;
				pathrelse(&path);
				goto error_exit;
			}
		} else {
			res = -ENOSPC;
			pathrelse(&path);
			goto error_exit;
		}
	}
#ifdef __BIG_ENDIAN
	// Too bad, I have not found any way to convert a given region from
	// cpu format to little endian format
	{
		int i;
		for (i = 0; i < blocks_to_allocate; i++)
			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
	}
#endif

	/* The block allocation might well have scheduled and the tree might have
	   changed, so let's search the tree again */
	/* find where in the tree our write should go */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	}

	bh = get_last_bh(&path);	// Get a bufferhead for the last element in the path.
	ih = get_ih(&path);	// Get a pointer to the last item head in the path.
	item = get_item(&path);	// Get a pointer to the last item in the path

	/* Let's see what we have found */
	if (res != POSITION_FOUND) {	/* position not found, this means that we
					   might need to append the file with holes
					   first */
		// Since we are writing past the file's end, we need to find out if
		// there is a hole that needs to be inserted before our writing
		// position, and how many blocks it is going to cover (we need to
		// populate the pointers to file blocks representing the hole with zeros)

		{
			int item_offset = 1;
			/*
			 * if ih is stat data, its offset is 0 and we don't want to
			 * add 1 to pos in the hole_size calculation
			 */
			if (is_statdata_le_ih(ih))
				item_offset = 0;
			hole_size = (pos + item_offset -
				     (le_key_k_offset
				      (get_inode_item_key_version(inode),
				       &(ih->ih_key)) +
				      op_bytes_number(ih, inode->i_sb->s_blocksize)))
			    >> inode->i_sb->s_blocksize_bits;
		}
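
		/* Worked example, assuming 4k blocks: if the file body ends
		   after two blocks (key offset 1 plus op_bytes_number 8192,
		   i.e. byte 8193 in reiserfs' 1-based offsets) and pos is
		   20480, then hole_size == (20481 - 8193) >> 12 == 3: the
		   three missing blocks covering bytes 8192..20479. */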

		if (hole_size > 0) {
			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
			/* area filled with zeroes, to supply as a list of zero block
			   numbers.  We allocate it outside of the loop just in case the
			   loop would spin for several iterations. */
			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
			if (!zeros) {
				res = -ENOMEM;
				goto error_exit_free_blocks;
			}
			memset(zeros, 0, to_paste * UNFM_P_SIZE);
			do {
				to_paste = min_t(__u64, hole_size,
						 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
						 UNFM_P_SIZE);
				if (is_indirect_le_ih(ih)) {
					/* Ok, there is an existing indirect item already.
					   Need to append it */
					/* Calculate position past the inserted item */
					make_cpu_key(&key, inode,
						     le_key_k_offset
						     (get_inode_item_key_version(inode),
						      &(ih->ih_key)) +
						     op_bytes_number(ih,
								     inode->i_sb->s_blocksize),
						     TYPE_INDIRECT, 3);
					res = reiserfs_paste_into_item(th, &path, &key,
								       inode, (char *)zeros,
								       UNFM_P_SIZE * to_paste);
					if (res) {
						kfree(zeros);
						goto error_exit_free_blocks;
					}
				} else if (is_statdata_le_ih(ih)) {
					/* No existing item, create it */
					/* item head for the new item */
					struct item_head ins_ih;

					/* create a key for our new item */
					make_cpu_key(&key, inode, 1,
						     TYPE_INDIRECT, 3);

					/* Create a new item head for our new item */
					make_le_item_head(&ins_ih, &key,
							  key.version, 1,
							  TYPE_INDIRECT,
							  to_paste * UNFM_P_SIZE,
							  0 /* free space */ );

					/* Find where such an item should live in the tree */
					res = search_item(inode->i_sb, &key, &path);
					if (res != ITEM_NOT_FOUND) {
						/* item should not exist, otherwise we have an error */
						if (res != -ENOSPC) {
							reiserfs_warning(inode->i_sb,
									 "green-9008: search_by_key (%K) returned %d",
									 &key, res);
						}
						res = -EIO;
						kfree(zeros);
						goto error_exit_free_blocks;
					}
					res = reiserfs_insert_item(th, &path, &key,
								   &ins_ih, inode,
								   (char *)zeros);
				} else {
					reiserfs_panic(inode->i_sb,
						       "green-9011: Unexpected key type %K\n",
						       &key);
				}
				if (res) {
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				/* Now we want to check if the transaction is too full, and
				   if it is we restart it. This will also free the path. */
				if (journal_transaction_should_end
				    (th, th->t_blocks_allocated)) {
					res = restart_transaction(th, inode, &path);
					if (res) {
						pathrelse(&path);
						kfree(zeros);
						goto error_exit;
					}
				}

				/* Well, we need to recalculate the path and such */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     (to_paste << inode->i_blkbits));
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				if (res == IO_ERROR) {
					res = -EIO;
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				bh = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				hole_size -= to_paste;
			} while (hole_size);
			kfree(zeros);
		}
	}
	// Go through existing indirect items first
	// and replace all zeroes with block numbers from the list.
	// Note that if no corresponding item was found by the previous search,
	// it means there is no existing in-tree representation for the file area
	// we are going to overwrite, so there is nothing to scan through for holes.
	for (curr_block = 0, itempos = path.pos_in_item;
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
	      retry:

		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
			/* We ran out of data in this indirect item, let's look for
			   another one. */
			/* First, if we are already modifying the current item, log it */
			if (modifying_this_item) {
				journal_mark_dirty(th, inode->i_sb, bh);
				modifying_this_item = 0;
			}
			/* Then set the key to look for a new indirect item (the offset
			   of the old item is added to the old item's length) */
			set_cpu_key_k_offset(&key,
					     le_key_k_offset
					     (get_inode_item_key_version(inode),
					      &(ih->ih_key)) +
					     op_bytes_number(ih,
							     inode->i_sb->s_blocksize));
			/* Search for the position of the new key in the tree. */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit_free_blocks;
			}
			bh = get_last_bh(&path);
			ih = get_ih(&path);
			item = get_item(&path);
			itempos = path.pos_in_item;
			continue;	// loop to check all kinds of conditions and so on.
		}
		/* Ok, we have the correct position in the item now, so let's see if it
		   is representing a file hole (the block number is zero) and fill it
		   if needed */
		if (!item[itempos]) {
			/* Ok, a hole. Now we need to check if we already prepared this
			   block to be journaled */
			while (!modifying_this_item) {	// loop until we succeed
				/* Well, this item is not journaled yet, so we must prepare
				   it for the journal first, before we can change it */
				struct item_head tmp_ih;	// We copy the item head of the found
				// item here to detect if the fs changed
				// under us while we were preparing for
				// the journal.
				int fs_gen;	// We store the fs generation here to find out if someone
				// changes the fs under our feet

				copy_item_head(&tmp_ih, ih);	// Remember the itemhead
				fs_gen = get_generation(inode->i_sb);	// remember the fs generation
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare the buffer within which the indirect item is stored for changing.
				if (fs_changed(fs_gen, inode->i_sb)
				    && item_moved(&tmp_ih, &path)) {
					// Sigh, the fs was changed under us, we need to look for the
					// new location of the item we are working with

					/* unmark the prepared area as journaled and search for its
					   new position */
					reiserfs_restore_prepared_buffer(inode->i_sb,
									 bh);
					res = search_for_position_by_key(inode->i_sb,
									 &key, &path);
					if (res == IO_ERROR) {
						res = -EIO;
						goto error_exit_free_blocks;
					}
					bh = get_last_bh(&path);
					ih = get_ih(&path);
					item = get_item(&path);
					itempos = path.pos_in_item;
					goto retry;
				}
				modifying_this_item = 1;
			}
			item[itempos] = allocated_blocks[curr_block];	// Assign the new block
			curr_block++;
		}
		itempos++;
	}

	if (modifying_this_item) {	// We need to log the last-accessed block if it
		// was modified but not logged yet.
		journal_mark_dirty(th, inode->i_sb, bh);
	}

	if (curr_block < blocks_to_allocate) {
		// Oh well, we need to append to an indirect item, or to create such
		// an item if there wasn't any
		if (is_indirect_le_ih(ih)) {
			// Existing indirect item - append. First calculate the key for the
			// append position. We do not need to recalculate the path as it
			// should already point to the correct place.
			make_cpu_key(&key, inode,
				     le_key_k_offset(get_inode_item_key_version
						     (inode),
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     TYPE_INDIRECT, 3);
			res = reiserfs_paste_into_item(th, &path, &key, inode,
						       (char *)(allocated_blocks +
								curr_block),
						       UNFM_P_SIZE *
						       (blocks_to_allocate -
							curr_block));
			if (res) {
				goto error_exit_free_blocks;
			}
		} else if (is_statdata_le_ih(ih)) {
			// The last found item was statdata. That means we need to create
			// an indirect item.
			struct item_head ins_ih;	/* itemhead for the new item */

			/* create a key for our new item */
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
			// because that's where the first
			// indirect item begins
			/* Create a new item head for our new item */
			make_le_item_head(&ins_ih, &key, key.version, 1,
					  TYPE_INDIRECT,
					  (blocks_to_allocate -
					   curr_block) * UNFM_P_SIZE,
					  0 /* free space */ );
			/* Find where such an item should live in the tree */
			res = search_item(inode->i_sb, &key, &path);
			if (res != ITEM_NOT_FOUND) {
				/* Well, if we have found such an item already, or some error
				   occurred, we need to warn the user and return an error */
				if (res != -ENOSPC) {
					reiserfs_warning(inode->i_sb,
							 "green-9009: search_by_key (%K) "
							 "returned %d", &key,
							 res);
				}
				res = -EIO;
				goto error_exit_free_blocks;
			}
			/* Insert the item into the tree with the data as its body */
			res = reiserfs_insert_item(th, &path, &key, &ins_ih,
						   inode,
						   (char *)(allocated_blocks +
							    curr_block));
		} else {
			reiserfs_panic(inode->i_sb,
				       "green-9010: unexpected item type for key %K\n",
				       &key);
		}
	}
	// the caller is responsible for closing the transaction
	// unless we return an error; they are also responsible for logging
	// the inode.
	//
	pathrelse(&path);
	/*
	 * cleanup preallocation from previous writes
	 * if this is a partial block write
	 */
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
		reiserfs_discard_prealloc(th, inode);
	reiserfs_write_unlock(inode->i_sb);

	// go through all the pages/buffers and map the buffers to the newly
	// allocated blocks (so that the system knows where to write these
	// pages later).
	curr_block = 0;
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];	// current page
		struct buffer_head *head;	// first buffer for a page
		int block_start, block_end;	// in-page offsets for buffers.

		if (!page_has_buffers(page))
			reiserfs_panic(inode->i_sb,
				       "green-9005: No buffers for prepared page???");
		head = page_buffers(page);

		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9006: Allocated but absent buffer for a page?");
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before the requested data to map, skip it */
				continue;
			if (i == num_pages - 1 && block_start >= to)
				/* If this buffer is after the requested data to map, abort
				   processing of the current page */
				break;

			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
				map_bh(bh, inode->i_sb,
				       le32_to_cpu(allocated_blocks[curr_block]));
				curr_block++;
				set_buffer_new(bh);
			}
		}
	}

	RFALSE(curr_block > blocks_to_allocate,
	       "green-9007: Used too many blocks? weird");

	kfree(allocated_blocks);
	return 0;

// Need to deal with the transaction here.
      error_exit_free_blocks:
	pathrelse(&path);
	// free blocks
	for (i = 0; i < blocks_to_allocate; i++)
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
				    1);

      error_exit:
	if (th->t_trans_id) {
		int err;
		// update any changes we made to blk count
		mark_inode_dirty(inode);
		err = journal_end(th, inode->i_sb,
				  JOURNAL_PER_BALANCE_CNT * 3 + 1 +
				  2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
		if (err)
			res = err;
	}
	reiserfs_write_unlock(inode->i_sb);
	kfree(allocated_blocks);

	return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages /* number of pages */ )
{
	int i;			// loop counter

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}

/* This function will copy data from userspace to the specified pages within
   the supplied byte range */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
						  int num_pages,	/* Number of pages affected */
						  int write_bytes,	/* Amount of bytes to write */
						  struct page **prepared_pages,	/* pointer to
										   array of
										   prepared pages
										 */
						  const char __user * buf	/* Pointer to user-supplied
										   data */
    )
{
	long page_fault = 0;	// status of copy_from_user.
	int i;			// loop counter.
	int offset;		// offset in page

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
		/* Flush the processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;	// Was there a fault? Abort.
	}

	return page_fault ? -EFAULT : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
			 unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int new;
	int logit = reiserfs_file_data_log(inode);
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	struct reiserfs_transaction_handle th;
	int ret = 0;

	th.t_trans_id = 0;
	blocksize = 1 << inode->i_blkbits;

	if (logit) {
		reiserfs_write_lock(s);
		ret = journal_begin(&th, s, bh_per_page + 1);
		if (ret)
			goto drop_write_lock;
		reiserfs_update_inode_transaction(inode);
	}
	for (bh = head = page_buffers(page), block_start = 0;
	     bh != head || !block_start;
	     block_start = block_end, bh = bh->b_this_page) {

		new = buffer_new(bh);
		clear_buffer_new(bh);
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			if (logit) {
				reiserfs_prepare_for_journal(s, bh, 1);
				journal_mark_dirty(&th, s, bh);
			} else if (!buffer_dirty(bh)) {
				mark_buffer_dirty(bh);
				/* do data=ordered on any page past the end
				 * of file and any buffer marked BH_New.
				 */
				if (reiserfs_data_ordered(inode->i_sb) &&
				    (new || page->index >= i_size_index)) {
					reiserfs_add_ordered_list(inode, bh);
				}
			}
		}
	}
	if (logit) {
		ret = journal_end(&th, s, bh_per_page + 1);
	      drop_write_lock:
		reiserfs_write_unlock(s);
	}
	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return ret;
}

/* Submit pages for write. This was separated from the actual file copying
   because we might want to allocate block numbers in between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
						 size_t num_pages,	/* Number of pages to write */
						 size_t write_bytes,	/* number of bytes to write */
						 struct page **prepared_pages	/* list of pages */
    )
{
	int status;		// return status of block_commit_write.
	int retval = 0;		// Return value we are going to return.
	int i;			// loop counter
	int offset;		// Writing offset in page.
	int orig_write_bytes = write_bytes;
	int sd_update = 0;

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		status =
		    reiserfs_commit_page(inode, page, offset, offset + count);
		if (status)
			retval = status;	// To not overcomplicate matters we are going to
		// submit all the pages even if there was an error.
		// We only remember the error status to report it on
		// exit.
		write_bytes -= count;
	}
	/* now that we've gotten all the ordered buffers marked dirty,
	 * we can safely update i_size and close any running transaction
	 */
	if (pos + orig_write_bytes > inode->i_size) {
		inode->i_size = pos + orig_write_bytes;	// Set new size
		/* If the file has grown so much that tail packing is no
		 * longer possible, reset the "need to pack" flag */
		if ((have_large_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode) * 4) ||
		    (have_small_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		else if ((have_large_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode) * 4) ||
			 (have_small_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode)))
			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;

		if (th->t_trans_id) {
			reiserfs_write_lock(inode->i_sb);
			// this sets the proper flags for O_SYNC to trigger a commit
			mark_inode_dirty(inode);
			reiserfs_write_unlock(inode->i_sb);
		} else
			mark_inode_dirty(inode);

		sd_update = 1;
	}
	if (th->t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		if (!sd_update)
			mark_inode_dirty(inode);
		status = journal_end(th, th->t_super, th->t_blocks_allocated);
		if (status)
			retval = status;
		reiserfs_write_unlock(inode->i_sb);
	}
	th->t_trans_id = 0;

	/*
	 * we have to unlock the pages after updating i_size, otherwise
	 * we race with writepage
	 */
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
	return retval;
}

/* Look if the passed writing region is going to touch the file's tail
   (if it is present). And if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
					       loff_t pos,	/* Writing position */
					       int write_bytes	/* amount of bytes to write */
    )
{
	INITIALIZE_PATH(path);	// needed for search_for_position
	struct cpu_key key;	// Key that would represent last touched writing byte.
	struct item_head *ih;	// item header of found block;
	int res;		// Return value of various functions we call.
	int cont_expand_offset;	// We will put the offset for generic_cont_expand here
	// This can be int just because tails are created
	// only for small files.

/* this embodies a dependency on a particular tail policy */
	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
		/* such big files do not have tails, so we won't bother
		   to look for one, simply return */
		return 0;
	}

	reiserfs_write_lock(inode->i_sb);
	/* find the item containing the last byte to be written, or if
	 * writing past the end of the file then the last item of the
	 * file (and then we check its type). */
	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
		     3 /*key length */ );
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		reiserfs_write_unlock(inode->i_sb);
		return -EIO;
	}
	ih = get_ih(&path);
	res = 0;
	if (is_direct_le_ih(ih)) {
		/* Ok, the closest item is a file tail (tails are stored in "direct"
		 * items), so we need to unpack it. */
		/* To not overcomplicate matters, we just call generic_cont_expand,
		   which will in turn call other stuff and finally will boil down to
		   reiserfs_get_block() that would do the necessary conversion. */
		cont_expand_offset =
		    le_key_k_offset(get_inode_item_key_version(inode),
				    &(ih->ih_key));
		pathrelse(&path);
		res = generic_cont_expand(inode, cont_expand_offset);
	} else
		pathrelse(&path);

	reiserfs_write_unlock(inode->i_sb);
	return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Buffers are also allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   the new file data. */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
						  /* Inode of the file */ ,
						  loff_t pos,	/* position in the file */
						  size_t num_pages,	/* number of pages to
									   prepare */
						  size_t write_bytes,	/* Amount of bytes to be
									   overwritten from
									   @pos */
						  struct page **prepared_pages	/* pointer to array
										   where to store
										   prepared pages */
    )
{
	int res = 0;		// Return values of different functions we call.
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	/* offset of last modified byte in last
	   page */
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
	int i;			// Simple counter
	int blocks = 0;		/* Return value (blocks that should be allocated) */
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
	// of a page.
	unsigned block_start, block_end;	// Starting and ending offsets of current
	// buffer in the page.
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for the page, if
	// the page appeared to be not up
	// to date. Note how we have at
	// most 2 buffers: this is because
	// we may at most partially
	// overwrite two buffers for one
	// page, one at the beginning of
	// the write area and one at the
	// end. Everything in the middle
	// gets overwritten totally.

	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
	INITIALIZE_PATH(path);	// path to the item that we are going to deal with.
	__le32 *item = NULL;	// pointer to item we are going to deal with
	int item_pos = -1;	/* Position in indirect item */

	if (num_pages < 1) {
		reiserfs_warning(inode->i_sb,
				 "green-9001: reiserfs_prepare_file_region_for_write "
				 "called with zero number of pages to process");
		return -EFAULT;
	}

	/* We have 2 loops for pages. In the first loop we grab and lock the pages,
	   so that nobody can touch these until we release the pages. Then
	   we start to deal with mapping buffers to blocks. */
	for (i = 0; i < num_pages; i++) {
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
		if (!prepared_pages[i]) {
			res = -ENOMEM;
			goto failed_page_grabbing;
		}
		if (!page_has_buffers(prepared_pages[i]))
			create_empty_buffers(prepared_pages[i],
					     inode->i_sb->s_blocksize, 0);
	}

	/* Let's count the number of blocks for the case where all the blocks
	   overwritten are new (we will subtract already allocated blocks later) */
	if (num_pages > 2)
		/* These pages are fully overwritten, so all of their blocks are
		   counted as needing to be allocated */
		blocks =
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* count blocks needed for the first page (possibly partially written) */
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */

	/* Now we account for the last page. If the last page == first page (we
	   overwrite only one page), we subtract all the blocks past the
	   last writing position in a page out of the already calculated number
	   of blocks */
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	/* Note how we do not round up here, since partial blocks still
	   should be allocated */
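
	/* Worked example, assuming 4k pages and 1k blocks: writing 10000
	   bytes at pos == 5000 prepares 3 pages (from == 904, to == 2712).
	   The middle page contributes (3 - 2) << 2 == 4 blocks, the first
	   page ((4096 - 904) >> 10) + 1 == 4, and the last page
	   4 - ((4096 - 2712) >> 10) == 3, i.e. 11 blocks in total before
	   already-mapped blocks are subtracted. */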

	/* Now if the whole write area lies past the file end, there is no point
	   in mapping blocks, since there are none, so we just zero out the
	   remaining parts of the first and last pages in the write area
	   (if needed) */
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
		if (from != 0) {	/* First page needs to be partially zeroed */
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
			memset(kaddr, 0, from);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
			char *kaddr =
			    kmap_atomic(prepared_pages[num_pages - 1],
					KM_USER0);
			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
			kunmap_atomic(kaddr, KM_USER0);
		}

		/* Since all blocks are new - use the already calculated value */
		return blocks;
	}

	/* Well, since we write somewhere into the middle of a file, there is a
	   possibility we are writing over some already allocated blocks, so
	   let's map these blocks and subtract the number of such blocks from the
	   blocks we need to allocate (calculated above) */
	/* Mask the write position to start on a block boundary; we do it out of
	   the loop for performance reasons */
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
	/* Set the cpu key to the starting position in the file (on the left
	   block boundary) */
	make_cpu_key(&key, inode,
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
		     TYPE_ANY, 3 /*key length */ );

	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
	for (i = 0; i < num_pages; i++) {

		head = page_buffers(prepared_pages[i]);
		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before the requested data to map, skip it */
				continue;

			if (i == num_pages - 1 && block_start >= to) {
				/* If this buffer is after the requested data to map, abort
				   processing of the current page */
				break;
			}

			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* This is an optimisation for the case where the buffer is
				   mapped and has a block number assigned. In case a
				   significant number of such buffers are present, we may
				   avoid some amount of search_by_key calls.
				   Probably it would be possible to move parts of this code
				   out of BKL, but I am afraid that would overcomplicate the
				   code without any noticeable benefit.
				 */
				item_pos++;
				/* Update the key */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     inode->i_sb->s_blocksize);
				blocks--;	// Decrease the number of blocks that need to be
				// allocated
				continue;	// Go to the next buffer
			}

			if (!itembuf ||	/* if first iteration */
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
										   current unformatted_item */
				/* Try to find the next item */
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				/* Abort if no more items */
				if (res != POSITION_FOUND) {
					/* make sure later loops don't use this item */
					itembuf = NULL;
					item = NULL;
					break;
				}

				/* Update information about the current indirect item */
				itembuf = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				item_pos = path.pos_in_item;

				RFALSE(!is_indirect_le_ih(ih),
				       "green-9003: indirect item expected");
			}

			/* See if there is some block associated with the file
			   at that position; if so, map the buffer to this block */
			if (get_block_num(item, item_pos)) {
				map_bh(bh, inode->i_sb,
				       get_block_num(item, item_pos));
				blocks--;	// Decrease the number of blocks that need to be
				// allocated
			}
			item_pos++;
			/* Update the key */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     inode->i_sb->s_blocksize);
		}
	}
	pathrelse(&path);	// Free the path
	reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of the
	   write area, or issue read requests if the page is mapped. */
	/* First page, see if it is not uptodate */
	if (!PageUptodate(prepared_pages[0])) {
		head = page_buffers(prepared_pages[0]);

		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_end <= from)
				/* if this buffer is before the requested data to map, skip it */
				continue;
			if (block_start < from) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   so as not to lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages[0],
							KM_USER0);
					memset(kaddr + block_start, 0,
					       from - block_start);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
		head = page_buffers(prepared_pages[num_pages - 1]);

		/* for each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_start >= to)
				/* if this buffer is after the requested data to map, skip it */
				break;
			if (block_end > to) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   so as not to lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages
							[num_pages - 1],
							KM_USER0);
					memset(kaddr + to, 0, block_end - to);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Wait for the read requests we made to happen, if necessary */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh)) {
			res = -EIO;
			goto failed_read;
		}
	}

	return blocks;
      failed_page_grabbing:
	num_pages = i;
      failed_read:
	reiserfs_unprepare_pages(prepared_pages, num_pages);
	return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not
   seeking to optimize performance and want something simple that works.  It
   is not for serious use by general purpose filesystems, excepting the one
   that it was written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on reiserfs_get_block() which, as implemented by reiserfs,
     performs a costly search_by_key operation for each page it is supplied
     with. By contrast, reiserfs_file_write() feeds as much as possible at a
     time to reiserfs, which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves
     memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly
     less efficient.

   All of these reasons for not using only generic file write were understood
   back when reiserfs was first miscoded to use it, but we were in a hurry to
   make the code freeze, and so it couldn't be revised then.  This new code
   should make things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
				   const char __user * buf,	/* pointer to user-supplied data
								   (in userspace) */
				   size_t count,	/* amount of bytes to write */
				   loff_t * ppos	/* pointer to the position in the file that
							 * we start writing at. Should be updated to
							 * the new current position before returning. */
				   )
{
	size_t already_written = 0;	// Number of bytes already written to the file.
	loff_t pos;		// Current position in the file.
	ssize_t res;		// return value of various functions that we call.
	int err = 0;
	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
	/* To simplify coding at this time, we store
	   locked pages in an array for now */
	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;

	if (file->f_flags & O_DIRECT) {	// Direct IO needs special treatment
		ssize_t result, after_file_end = 0;
		if ((*ppos + count >= inode->i_size)
		    || (file->f_flags & O_APPEND)) {
			/* If we are appending to the file, we need to put this savelink
			   in here. If we crash while doing direct io, finish_unfinished
			   will cut the garbage from the file end. */
			reiserfs_write_lock(inode->i_sb);
			err =
			    journal_begin(&th, inode->i_sb,
					  JOURNAL_PER_BALANCE_CNT);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			add_save_link(&th, inode, 1 /* Truncate */ );
			after_file_end = 1;
			err =
			    journal_end(&th, inode->i_sb,
					JOURNAL_PER_BALANCE_CNT);
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}
		result = generic_file_write(file, buf, count, ppos);

		if (after_file_end) {	/* Now update i_size and remove the savelink */
			struct reiserfs_transaction_handle th;
			reiserfs_write_lock(inode->i_sb);
			err = journal_begin(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			mark_inode_dirty(inode);
			err = journal_end(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			err = remove_save_link(inode, 1 /* truncate */ );
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}

		return result;
	}

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	down(&inode->i_sem);	// locks the entire file for just us

	pos = *ppos;

	/* Check that we can write to the specified region of the file, that the
	   file is not overly big, and so on. Adjust pos and count if needed */
	res = generic_write_checks(file, &pos, &count, 0);
	if (res)
		goto out;

	if (count == 0)
		goto out;

	res = remove_suid(file->f_dentry);
	if (res)
		goto out;

	inode_update_time(inode, 1);	/* Both mtime and ctime */

	// Ok, we are done with all the checks.

	// Now we should start the real work.

	/* If we are going to write past the file's packed tail or if we are going
	   to overwrite part of the tail, we need that tail to be converted into
	   an unformatted node */
	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
	if (res)
		goto out;

	while (count > 0) {
		/* This is the main loop, in which we run until some error occurs
		   or until we write all of the data. */
		size_t num_pages;	/* number of pages we are going to write this iteration */
		size_t write_bytes;	/* number of bytes to write during this iteration */
		ssize_t blocks_to_allocate;	/* how many blocks we need to allocate for this
						   iteration (signed, so that negative error
						   returns from the prepare step are preserved) */

		/* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for the offset of pos into a page */
		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
									   pages */
		    ((count +
		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
		/* convert size to the number of
		   pages */
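		/* E.g., assuming 4k pages: pos == 5000 and count == 10000 give
		   ((10000 + 904) >> 12) == 2 whole pages plus one more because
		   (15000 & 4095) != 0, so num_pages == 3. */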
		reiserfs_write_lock(inode->i_sb);
		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
			/* If we were asked to write more data than we want to, or if there
			   is not that much space, then we shorten the amount of data to
			   write for this iteration. */
			num_pages =
			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
				  reiserfs_can_fit_pages(inode->i_sb));
			/* We should not forget to set the size in bytes accordingly */
			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE - 1));
			/* If the position is not on the
			   start of the page, we need
			   to subtract the offset
			   within the page */
		} else
			write_bytes = count;

		/* reserve the blocks to be allocated later, so that later on
		   we still have the space to write the blocks to */
		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
						      num_pages <<
						      (PAGE_CACHE_SHIFT -
						       inode->i_blkbits));
		reiserfs_write_unlock(inode->i_sb);

		if (!num_pages) {	/* If we do not have enough space even for a single page... */
			if (pos >
			    inode->i_size + inode->i_sb->s_blocksize -
			    (pos & (inode->i_sb->s_blocksize - 1))) {
				res = -ENOSPC;
				break;	// In case we are writing past the end of the last file block, break.
			}
			// Otherwise we are possibly overwriting the file, so
			// let's set the write size to at most a blocksize.
			// This way we get it correct for file holes.
			// But overwriting files on absolutely full volumes would not
			// be very efficient. Well, people are not supposed to fill
			// 100% of disk space anyway.
			write_bytes =
			    min_t(size_t, count,
				  inode->i_sb->s_blocksize -
				  (pos & (inode->i_sb->s_blocksize - 1)));
			num_pages = 1;
			// No blocks were claimed before, so do it now.
			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
							      1 <<
							      (PAGE_CACHE_SHIFT
							       -
							       inode->i_blkbits));
		}

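		/* E.g., with a 4k block size and pos == 5000, this shortens the
		   write to min(count, 4096 - 904) == at most 3192 bytes, so a
		   single block of an existing file can still be overwritten on
		   a nearly full volume. */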
		/* Prepare for writing into the region: read in all the
		   partially overwritten pages, if needed, and lock the pages,
		   so that nobody else can access these until we are done.
		   We get the number of blocks actually needed as a result. */
		blocks_to_allocate =
		    reiserfs_prepare_file_region_for_write(inode, pos,
							   num_pages,
							   write_bytes,
							   prepared_pages);
		if (blocks_to_allocate < 0) {
			res = blocks_to_allocate;
			reiserfs_release_claimed_blocks(inode->i_sb,
							num_pages <<
							(PAGE_CACHE_SHIFT -
							 inode->i_blkbits));
			break;
		}

		/* First we correct our estimate of how many blocks we need */
		reiserfs_release_claimed_blocks(inode->i_sb,
						(num_pages <<
						 (PAGE_CACHE_SHIFT -
						  inode->i_sb->
						  s_blocksize_bits)) -
						blocks_to_allocate);

		if (blocks_to_allocate > 0) {	/* We only allocate blocks if we need to */
			/* Fill in all the possible holes and append the file if needed */
			res =
			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
								num_pages,
								write_bytes,
								prepared_pages,
								blocks_to_allocate);
		}

		/* well, we have allocated the blocks, so it is time to free
		   the reservation we made earlier. */
		reiserfs_release_claimed_blocks(inode->i_sb,
						blocks_to_allocate);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

		/* Copy data from the user-supplied buffer to the file's pages */
		res =
		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
							   write_bytes,
							   prepared_pages, buf);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

		/* Send the pages to disk and unlock them. */
		res =
		    reiserfs_submit_file_region_for_write(&th, inode, pos,
							  num_pages,
							  write_bytes,
							  prepared_pages);
		if (res)
			break;

		already_written += write_bytes;
		buf += write_bytes;
		*ppos = pos += write_bytes;
		count -= write_bytes;
		balance_dirty_pages_ratelimited(inode->i_mapping);
	}

	/* this is only true on error */
	if (th.t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
		reiserfs_write_unlock(inode->i_sb);
		if (err) {
			res = err;
			goto out;
		}
	}

	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
		res =
		    generic_osync_inode(inode, file->f_mapping,
					OSYNC_METADATA | OSYNC_DATA);

	up(&inode->i_sem);
	reiserfs_async_progress_wait(inode->i_sb);
	return (already_written != 0) ? already_written : res;

      out:
	up(&inode->i_sem);	// unlock the file on exit.
	return res;
}

static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
				  size_t count, loff_t pos)
{
	return generic_file_aio_write(iocb, buf, count, pos);
}

struct file_operations reiserfs_file_operations = {
	.read = generic_file_read,
	.write = reiserfs_file_write,
	.ioctl = reiserfs_ioctl,
	.mmap = generic_file_mmap,
	.release = reiserfs_file_release,
	.fsync = reiserfs_sync_file,
	.sendfile = generic_file_sendfile,
	.aio_read = generic_file_aio_read,
	.aio_write = reiserfs_aio_write,
};

struct inode_operations reiserfs_file_inode_operations = {
	.truncate = reiserfs_vfs_truncate_file,
	.setattr = reiserfs_setattr,
	.setxattr = reiserfs_setxattr,
	.getxattr = reiserfs_getxattr,
	.listxattr = reiserfs_listxattr,
	.removexattr = reiserfs_removexattr,
	.permission = reiserfs_permission,
};