/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */


#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
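/* Illustration (not from the original source, assuming 4k blocks and the
   default tail policy): a 600-byte file written with a single write() has
   its tail placed in an unformatted node during the write and converted
   back to a direct item in reiserfs_file_release() below, paying one extra
   copy; a file built up by many small write() calls pays that conversion
   only once, on the final close, instead of rebalancing on every write. */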
static int reiserfs_file_release (struct inode * inode, struct file * filp)
{

    struct reiserfs_transaction_handle th ;
    int err;
    int jbegin_failure = 0;

    if (!S_ISREG (inode->i_mode))
	BUG ();

    /* fast out for when nothing needs to be done */
    if ((atomic_read(&inode->i_count) > 1 ||
	!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
         !tail_has_to_be_packed(inode))       &&
	REISERFS_I(inode)->i_prealloc_count <= 0) {
	return 0;
    }

    reiserfs_write_lock(inode->i_sb);
    down (&inode->i_sem);
    /* freeing preallocation only involves relogging blocks that
     * are already in the current transaction.  preallocation gets
     * freed at the end of each transaction, so it is impossible for
     * us to log any additional blocks (including quota blocks)
     */
    err = journal_begin(&th, inode->i_sb, 1);
    if (err) {
	/* uh oh, we can't allow the inode to go away while there
	 * are still preallocation blocks pending.  Try to join the
	 * aborted transaction
	 */
	jbegin_failure = err;
	err = journal_join_abort(&th, inode->i_sb, 1);

	if (err) {
	    /* hmpf, our choices here aren't good.  We can pin the inode
	     * which will disallow unmount from ever happening, we can
	     * do nothing, which will corrupt random memory on unmount,
	     * or we can forcibly remove the file from the preallocation
	     * list, which will leak blocks on disk.  Let's pin the inode
	     * and let the admin know what is going on.
	     */
	    igrab(inode);
	    reiserfs_warning(inode->i_sb, "pinning inode %lu because the "
	                     "preallocation can't be freed", inode->i_ino);
	    goto out;
	}
    }
    reiserfs_update_inode_transaction(inode) ;

#ifdef REISERFS_PREALLOCATE
    reiserfs_discard_prealloc (&th, inode);
#endif
    err = journal_end(&th, inode->i_sb, 1);

    /* copy back the error code from journal_begin */
    if (!err)
        err = jbegin_failure;

    if (!err && atomic_read(&inode->i_count) <= 1 &&
	(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
        tail_has_to_be_packed (inode)) {
	/* if the regular file is released by its last holder and it has
	   been appended (we append by unformatted node only) or its direct
	   item(s) had to be converted, then it may have to be
	   indirect2direct converted */
	err = reiserfs_truncate_file(inode, 0) ;
    }
out:
    up (&inode->i_sem);
    reiserfs_write_unlock(inode->i_sb);
    return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode) {
    reiserfs_truncate_file(inode, 1) ;
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(
			      struct file   * p_s_filp,
			      struct dentry * p_s_dentry,
			      int datasync
			      ) {
  struct inode * p_s_inode = p_s_dentry->d_inode;
  int n_err;
  int barrier_done;

  if (!S_ISREG(p_s_inode->i_mode))
      BUG ();
  n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
  reiserfs_write_lock(p_s_inode->i_sb);
  barrier_done = reiserfs_commit_for_inode(p_s_inode);
  reiserfs_write_unlock(p_s_inode->i_sb);
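  /* Gloss (added, inferred from the checks below): reiserfs_commit_for_inode()
   * is understood to return 1 when the commit already reached disk through a
   * write barrier, making the explicit cache flush redundant, and a negative
   * value on error, which is propagated to the caller. */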
  if (barrier_done != 1)
      blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
  if (barrier_done < 0)
    return barrier_done;
  return ( n_err < 0 ) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
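/* Example (assuming PAGE_CACHE_SIZE == 4096): the macro evaluates to 32,
   so each pass of the main loop in reiserfs_file_write() below prepares,
   allocates and submits at most 128k of data. */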

/* Allocates blocks for a file to fulfil write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated blocknumbers as needed */
static int reiserfs_allocate_blocks_for_region(
				struct reiserfs_transaction_handle *th,
				struct inode *inode, /* Inode we work with */
				loff_t pos, /* Writing position */
				int num_pages, /* number of pages the write is
						  going to touch */
				int write_bytes, /* amount of bytes to write */
				struct page **prepared_pages, /* array of
							         prepared pages
							       */
				int blocks_to_allocate /* Amount of blocks we
							  need to allocate to
							  fit the data into file
							 */
				)
{
    struct cpu_key key; // cpu key of item that we are going to deal with
    struct item_head *ih; // pointer to item head that we are going to deal with
    struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
    __u32 * item; // pointer to item we are going to deal with
    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
    b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
    reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
    int res; // return value of various functions that we call; int, not
	     // size_t, because it also carries negative error codes.
    int curr_block; // current block used to keep track of unmapped blocks.
    int i; // loop counter
    int itempos; // position in item
    unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
						       // first page
    unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
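    /* Worked example (added, assuming 4k pages): for pos == 5000 and
       write_bytes == 3000, from == 5000 & 4095 == 904 and
       to == (7999 & 4095) + 1 == 3904, i.e. bytes 904..3903 of the single
       affected page are overwritten. */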
    __u64 hole_size ; // amount of blocks for a file hole, if one needs to be created.
    int modifying_this_item = 0; // Flag for items traversal code to keep track
				 // of the fact that we already prepared
				 // current block for journal
    int will_prealloc = 0;
    RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");

    /* only preallocate if this is a small write */
    if (REISERFS_I(inode)->i_prealloc_count ||
       (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
        blocks_to_allocate <
        REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
        will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

    allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
    					sizeof(b_blocknr_t), GFP_NOFS);
    if (!allocated_blocks) // bail out if the allocation failed; nothing
        return -ENOMEM;    // needs to be undone at this point.

    /* First we compose a key to point at the writing position, we want to do
       that outside of any locking region. */
    make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
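    /* Note (added): reiserfs byte offsets in keys are 1-based (the first
       byte of the file body is at offset 1, offset 0 being the stat data),
       hence the pos+1 above. */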

    /* If we came here, it means we absolutely need to open a transaction,
       since we need to allocate some blocks */
    reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I knew if this number is enough
    if (res)
        goto error_exit;
    reiserfs_update_inode_transaction(inode) ;

    /* Look for the in-tree position of our write, need path for block allocator */
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
	res = -EIO;
	goto error_exit;
    }

    /* Allocate blocks */
    /* First fill in "hint" structure for block allocator */
    hint.th = th; // transaction handle.
    hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
    hint.inode = inode; // Inode is needed by block allocator too.
    hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
    hint.key = key.on_disk_key; // on disk key of file.
    hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
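    // Note (added): i_blocks counts 512-byte sectors, so shifting right by
    // (s_blocksize_bits - 9) converts it to filesystem blocks, e.g.
    // i_blocks == 24 with 4k blocks gives 24 >> 3 == 3 blocks.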
    hint.formatted_node = 0; // We are allocating blocks for unformatted node.
    hint.preallocate = will_prealloc;

    /* Call block allocator to allocate blocks */
    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
    if ( res != CARRY_ON ) {
	if ( res == NO_DISK_SPACE ) {
	    /* We flush the transaction in case of no space. This way some
	       blocks might become free */
	    SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
	    res = restart_transaction(th, inode, &path);
            if (res)
                goto error_exit;

	    /* We might have scheduled, so search again */
	    res = search_for_position_by_key(inode->i_sb, &key, &path);
	    if ( res == IO_ERROR ) {
		res = -EIO;
		goto error_exit;
	    }

	    /* update changed info for hint structure. */
	    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
	    if ( res != CARRY_ON ) {
		res = -ENOSPC;
		pathrelse(&path);
		goto error_exit;
	    }
	} else {
	    res = -ENOSPC;
	    pathrelse(&path);
	    goto error_exit;
	}
    }

#ifdef __BIG_ENDIAN
        // Too bad, I have not found any way to convert a given region from
        // cpu format to little endian format
    {
        int i;
        for ( i = 0; i < blocks_to_allocate ; i++)
            allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
    }
#endif
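    // Gloss (added): indirect items store block numbers little-endian on
    // disk, so on big-endian CPUs the freshly allocated numbers are
    // converted once here; they are converted back with le32_to_cpu()
    // when the buffers are mapped near the end of this function.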

    /* Block allocation may well have scheduled and the tree may have
       changed, so let's search the tree again */
    /* find where in the tree our write should go */
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
	res = -EIO;
	goto error_exit_free_blocks;
    }

    bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
    ih = get_ih( &path );      // Get a pointer to last item head in path.
    item = get_item( &path );  // Get a pointer to last item in path

    /* Let's see what we have found */
    if ( res != POSITION_FOUND ) { /* position not found, this means that we
				      might need to append file with holes
				      first */
	// Since we are writing past the file's end, we need to find out if
	// there is a hole that needs to be inserted before our writing
	// position, and how many blocks it is going to cover (we need to
	// populate pointers to file blocks representing the hole with zeros)

	{
	    int item_offset = 1;
	    /*
	     * if ih is stat data, its offset is 0 and we don't want to
	     * add 1 to pos in the hole_size calculation
	     */
	    if (is_statdata_le_ih(ih))
	        item_offset = 0;
	    hole_size = (pos + item_offset -
	            (le_key_k_offset( get_inode_item_key_version(inode),
		    &(ih->ih_key)) +
		    op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
		    inode->i_sb->s_blocksize_bits;
	}
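	/* Worked example (added, assuming 4k blocks): if the last indirect
	   item starts at key offset 1 and covers 8192 bytes (two blocks),
	   existing data ends at byte offset 8192; writing at pos == 20480
	   gives hole_size == (20481 - 8193) >> 12 == 3 blocks of hole to
	   fill with zero pointers before the write. */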

	if ( hole_size > 0 ) {
	    int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
	    /* area filled with zeroes, to supply as list of zero blocknumbers.
	       We allocate it outside of the loop just in case the loop spins
	       for several iterations. */
	    char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
	    if ( !zeros ) {
		res = -ENOMEM;
		goto error_exit_free_blocks;
	    }
	    memset ( zeros, 0, to_paste*UNFM_P_SIZE);
	    do {
		to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
		if ( is_indirect_le_ih(ih) ) {
		    /* Ok, there is an existing indirect item already. Need to append it */
		    /* Calculate position past inserted item */
		    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
		    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
		    if ( res ) {
			kfree(zeros);
			goto error_exit_free_blocks;
		    }
		} else if ( is_statdata_le_ih(ih) ) {
		    /* No existing item, create it */
		    /* item head for new item */
		    struct item_head ins_ih;

		    /* create a key for our new item */
		    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);

		    /* Create new item head for our new item */
		    make_le_item_head (&ins_ih, &key, key.version, 1,
				       TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
				       0 /* free space */);

		    /* Find where such item should live in the tree */
		    res = search_item (inode->i_sb, &key, &path);
		    if ( res != ITEM_NOT_FOUND ) {
			/* item should not exist, otherwise we have error */
			if ( res != -ENOSPC ) {
			    reiserfs_warning (inode->i_sb,
				"green-9008: search_by_key (%K) returned %d",
					      &key, res);
			}
			res = -EIO;
		        kfree(zeros);
			goto error_exit_free_blocks;
		    }
		    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
		} else {
		    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
		}
		if ( res ) {
		    kfree(zeros);
		    goto error_exit_free_blocks;
		}
		/* Now we want to check if the transaction is too full, and if
		   it is we restart it. This will also free the path. */
		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
		    res = restart_transaction(th, inode, &path);
                    if (res) {
                        pathrelse (&path);
                        kfree(zeros);
                        goto error_exit;
                    }
                }

		/* Well, need to recalculate path and stuff */
		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
		res = search_for_position_by_key(inode->i_sb, &key, &path);
		if ( res == IO_ERROR ) {
		    res = -EIO;
		    kfree(zeros);
		    goto error_exit_free_blocks;
		}
		bh=get_last_bh(&path);
		ih=get_ih(&path);
		item = get_item(&path);
		hole_size -= to_paste;
	    } while ( hole_size );
	    kfree(zeros);
	}
    }

    // Go through existing indirect items first
    // replace all zeroes with blocknumbers from list
    // Note that if no corresponding item was found by the previous search,
    // it means there is no existing in-tree representation for the file area
    // we are going to overwrite, so there is nothing to scan through for holes.
    for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
retry:

	if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
	    /* We ran out of data in this indirect item, let's look for another
	       one. */
	    /* First if we are already modifying current item, log it */
	    if ( modifying_this_item ) {
		journal_mark_dirty (th, inode->i_sb, bh);
		modifying_this_item = 0;
	    }
	    /* Then set the key to look for a new indirect item (offset of old
	       item is added to old item length) */
	    set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
	    /* Search for position of new key in the tree. */
	    res = search_for_position_by_key(inode->i_sb, &key, &path);
	    if ( res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	    }
	    bh=get_last_bh(&path);
	    ih=get_ih(&path);
	    item = get_item(&path);
	    itempos = path.pos_in_item;
	    continue; // loop to check all kinds of conditions and so on.
	}
	/* Ok, we have correct position in item now, so let's see if it is
	   representing a file hole (blocknumber is zero) and fill it if needed */
	if ( !item[itempos] ) {
	    /* Ok, a hole. Now we need to check if we already prepared this
	       block to be journaled */
	    while ( !modifying_this_item ) { // loop until we succeed
		/* Well, this item is not journaled yet, so we must prepare
		   it for journal first, before we can change it */
		struct item_head tmp_ih; // We copy item head of found item,
					 // here to detect if fs changed under
					 // us while we were preparing for
					 // journal.
		int fs_gen; // We store fs generation here to find if someone
			    // changes fs under our feet

		copy_item_head (&tmp_ih, ih); // Remember itemhead
		fs_gen = get_generation (inode->i_sb); // remember fs generation
		reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
		if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
		    // Sigh, fs was changed under us, we need to look for new
		    // location of item we are working with

		    /* unmark prepared area as journaled and search for its
		       new position */
		    reiserfs_restore_prepared_buffer(inode->i_sb, bh);
		    res = search_for_position_by_key(inode->i_sb, &key, &path);
		    if ( res == IO_ERROR) {
			res = -EIO;
			goto error_exit_free_blocks;
		    }
		    bh=get_last_bh(&path);
		    ih=get_ih(&path);
		    item = get_item(&path);
		    itempos = path.pos_in_item;
		    goto retry;
		}
		modifying_this_item = 1;
	    }
	    item[itempos] = allocated_blocks[curr_block]; // Assign new block
	    curr_block++;
	}
	itempos++;
    }

    if ( modifying_this_item ) { // We need to log last-accessed block, if it
				 // was modified, but not logged yet.
	journal_mark_dirty (th, inode->i_sb, bh);
    }

    if ( curr_block < blocks_to_allocate ) {
	// Oh well, we need to append to an indirect item, or to create one
	// if there wasn't any
	if ( is_indirect_le_ih(ih) ) {
	    // Existing indirect item - append. First calculate key for append
	    // position. We do not need to recalculate path as it should
	    // already point to correct place.
	    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
	    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
	    if ( res ) {
		goto error_exit_free_blocks;
	    }
	} else if (is_statdata_le_ih(ih) ) {
	    // Last found item was statdata. That means we need to create indirect item.
	    struct item_head ins_ih; /* itemhead for new item */

	    /* create a key for our new item */
	    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
							    // because that's
							    // where first
							    // indirect item
							    // begins
	    /* Create new item head for our new item */
	    make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
			       (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
			       0 /* free space */);
	    /* Find where such item should live in the tree */
	    res = search_item (inode->i_sb, &key, &path);
	    if ( res != ITEM_NOT_FOUND ) {
		/* Well, if we have found such item already, or some error
		   occurred, we need to warn user and return error */
		if ( res != -ENOSPC ) {
		    reiserfs_warning (inode->i_sb,
				      "green-9009: search_by_key (%K) "
				      "returned %d", &key, res);
		}
		res = -EIO;
		goto error_exit_free_blocks;
	    }
	    /* Insert item into the tree with the data as its body */
	    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
	} else {
	    reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
	}
    }

    // the caller is responsible for closing the transaction
    // unless we return an error, they are also responsible for logging
    // the inode.
    //
    pathrelse(&path);
    /*
     * cleanup preallocation from previous writes
     * if this is a partial block write
     */
    if (write_bytes & (inode->i_sb->s_blocksize -1))
        reiserfs_discard_prealloc(th, inode);
    reiserfs_write_unlock(inode->i_sb);

    // go through all the pages/buffers and map the buffers to newly allocated
    // blocks (so that system knows where to write these pages later).
    curr_block = 0;
    for ( i = 0; i < num_pages ; i++ ) {
	struct page *page=prepared_pages[i]; //current page
	struct buffer_head *head; // first buffer for a page
	int block_start, block_end; // in-page offsets for buffers.

	if (!page_has_buffers(page)) // check before use; page_buffers()
				     // asserts that the page has buffers.
	    reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
	head = page_buffers(page);

	/* For each buffer in page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
	    if (!bh)
		reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
	    block_end = block_start+inode->i_sb->s_blocksize;
	    if (i == 0 && block_end <= from )
		/* if this buffer is before requested data to map, skip it */
		continue;
	    if (i == num_pages - 1 && block_start >= to)
		/* If this buffer is after requested data to map, abort
		   processing of current page */
		break;

	    if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
		map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
		curr_block++;
		set_buffer_new(bh);
	    }
	}
    }

    RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");

    kfree(allocated_blocks);
    return 0;

// Need to deal with transaction here.
error_exit_free_blocks:
    pathrelse(&path);
    // free blocks
    for( i = 0; i < blocks_to_allocate; i++ )
	reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);

error_exit:
    if (th->t_trans_id) {
        int err;
        // update any changes we made to blk count
        reiserfs_update_sd(th, inode);
        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS);
        if (err)
            res = err;
    }
    reiserfs_write_unlock(inode->i_sb);
    kfree(allocated_blocks);

    return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
			      size_t num_pages /* number of pages */) {
    int i; // loop counter

    for (i=0; i < num_pages ; i++) {
	struct page *page = prepared_pages[i];

	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
    }
}

/* This function will copy data from userspace to specified pages within
   supplied byte range */
static int reiserfs_copy_from_user_to_file_region(
				loff_t pos, /* In-file position */
				int num_pages, /* Number of pages affected */
				int write_bytes, /* Amount of bytes to write */
				struct page **prepared_pages, /* pointer to
								 array of
								 prepared pages
								*/
				const char __user *buf /* Pointer to user-supplied
						   data*/
				)
{
    long page_fault=0; // status of copy_from_user.
    int i; // loop counter.
    int offset; // offset in page

    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
	size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
	struct page *page=prepared_pages[i]; // Current page we process.

	fault_in_pages_readable( buf, count);

	/* Copy data from userspace to the current page */
	kmap(page);
	page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
	/* Flush processor's dcache for this page */
	flush_dcache_page(page);
	kunmap(page);
	buf+=count;
	write_bytes-=count;

	if (page_fault)
	    break; // Was there a fault? abort.
    }

    return page_fault?-EFAULT:0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
    unsigned block_start, block_end;
    int partial = 0;
    unsigned blocksize;
    struct buffer_head *bh, *head;
    unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
    int new;
    int logit = reiserfs_file_data_log(inode);
    struct super_block *s = inode->i_sb;
    int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
    struct reiserfs_transaction_handle th;
    int ret = 0;

    th.t_trans_id = 0;
    blocksize = 1 << inode->i_blkbits;

    if (logit) {
	reiserfs_write_lock(s);
	ret = journal_begin(&th, s, bh_per_page + 1);
	if (ret)
	    goto drop_write_lock;
	reiserfs_update_inode_transaction(inode);
    }
    for(bh = head = page_buffers(page), block_start = 0;
        bh != head || !block_start;
	block_start=block_end, bh = bh->b_this_page)
    {

	new = buffer_new(bh);
	clear_buffer_new(bh);
	block_end = block_start + blocksize;
	if (block_end <= from || block_start >= to) {
	    if (!buffer_uptodate(bh))
		    partial = 1;
	} else {
	    set_buffer_uptodate(bh);
	    if (logit) {
		reiserfs_prepare_for_journal(s, bh, 1);
		journal_mark_dirty(&th, s, bh);
	    } else if (!buffer_dirty(bh)) {
		mark_buffer_dirty(bh);
		/* do data=ordered on any page past the end
		 * of file and any buffer marked BH_New.
		 */
		if (reiserfs_data_ordered(inode->i_sb) &&
		    (new || page->index >= i_size_index)) {
		    reiserfs_add_ordered_list(inode, bh);
	        }
	    }
	}
    }
    if (logit) {
	ret = journal_end(&th, s, bh_per_page + 1);
drop_write_lock:
	reiserfs_write_unlock(s);
    }
    /*
     * If this is a partial write which happened to make all buffers
     * uptodate then we can optimize away a bogus readpage() for
     * the next read(). Here we 'discover' whether the page went
     * uptodate as a result of this (potentially partial) write.
     */
    if (!partial)
	SetPageUptodate(page);
    return ret;
}


/* Submit pages for write. This was separated from actual file copying
   because we might want to allocate block numbers in-between.
   This function assumes that caller will adjust file size to correct value. */
static int reiserfs_submit_file_region_for_write(
				struct reiserfs_transaction_handle *th,
				struct inode *inode,
				loff_t pos, /* Writing position offset */
				size_t num_pages, /* Number of pages to write */
				size_t write_bytes, /* number of bytes to write */
				struct page **prepared_pages /* list of pages */
				)
{
    int status; // return status of block_commit_write.
    int retval = 0; // Return value we are going to return.
    int i; // loop counter
    int offset; // Writing offset in page.
    int orig_write_bytes = write_bytes;
    int sd_update = 0;

    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
	int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
	struct page *page=prepared_pages[i]; // Current page we process.

	status = reiserfs_commit_page(inode, page, offset, offset+count);
	if ( status )
	    retval = status; // To not overcomplicate matters we are going to
			     // submit all the pages even if there was error.
			     // we only remember error status to report it on
			     // exit.
	write_bytes-=count;
    }
    /* now that we've gotten all the ordered buffers marked dirty,
     * we can safely update i_size and close any running transaction
     */
    if ( pos + orig_write_bytes > inode->i_size) {
	inode->i_size = pos + orig_write_bytes; // Set new size
	/* If the file has grown so much that tail packing is no
	 * longer possible, reset "need to pack" flag */
	if ( (have_large_tails (inode->i_sb) &&
	      inode->i_size > i_block_size (inode)*4) ||
	     (have_small_tails (inode->i_sb) &&
	     inode->i_size > i_block_size(inode)) )
	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
        else if ( (have_large_tails (inode->i_sb) &&
	          inode->i_size < i_block_size (inode)*4) ||
	          (have_small_tails (inode->i_sb) &&
		  inode->i_size < i_block_size(inode)) )
	    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
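	/* Worked example (added, assuming 4k blocks and the large-tails
	 * mount option): a file growing past 16k (4 * blocksize) loses the
	 * pack-on-close flag above, while one still under 16k has it set
	 * again by the else branch. */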

	if (th->t_trans_id) {
	    reiserfs_write_lock(inode->i_sb);
	    reiserfs_update_sd(th, inode); // And update on-disk metadata
	    reiserfs_write_unlock(inode->i_sb);
	} else
	    inode->i_sb->s_op->dirty_inode(inode);

        sd_update = 1;
    }
    if (th->t_trans_id) {
	reiserfs_write_lock(inode->i_sb);
	if (!sd_update)
	    reiserfs_update_sd(th, inode);
	status = journal_end(th, th->t_super, th->t_blocks_allocated);
        if (status)
            retval = status;
	reiserfs_write_unlock(inode->i_sb);
    }
    th->t_trans_id = 0;

    /*
     * we have to unlock the pages after updating i_size, otherwise
     * we race with writepage
     */
    for ( i = 0; i < num_pages ; i++) {
	struct page *page=prepared_pages[i];
	unlock_page(page);
	mark_page_accessed(page);
	page_cache_release(page);
    }
    return retval;
}

/* Check whether the write region is going to touch the file's tail
   (if it is present). And if it is, convert the tail to unformatted node */
static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
					 loff_t pos, /* Writing position */
					 int write_bytes /* amount of bytes to write */
				        )
{
    INITIALIZE_PATH(path); // needed for search_for_position
    struct cpu_key key; // Key that would represent last touched writing byte.
    struct item_head *ih; // item header of found block;
    int res; // Return value of various functions we call.
    int cont_expand_offset; // We will put offset for generic_cont_expand here
			    // This can be int just because tails are created
			    // only for small files.

    /* this embodies a dependency on a particular tail policy */
    if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
	/* such big files do not have tails, so we won't bother
	   looking for one; simply return */
	return 0;
    }

    reiserfs_write_lock(inode->i_sb);
    /* find the item containing the last byte to be written, or if
     * writing past the end of the file then the last item of the
     * file (and then we check its type). */
    make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
        reiserfs_write_unlock(inode->i_sb);
	return -EIO;
    }
    ih = get_ih(&path);
    res = 0;
    if ( is_direct_le_ih(ih) ) {
	/* Ok, closest item is file tail (tails are stored in "direct"
	 * items), so we need to unpack it. */
	/* To not overcomplicate matters, we just call generic_cont_expand
	   which will in turn call other stuff and finally will boil down to
	   reiserfs_get_block() that would do the necessary conversion. */
	cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
	pathrelse(&path);
	res = generic_cont_expand( inode, cont_expand_offset);
    } else
	pathrelse(&path);

    reiserfs_write_unlock(inode->i_sb);
    return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in
   @prepared_pages array. Also buffers are allocated for these pages.
   First and last page of the region are read in if they are only
   partially overwritten. If the last page did not exist before the
   write (file hole or file append), it is zeroed instead.
   Returns number of unallocated blocks that should be allocated to cover
   new file data.*/
static int reiserfs_prepare_file_region_for_write(
				struct inode *inode /* Inode of the file */,
				loff_t pos, /* position in the file */
				size_t num_pages, /* number of pages to
					          prepare */
				size_t write_bytes, /* Amount of bytes to be
						    overwritten from
						    @pos */
				struct page **prepared_pages /* pointer to array
							       where to store
							       prepared pages */
					   )
{
    int res=0; // Return values of different functions we call.
    unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
    int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
    int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
					 /* offset of last modified byte in last
				            page */
    struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
    int i; // Simple counter
    int blocks = 0; /* Return value (blocks that should be allocated) */
    struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
				   // of a page.
    unsigned block_start, block_end; // Starting and ending offsets of current
				     // buffer in the page.
    struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if
						 // Page appeared to be not up
						 // to date. Note how we have
						 // at most 2 buffers, this is
						 // because we at most may
						 // partially overwrite two
						 // buffers for one page. One at
						 // the beginning of write area
						 // and one at the end.
						 // Everything in the middle
						 // gets overwritten totally.

    struct cpu_key key; // cpu key of item that we are going to deal with
    struct item_head *ih = NULL; // pointer to item head that we are going to deal with
    struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
    __u32 * item=NULL; // pointer to item we are going to deal with
    int item_pos=-1; /* Position in indirect item */


    if ( num_pages < 1 ) {
	reiserfs_warning (inode->i_sb,
			  "green-9001: reiserfs_prepare_file_region_for_write "
			  "called with zero number of pages to process");
	return -EFAULT;
    }

    /* We have 2 loops for pages. In first loop we grab and lock the pages, so
       that nobody would touch these until we release the pages. Then
       we'd start to deal with mapping buffers to blocks. */
    for ( i = 0; i < num_pages; i++) {
	prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
	if ( !prepared_pages[i]) {
	    res = -ENOMEM;
	    goto failed_page_grabbing;
	}
	if (!page_has_buffers(prepared_pages[i]))
	    create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
    }

    /* Let's count the number of blocks for the case where all the blocks
       overwritten are new (we will subtract already allocated blocks later)*/
    if ( num_pages > 2 )
	/* These pages are fully overwritten, so all their blocks are
	   counted as needing to be allocated */
	blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

    /* count blocks needed for first page (possibly partially written) */
    blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
	   !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */

    /* Now we account for last page. If last page == first page (we
       overwrite only one page), we subtract all the blocks past the
       last writing position in a page out of already calculated number
       of blocks */
    blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
	   ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	   /* Note how we do not roundup here since partial blocks still
		   should be allocated */
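    /* Worked example (added, assuming 1k blocks and 4k pages): for a single
       page with from == 904 and to == 3904, the first-page term gives
       (4096 - 904) >> 10 == 3 full blocks plus 1 for the partial leading
       block, and the last-page term subtracts (4096 - 3904) >> 10 == 0,
       so blocks == 4, matching the four 1k blocks touched by bytes
       904..3903. */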

    /* Now if all the write area lies past the file end, there is no point in
       mapping blocks, since there are none, so we just zero out remaining
       parts of first and last pages in write area (if needed) */
    if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
	if ( from != 0 ) {/* First page needs to be partially zeroed */
	    char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
	    memset(kaddr, 0, from);
	    kunmap_atomic( kaddr, KM_USER0);
	}
	if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
	    char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
	    memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
	    kunmap_atomic( kaddr, KM_USER0);
	}

	/* Since all blocks are new - use already calculated value */
	return blocks;
    }

    /* Well, since we write somewhere into the middle of a file, there is a
       possibility we are writing over some already allocated blocks, so
       let's map these blocks and subtract the number of such blocks from the
       number we need to allocate (calculated above) */
    /* Mask write position to start on blocksize, we do it out of the
       loop for performance reasons */
    pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
    /* Set cpu key to the starting position in a file (on left block boundary)*/
    make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
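    /* Note (added): the inner mask is redundant, pos was already aligned to
       a block boundary two lines above; the "1 +" is there because reiserfs
       key offsets are 1-based. */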

    reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
    for ( i = 0; i < num_pages ; i++ ) {

	head = page_buffers(prepared_pages[i]);
	/* For each buffer in the page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
		/* Find where this buffer ends */
		block_end = block_start+inode->i_sb->s_blocksize;
		if (i == 0 && block_end <= from )
		    /* if this buffer is before requested data to map, skip it*/
		    continue;

		if (i == num_pages - 1 && block_start >= to) {
		    /* If this buffer is after requested data to map, abort
		       processing of current page */
		    break;
		}

		if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
		    /* This is an optimisation for the case where the buffer
		       is mapped and has a blocknumber assigned. When a
		       significant number of such buffers are present, we may
		       avoid some amount of search_by_key calls.
		       Probably it would be possible to move parts of this code
		       out of BKL, but I am afraid that would overcomplicate
		       the code without any noticeable benefit.
		    */
		    item_pos++;
		    /* Update the key */
		    set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
		    blocks--; // Decrease the amount of blocks that need to be
			      // allocated
		    continue; // Go to the next buffer
		}

		if ( !itembuf || /* if first iteration */
		     item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
					     { /* or if we progressed past the
						  current unformatted_item */
			/* Try to find next item */
			res = search_for_position_by_key(inode->i_sb, &key, &path);
			/* Abort if no more items */
			if ( res != POSITION_FOUND ) {
			    /* make sure later loops don't use this item */
			    itembuf = NULL;
			    item = NULL;
			    break;
			}

			/* Update information about current indirect item */
			itembuf = get_last_bh( &path );
			ih = get_ih( &path );
			item = get_item( &path );
			item_pos = path.pos_in_item;

			RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
		}

		/* See if there is some block associated with the file
		   at that position, map the buffer to this block */
		if ( get_block_num(item,item_pos) ) {
		    map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
		    blocks--; // Decrease the amount of blocks that need to be
			      // allocated
		}
		item_pos++;
		/* Update the key */
		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
	}
    }
    pathrelse(&path); // Free the path
    reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of
	   write area or issue read requests if page is mapped. */
	/* First page, see if it is not uptodate */
	if ( !PageUptodate(prepared_pages[0]) ) {
	    head = page_buffers(prepared_pages[0]);

	    /* For each buffer in page */
	    for(bh = head, block_start = 0; bh != head || !block_start;
		block_start=block_end, bh = bh->b_this_page) {

		if (!bh)
		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
		/* Find where this buffer ends */
		block_end = block_start+inode->i_sb->s_blocksize;
		if ( block_end <= from )
		    /* if this buffer is before requested data to map, skip it*/
		    continue;
		if ( block_start < from ) { /* Aha, our partial buffer */
		    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
						  issue a READ request for it
						  to not lose data */
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		    } else { /* Not mapped, zero it */
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
			memset(kaddr+block_start, 0, from-block_start);
			kunmap_atomic( kaddr, KM_USER0);
			set_buffer_uptodate(bh);
		    }
		}
	    }
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
	if ( !PageUptodate(prepared_pages[num_pages-1]) ||
	    ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
	    head = page_buffers(prepared_pages[num_pages-1]);

	    /* for each buffer in page */
	    for(bh = head, block_start = 0; bh != head || !block_start;
		block_start=block_end, bh = bh->b_this_page) {

		if (!bh)
		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
		/* Find where this buffer ends */
		block_end = block_start+inode->i_sb->s_blocksize;
		if ( block_start >= to )
		    /* if this buffer is after requested data to map, skip it*/
		    break;
		if ( block_end > to ) { /* Aha, our partial buffer */
		    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
						  issue a READ request for it
						  to not lose data */
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		    } else { /* Not mapped, zero it */
			char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
			memset(kaddr+to, 0, block_end-to);
			kunmap_atomic( kaddr, KM_USER0);
			set_buffer_uptodate(bh);
		    }
		}
	    }
	}

    /* Wait for read requests we made to happen, if necessary */
    while(wait_bh > wait) {
	wait_on_buffer(*--wait_bh);
	if (!buffer_uptodate(*wait_bh)) {
	    res = -EIO;
	    goto failed_read;
	}
    }

    return blocks;
failed_page_grabbing:
    num_pages = i;
failed_read:
    reiserfs_unprepare_pages(prepared_pages, num_pages);
    return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
     operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
     to reiserfs which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
   things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
                             const char __user *buf, /* pointer to user supplied data
                                                        (in userspace) */
                             size_t count, /* amount of bytes to write */
                             loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
                                           * new current position before returning. */ )
{
    size_t already_written = 0; // Number of bytes already written to the file.
    loff_t pos; // Current position in the file.
    ssize_t res; // return value of various functions that we call.
    int err = 0;
    struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
				/* To simplify coding at this time, we store
				   locked pages in array for now */
    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
    struct reiserfs_transaction_handle th;
    th.t_trans_id = 0;

    if ( file->f_flags & O_DIRECT) { // Direct IO needs special treatment
	ssize_t result, after_file_end = 0;
	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
	    /* If we are appending a file, we need to put this savelink in here.
	       If we crash while doing direct io, finish_unfinished will
	       cut the garbage from the file end. */
	    reiserfs_write_lock(inode->i_sb);
	    err = journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
            if (err) {
		reiserfs_write_unlock (inode->i_sb);
		return err;
	    }
	    reiserfs_update_inode_transaction(inode);
	    add_save_link (&th, inode, 1 /* Truncate */);
	    after_file_end = 1;
	    err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
            reiserfs_write_unlock(inode->i_sb);
	    if (err)
		return err;
	}
	result = generic_file_write(file, buf, count, ppos);

	if ( after_file_end ) { /* Now update i_size and remove the savelink */
	    struct reiserfs_transaction_handle th;
	    reiserfs_write_lock(inode->i_sb);
	    err = journal_begin(&th, inode->i_sb, 1);
            if (err) {
                reiserfs_write_unlock (inode->i_sb);
                return err;
            }
	    reiserfs_update_inode_transaction(inode);
	    reiserfs_update_sd(&th, inode);
	    err = journal_end(&th, inode->i_sb, 1);
            if (err) {
                reiserfs_write_unlock (inode->i_sb);
                return err;
            }
	    err = remove_save_link (inode, 1/* truncate */);
	    reiserfs_write_unlock(inode->i_sb);
            if (err)
                return err;
	}

	return result;
    }

    if ( unlikely((ssize_t) count < 0 ))
        return -EINVAL;

    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
        return -EFAULT;

    down(&inode->i_sem); // locks the entire file for just us

    pos = *ppos;

    /* Check if we can write to specified region of file, file
       is not overly big and this kind of stuff. Adjust pos and
       count, if needed */
    res = generic_write_checks(file, &pos, &count, 0);
    if (res)
	goto out;

    if ( count == 0 )
	goto out;

    res = remove_suid(file->f_dentry);
    if (res)
	goto out;

    inode_update_time(inode, 1); /* Both mtime and ctime */

    // Ok, we are done with all the checks.

    // Now we should start real work

    /* If we are going to write past the file's packed tail or if we are going
       to overwrite part of the tail, we need that tail to be converted into
       unformatted node */
    res = reiserfs_check_for_tail_and_convert( inode, pos, count);
    if (res)
	goto out;

    while ( count > 0) {
	/* This is the main loop in which we run until some error occurs
	   or until we write all of the data. */
	size_t num_pages;/* number of pages we are going to write this iteration */
	size_t write_bytes; /* number of bytes to write during this iteration */
	int blocks_to_allocate; /* how many blocks we need to allocate for
				   this iteration; int, not size_t, because
				   it also carries negative error codes */

        /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for the offset of pos within a page */
	num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
							  pages */
		    ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
						/* convert size to amount of
						   pages */
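	/* Worked example (added, assuming 4k pages): pos == 100 and
	   count == 10000 gives !!(10100 & 4095) + (10100 >> 12) == 1 + 2
	   == 3 pages, covering bytes 100..10099, which span pages 0..2. */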
	reiserfs_write_lock(inode->i_sb);
	if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		|| num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
	    /* If we were asked to write more data than we want to or if there
	       is not that much space, then we shorten amount of data to write
	       for this iteration. */
	    num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
	    /* Also we should not forget to set size in bytes accordingly */
	    write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE-1));
					 /* If position is not on the
					    start of the page, we need
					    to subtract the offset
					    within page */
	} else
	    write_bytes = count;

	/* reserve the blocks to be allocated later, so that later on
	   we still have the space to write the blocks to */
	reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
	reiserfs_write_unlock(inode->i_sb);

	if ( !num_pages ) { /* If we do not have enough space even for */
	    res = -ENOSPC;  /* a single page, return -ENOSPC */
	    if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1)))
		break; // In case we are writing past the file end, break.
	    // Otherwise we are possibly overwriting the file, so
	    // let's set write size to be equal or less than blocksize.
	    // This way we get it correctly for file holes.
	    // But overwriting files on absolutely full volumes would not
	    // be very efficient. Well, people are not supposed to fill
	    // 100% of disk space anyway.
	    write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
	    num_pages = 1;
	    // No blocks were claimed before, so do it now.
	    reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
	}

	/* Prepare for writing into the region, read in all the
	   partially overwritten pages, if needed. And lock the pages,
	   so that nobody else can access these until we are done.
	   We get number of actual blocks needed as a result.*/
	blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
	if ( blocks_to_allocate < 0 ) {
	    res = blocks_to_allocate;
	    reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
	    break;
	}

	/* First we correct our estimate of how many blocks we need */
	reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
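	/* Gloss (added): together with the release below, the accounting
	   balances out: num_pages worth of blocks were claimed above, the
	   difference between that and blocks_to_allocate is released here,
	   and the remaining blocks_to_allocate are released once they have
	   been turned into real allocations. */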

	if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
	    /* Fill in all the possible holes and append the file if needed */
	    res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
	}

	/* well, we have allocated the blocks, so it is time to free
	   the reservation we made earlier. */
	reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
	if ( res ) {
	    reiserfs_unprepare_pages(prepared_pages, num_pages);
	    break;
	}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

	/* Copy data from user-supplied buffer to file's pages */
	res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
	if ( res ) {
	    reiserfs_unprepare_pages(prepared_pages, num_pages);
	    break;
	}

	/* Send the pages to disk and unlock them. */
	res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
	                                            write_bytes,prepared_pages);
	if ( res )
	    break;

	already_written += write_bytes;
	buf += write_bytes;
	*ppos = pos += write_bytes;
	count -= write_bytes;
	balance_dirty_pages_ratelimited(inode->i_mapping);
    }

    /* this is only true on error */
    if (th.t_trans_id) {
        reiserfs_write_lock(inode->i_sb);
        err = journal_end(&th, th.t_super, th.t_blocks_allocated);
        reiserfs_write_unlock(inode->i_sb);
        if (err) {
            res = err;
            goto out;
        }
    }

    if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);

    up(&inode->i_sem);
    reiserfs_async_progress_wait(inode->i_sb);
    return (already_written != 0)?already_written:res;

out:
    up(&inode->i_sem); // unlock the file on exit.
    return res;
}

static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
			       size_t count, loff_t pos)
{
    return generic_file_aio_write(iocb, buf, count, pos);
}



struct file_operations reiserfs_file_operations = {
    .read	= generic_file_read,
    .write	= reiserfs_file_write,
    .ioctl	= reiserfs_ioctl,
    .mmap	= generic_file_mmap,
    .release	= reiserfs_file_release,
    .fsync	= reiserfs_sync_file,
    .sendfile	= generic_file_sendfile,
    .aio_read   = generic_file_aio_read,
    .aio_write  = reiserfs_aio_write,
};


struct inode_operations reiserfs_file_inode_operations = {
    .truncate	= reiserfs_vfs_truncate_file,
    .setattr    = reiserfs_setattr,
    .setxattr   = reiserfs_setxattr,
    .getxattr   = reiserfs_getxattr,
    .listxattr  = reiserfs_listxattr,
    .removexattr = reiserfs_removexattr,
    .permission = reiserfs_permission,
};