xref: /openbmc/linux/fs/buffer.c (revision 637aff46f94a754207c80c8c64bf1b74f24b967d)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52 	bh->b_end_io = handler;
53 	bh->b_private = private;
54 }
55 
56 static int sync_buffer(void *word)
57 {
58 	struct block_device *bd;
59 	struct buffer_head *bh
60 		= container_of(word, struct buffer_head, b_state);
61 
62 	smp_mb();
63 	bd = bh->b_bdev;
64 	if (bd)
65 		blk_run_address_space(bd->bd_inode->i_mapping);
66 	io_schedule();
67 	return 0;
68 }
69 
70 void fastcall __lock_buffer(struct buffer_head *bh)
71 {
72 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 							TASK_UNINTERRUPTIBLE);
74 }
75 EXPORT_SYMBOL(__lock_buffer);
76 
77 void fastcall unlock_buffer(struct buffer_head *bh)
78 {
79 	smp_mb__before_clear_bit();
80 	clear_buffer_locked(bh);
81 	smp_mb__after_clear_bit();
82 	wake_up_bit(&bh->b_state, BH_Lock);
83 }
84 
85 /*
86  * Block until a buffer comes unlocked.  This doesn't stop it
87  * from becoming locked again - you have to lock it yourself
88  * if you want to preserve its state.
89  */
90 void __wait_on_buffer(struct buffer_head * bh)
91 {
92 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
93 }
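/*
 * Illustrative sketch (not part of the original file): because
 * __wait_on_buffer() does not keep the buffer unlocked, code that needs a
 * stable view takes the buffer lock itself, as the comment above says.
 * The function name below is hypothetical.
 */
static void example_stable_access(struct buffer_head *bh)
{
	lock_buffer(bh);	/* sleeps via __lock_buffer() until we own BH_Lock */
	/* ... inspect or modify bh->b_data while nobody else owns the lock ... */
	unlock_buffer(bh);	/* wakes any waiters via wake_up_bit() */
}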
94 
95 static void
96 __clear_page_buffers(struct page *page)
97 {
98 	ClearPagePrivate(page);
99 	set_page_private(page, 0);
100 	page_cache_release(page);
101 }
102 
103 static void buffer_io_error(struct buffer_head *bh)
104 {
105 	char b[BDEVNAME_SIZE];
106 
107 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
108 			bdevname(bh->b_bdev, b),
109 			(unsigned long long)bh->b_blocknr);
110 }
111 
112 /*
113  * End-of-IO handler helper function which does not touch the bh after
114  * unlocking it.
115  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
116  * a race there is benign: unlock_buffer() only uses the bh's address for
117  * hashing after unlocking the buffer, so it doesn't actually touch the bh
118  * itself.
119  */
120 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
121 {
122 	if (uptodate) {
123 		set_buffer_uptodate(bh);
124 	} else {
125 		/* This happens due to failed READA attempts. */
126 		clear_buffer_uptodate(bh);
127 	}
128 	unlock_buffer(bh);
129 }
130 
131 /*
132  * Default synchronous end-of-IO handler.  Just mark it up-to-date and
133  * unlock the buffer. This is what ll_rw_block uses too.
134  */
135 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136 {
137 	__end_buffer_read_notouch(bh, uptodate);
138 	put_bh(bh);
139 }
140 
141 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 {
143 	char b[BDEVNAME_SIZE];
144 
145 	if (uptodate) {
146 		set_buffer_uptodate(bh);
147 	} else {
148 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
149 			buffer_io_error(bh);
150 			printk(KERN_WARNING "lost page write due to "
151 					"I/O error on %s\n",
152 				       bdevname(bh->b_bdev, b));
153 		}
154 		set_buffer_write_io_error(bh);
155 		clear_buffer_uptodate(bh);
156 	}
157 	unlock_buffer(bh);
158 	put_bh(bh);
159 }
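/*
 * Illustrative sketch (not part of the original file): the synchronous read
 * pattern these handlers exist for - essentially what __bread_slow() does
 * further down in this file.  The function name is hypothetical.
 */
static int example_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	get_bh(bh);			/* balanced by put_bh() in end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}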
160 
161 /*
162  * Write out and wait upon all the dirty data associated with a block
163  * device via its mapping.  Does not take the superblock lock.
164  */
165 int sync_blockdev(struct block_device *bdev)
166 {
167 	int ret = 0;
168 
169 	if (bdev)
170 		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
171 	return ret;
172 }
173 EXPORT_SYMBOL(sync_blockdev);
174 
175 /*
176  * Write out and wait upon all dirty data associated with this
177  * device.   Filesystem data as well as the underlying block
178  * device.  Takes the superblock lock.
179  */
180 int fsync_bdev(struct block_device *bdev)
181 {
182 	struct super_block *sb = get_super(bdev);
183 	if (sb) {
184 		int res = fsync_super(sb);
185 		drop_super(sb);
186 		return res;
187 	}
188 	return sync_blockdev(bdev);
189 }
190 
191 /**
192  * freeze_bdev  --  lock a filesystem and force it into a consistent state
193  * @bdev:	blockdevice to lock
194  *
195  * This takes the block device bd_mount_sem to make sure no new mounts
196  * happen on bdev until thaw_bdev() is called.
197  * If a superblock is found on this device, we take the s_umount semaphore
198  * on it to make sure nobody unmounts until the snapshot creation is done.
199  */
200 struct super_block *freeze_bdev(struct block_device *bdev)
201 {
202 	struct super_block *sb;
203 
204 	down(&bdev->bd_mount_sem);
205 	sb = get_super(bdev);
206 	if (sb && !(sb->s_flags & MS_RDONLY)) {
207 		sb->s_frozen = SB_FREEZE_WRITE;
208 		smp_wmb();
209 
210 		__fsync_super(sb);
211 
212 		sb->s_frozen = SB_FREEZE_TRANS;
213 		smp_wmb();
214 
215 		sync_blockdev(sb->s_bdev);
216 
217 		if (sb->s_op->write_super_lockfs)
218 			sb->s_op->write_super_lockfs(sb);
219 	}
220 
221 	sync_blockdev(bdev);
222 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
223 }
224 EXPORT_SYMBOL(freeze_bdev);
225 
226 /**
227  * thaw_bdev  -- unlock filesystem
228  * @bdev:	blockdevice to unlock
229  * @sb:		associated superblock
230  *
231  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
232  */
233 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
234 {
235 	if (sb) {
236 		BUG_ON(sb->s_bdev != bdev);
237 
238 		if (sb->s_op->unlockfs)
239 			sb->s_op->unlockfs(sb);
240 		sb->s_frozen = SB_UNFROZEN;
241 		smp_wmb();
242 		wake_up(&sb->s_wait_unfrozen);
243 		drop_super(sb);
244 	}
245 
246 	up(&bdev->bd_mount_sem);
247 }
248 EXPORT_SYMBOL(thaw_bdev);
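/*
 * Illustrative sketch (not part of the original file): how a snapshot
 * implementation might bracket its work with freeze_bdev()/thaw_bdev().
 * Both the function and the take_snapshot callback are hypothetical.
 */
static int example_snapshot(struct block_device *bdev,
			    int (*take_snapshot)(struct block_device *))
{
	struct super_block *sb;
	int err;

	sb = freeze_bdev(bdev);		/* blocks new mounts, quiesces writes */
	err = take_snapshot(bdev);	/* device is now in a consistent state */
	thaw_bdev(bdev, sb);		/* releases s_umount and bd_mount_sem */
	return err;
}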
249 
250 /*
251  * Various filesystems appear to want __find_get_block to be non-blocking.
252  * But it's the page lock which protects the buffers.  To get around this,
253  * we get exclusion from try_to_free_buffers with the blockdev mapping's
254  * private_lock.
255  *
256  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
257  * may be quite high.  This code could TryLock the page, and if that
258  * succeeds, there is no need to take private_lock. (But if
259  * private_lock is contended then so is mapping->tree_lock).
260  */
261 static struct buffer_head *
262 __find_get_block_slow(struct block_device *bdev, sector_t block)
263 {
264 	struct inode *bd_inode = bdev->bd_inode;
265 	struct address_space *bd_mapping = bd_inode->i_mapping;
266 	struct buffer_head *ret = NULL;
267 	pgoff_t index;
268 	struct buffer_head *bh;
269 	struct buffer_head *head;
270 	struct page *page;
271 	int all_mapped = 1;
272 
273 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
274 	page = find_get_page(bd_mapping, index);
275 	if (!page)
276 		goto out;
277 
278 	spin_lock(&bd_mapping->private_lock);
279 	if (!page_has_buffers(page))
280 		goto out_unlock;
281 	head = page_buffers(page);
282 	bh = head;
283 	do {
284 		if (bh->b_blocknr == block) {
285 			ret = bh;
286 			get_bh(bh);
287 			goto out_unlock;
288 		}
289 		if (!buffer_mapped(bh))
290 			all_mapped = 0;
291 		bh = bh->b_this_page;
292 	} while (bh != head);
293 
294 	/* we might be here because some of the buffers on this page are
295 	 * not mapped.  This is due to various races between
296 	 * file io on the block device and getblk.  It gets dealt with
297 	 * elsewhere; don't report a buffer error if we had some unmapped buffers
298 	 */
299 	if (all_mapped) {
300 		printk("__find_get_block_slow() failed. "
301 			"block=%llu, b_blocknr=%llu\n",
302 			(unsigned long long)block,
303 			(unsigned long long)bh->b_blocknr);
304 		printk("b_state=0x%08lx, b_size=%zu\n",
305 			bh->b_state, bh->b_size);
306 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
307 	}
308 out_unlock:
309 	spin_unlock(&bd_mapping->private_lock);
310 	page_cache_release(page);
311 out:
312 	return ret;
313 }
314 
315 /* If invalidate_buffers() will trash dirty buffers, it means some kind
316    of fs corruption is going on. Trashing dirty data always implies losing
317    information that was supposed to be just stored on the physical layer
318    by the user.
319 
320    Thus invalidate_buffers in general usage is not allowed to trash
321    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
322    be preserved.  These buffers are simply skipped.
323 
324    We also skip buffers which are still in use.  For example this can
325    happen if a userspace program is reading the block device.
326 
327    NOTE: in the case where the user removed a removable-media disk, even if
328    there's still dirty data not synced to disk (due to a bug in the device
329    driver or to an error of the user), by not destroying the dirty buffers we
330    could also generate corruption on the next media inserted, thus a parameter
331    is necessary to handle this case in the safest way possible (trying
332    not to corrupt the newly inserted disk with data belonging to
333    the old, now corrupted disk). For the ramdisk, however, the natural way
334    to release the ramdisk memory is to destroy the dirty buffers.
335 
336    These are two special cases. Normal usage implies that the device driver
337    issues a sync on the device (without waiting for I/O completion) and
338    then calls invalidate_buffers, which doesn't trash dirty buffers.
339 
340    For handling cache coherency with the blkdev pagecache, the 'update' case
341    has been introduced. It is needed to re-read any pinned buffer from
342    disk. NOTE: re-reading from disk is destructive, so we can do it only
343    when we assume nobody is changing the buffercache under our I/O and when
344    we think the disk contains more recent information than the buffercache.
345    The update == 1 pass marks the buffers we need to update; the update == 2
346    pass does the actual I/O. */
347 void invalidate_bdev(struct block_device *bdev)
348 {
349 	struct address_space *mapping = bdev->bd_inode->i_mapping;
350 
351 	if (mapping->nrpages == 0)
352 		return;
353 
354 	invalidate_bh_lrus();
355 	invalidate_mapping_pages(mapping, 0, -1);
356 }
357 
358 /*
359  * Kick pdflush then try to free up some ZONE_NORMAL memory.
360  */
361 static void free_more_memory(void)
362 {
363 	struct zone **zones;
364 	pg_data_t *pgdat;
365 
366 	wakeup_pdflush(1024);
367 	yield();
368 
369 	for_each_online_pgdat(pgdat) {
370 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
371 		if (*zones)
372 			try_to_free_pages(zones, 0, GFP_NOFS);
373 	}
374 }
375 
376 /*
377  * I/O completion handler for block_read_full_page() - pages
378  * which come unlocked at the end of I/O.
379  */
380 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
381 {
382 	unsigned long flags;
383 	struct buffer_head *first;
384 	struct buffer_head *tmp;
385 	struct page *page;
386 	int page_uptodate = 1;
387 
388 	BUG_ON(!buffer_async_read(bh));
389 
390 	page = bh->b_page;
391 	if (uptodate) {
392 		set_buffer_uptodate(bh);
393 	} else {
394 		clear_buffer_uptodate(bh);
395 		if (printk_ratelimit())
396 			buffer_io_error(bh);
397 		SetPageError(page);
398 	}
399 
400 	/*
401 	 * Be _very_ careful from here on. Bad things can happen if
402 	 * two buffer heads end IO at almost the same time and both
403 	 * decide that the page is now completely done.
404 	 */
405 	first = page_buffers(page);
406 	local_irq_save(flags);
407 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
408 	clear_buffer_async_read(bh);
409 	unlock_buffer(bh);
410 	tmp = bh;
411 	do {
412 		if (!buffer_uptodate(tmp))
413 			page_uptodate = 0;
414 		if (buffer_async_read(tmp)) {
415 			BUG_ON(!buffer_locked(tmp));
416 			goto still_busy;
417 		}
418 		tmp = tmp->b_this_page;
419 	} while (tmp != bh);
420 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
421 	local_irq_restore(flags);
422 
423 	/*
424 	 * If none of the buffers had errors and they are all
425 	 * uptodate then we can set the page uptodate.
426 	 */
427 	if (page_uptodate && !PageError(page))
428 		SetPageUptodate(page);
429 	unlock_page(page);
430 	return;
431 
432 still_busy:
433 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
434 	local_irq_restore(flags);
435 	return;
436 }
437 
438 /*
439  * Completion handler for block_write_full_page() - pages which are unlocked
440  * during I/O, and which have PageWriteback cleared upon I/O completion.
441  */
442 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
443 {
444 	char b[BDEVNAME_SIZE];
445 	unsigned long flags;
446 	struct buffer_head *first;
447 	struct buffer_head *tmp;
448 	struct page *page;
449 
450 	BUG_ON(!buffer_async_write(bh));
451 
452 	page = bh->b_page;
453 	if (uptodate) {
454 		set_buffer_uptodate(bh);
455 	} else {
456 		if (printk_ratelimit()) {
457 			buffer_io_error(bh);
458 			printk(KERN_WARNING "lost page write due to "
459 					"I/O error on %s\n",
460 			       bdevname(bh->b_bdev, b));
461 		}
462 		set_bit(AS_EIO, &page->mapping->flags);
463 		set_buffer_write_io_error(bh);
464 		clear_buffer_uptodate(bh);
465 		SetPageError(page);
466 	}
467 
468 	first = page_buffers(page);
469 	local_irq_save(flags);
470 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
471 
472 	clear_buffer_async_write(bh);
473 	unlock_buffer(bh);
474 	tmp = bh->b_this_page;
475 	while (tmp != bh) {
476 		if (buffer_async_write(tmp)) {
477 			BUG_ON(!buffer_locked(tmp));
478 			goto still_busy;
479 		}
480 		tmp = tmp->b_this_page;
481 	}
482 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
483 	local_irq_restore(flags);
484 	end_page_writeback(page);
485 	return;
486 
487 still_busy:
488 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
489 	local_irq_restore(flags);
490 	return;
491 }
492 
493 /*
494  * If a page's buffers are under async read-in (end_buffer_async_read
495  * completion) then there is a possibility that another thread of
496  * control could lock one of the buffers after it has completed
497  * but while some of the other buffers have not completed.  This
498  * locked buffer would confuse end_buffer_async_read() into not unlocking
499  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
500  * that this buffer is not under async I/O.
501  *
502  * The page comes unlocked when it has no locked buffer_async buffers
503  * left.
504  *
505  * PageLocked prevents anyone from starting new async I/O reads against
506  * any of the buffers.
507  *
508  * PageWriteback is used to prevent simultaneous writeout of the same
509  * page.
510  *
511  * PageLocked prevents anyone from starting writeback of a page which is
512  * under read I/O (PageWriteback is only ever set against a locked page).
513  */
514 static void mark_buffer_async_read(struct buffer_head *bh)
515 {
516 	bh->b_end_io = end_buffer_async_read;
517 	set_buffer_async_read(bh);
518 }
519 
520 void mark_buffer_async_write(struct buffer_head *bh)
521 {
522 	bh->b_end_io = end_buffer_async_write;
523 	set_buffer_async_write(bh);
524 }
525 EXPORT_SYMBOL(mark_buffer_async_write);
526 
527 
528 /*
529  * fs/buffer.c contains helper functions for buffer-backed address space's
530  * fsync functions.  A common requirement for buffer-based filesystems is
531  * that certain data from the backing blockdev needs to be written out for
532  * a successful fsync().  For example, ext2 indirect blocks need to be
533  * written back and waited upon before fsync() returns.
534  *
535  * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
536  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
537  * management of a list of dependent buffers at ->i_mapping->private_list.
538  *
539  * Locking is a little subtle: try_to_free_buffers() will remove buffers
540  * from their controlling inode's queue when they are being freed.  But
541  * try_to_free_buffers() will be operating against the *blockdev* mapping
542  * at the time, not against the S_ISREG file which depends on those buffers.
543  * So the locking for private_list is via the private_lock in the address_space
544  * which backs the buffers.  Which is different from the address_space
545  * against which the buffers are listed.  So for a particular address_space,
546  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
547  * mapping->private_list will always be protected by the backing blockdev's
548  * ->private_lock.
549  *
550  * Which introduces a requirement: all buffers on an address_space's
551  * ->private_list must be from the same address_space: the blockdev's.
552  *
553  * address_spaces which do not place buffers at ->private_list via these
554  * utility functions are free to use private_lock and private_list for
555  * whatever they want.  The only requirement is that list_empty(private_list)
556  * be true at clear_inode() time.
557  *
558  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
559  * filesystems should do that.  invalidate_inode_buffers() should just go
560  * BUG_ON(!list_empty).
561  *
562  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
563  * take an address_space, not an inode.  And it should be called
564  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
565  * queued up.
566  *
567  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
568  * list if it is already on a list.  Because if the buffer is on a list,
569  * it *must* already be on the right one.  If not, the filesystem is being
570  * silly.  This will save a ton of locking.  But first we have to ensure
571  * that buffers are taken *off* the old inode's list when they are freed
572  * (presumably in truncate).  That requires careful auditing of all
573  * filesystems (do it inside bforget()).  It could also be done by bringing
574  * b_inode back.
575  */
576 
577 /*
578  * The buffer's backing address_space's private_lock must be held
579  */
580 static inline void __remove_assoc_queue(struct buffer_head *bh)
581 {
582 	list_del_init(&bh->b_assoc_buffers);
583 	WARN_ON(!bh->b_assoc_map);
584 	if (buffer_write_io_error(bh))
585 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
586 	bh->b_assoc_map = NULL;
587 }
588 
589 int inode_has_buffers(struct inode *inode)
590 {
591 	return !list_empty(&inode->i_data.private_list);
592 }
593 
594 /*
595  * osync is designed to support O_SYNC io.  It waits synchronously for
596  * all already-submitted IO to complete, but does not queue any new
597  * writes to the disk.
598  *
599  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
600  * you dirty the buffers, and then use osync_inode_buffers to wait for
601  * completion.  Any other dirty buffers which are not yet queued for
602  * write will not be flushed to disk by the osync.
603  */
604 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
605 {
606 	struct buffer_head *bh;
607 	struct list_head *p;
608 	int err = 0;
609 
610 	spin_lock(lock);
611 repeat:
612 	list_for_each_prev(p, list) {
613 		bh = BH_ENTRY(p);
614 		if (buffer_locked(bh)) {
615 			get_bh(bh);
616 			spin_unlock(lock);
617 			wait_on_buffer(bh);
618 			if (!buffer_uptodate(bh))
619 				err = -EIO;
620 			brelse(bh);
621 			spin_lock(lock);
622 			goto repeat;
623 		}
624 	}
625 	spin_unlock(lock);
626 	return err;
627 }
628 
629 /**
630  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
631  *                        buffers
632  * @mapping: the mapping which wants those buffers written
633  *
634  * Starts I/O against the buffers at mapping->private_list, and waits upon
635  * that I/O.
636  *
637  * Basically, this is a convenience function for fsync().
638  * @mapping is a file or directory which needs those buffers to be written for
639  * a successful fsync().
640  */
641 int sync_mapping_buffers(struct address_space *mapping)
642 {
643 	struct address_space *buffer_mapping = mapping->assoc_mapping;
644 
645 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
646 		return 0;
647 
648 	return fsync_buffers_list(&buffer_mapping->private_lock,
649 					&mapping->private_list);
650 }
651 EXPORT_SYMBOL(sync_mapping_buffers);
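/*
 * Illustrative sketch (not part of the original file): a minimal ->fsync
 * in the style described above, using the prototype this kernel's
 * file_operations expects.  The function name is hypothetical, and a real
 * filesystem would also write back the inode itself.
 */
static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* Write out and wait upon everything on ->i_mapping->private_list. */
	return sync_mapping_buffers(inode->i_mapping);
}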
652 
653 /*
654  * Called when we've recently written block `bblock', and it is known that
655  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
656  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
657  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
658  */
659 void write_boundary_block(struct block_device *bdev,
660 			sector_t bblock, unsigned blocksize)
661 {
662 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
663 	if (bh) {
664 		if (buffer_dirty(bh))
665 			ll_rw_block(WRITE, 1, &bh);
666 		put_bh(bh);
667 	}
668 }
669 
670 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
671 {
672 	struct address_space *mapping = inode->i_mapping;
673 	struct address_space *buffer_mapping = bh->b_page->mapping;
674 
675 	mark_buffer_dirty(bh);
676 	if (!mapping->assoc_mapping) {
677 		mapping->assoc_mapping = buffer_mapping;
678 	} else {
679 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
680 	}
681 	if (list_empty(&bh->b_assoc_buffers)) {
682 		spin_lock(&buffer_mapping->private_lock);
683 		list_move_tail(&bh->b_assoc_buffers,
684 				&mapping->private_list);
685 		bh->b_assoc_map = mapping;
686 		spin_unlock(&buffer_mapping->private_lock);
687 	}
688 }
689 EXPORT_SYMBOL(mark_buffer_dirty_inode);
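/*
 * Illustrative sketch (not part of the original file): the other half of
 * the pattern - the write path files a dependent indirect block against
 * the owning inode so that the example_fsync() sketch above will find it
 * on ->private_list.  The function name is hypothetical.
 */
static void example_dirty_indirect(struct inode *inode,
				   struct buffer_head *indirect_bh)
{
	/* Dirties the buffer and moves it onto inode->i_mapping->private_list. */
	mark_buffer_dirty_inode(indirect_bh, inode);
}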
690 
691 /*
692  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
693  * dirty.
694  *
695  * If warn is true, then emit a warning if the page is not uptodate and has
696  * not been truncated.
697  */
698 static int __set_page_dirty(struct page *page,
699 		struct address_space *mapping, int warn)
700 {
701 	if (unlikely(!mapping))
702 		return !TestSetPageDirty(page);
703 
704 	if (TestSetPageDirty(page))
705 		return 0;
706 
707 	write_lock_irq(&mapping->tree_lock);
708 	if (page->mapping) {	/* Race with truncate? */
709 		WARN_ON_ONCE(warn && !PageUptodate(page));
710 
711 		if (mapping_cap_account_dirty(mapping)) {
712 			__inc_zone_page_state(page, NR_FILE_DIRTY);
713 			task_io_account_write(PAGE_CACHE_SIZE);
714 		}
715 		radix_tree_tag_set(&mapping->page_tree,
716 				page_index(page), PAGECACHE_TAG_DIRTY);
717 	}
718 	write_unlock_irq(&mapping->tree_lock);
719 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
720 
721 	return 1;
722 }
723 
724 /*
725  * Add a page to the dirty page list.
726  *
727  * It is a sad fact of life that this function is called from several places
728  * deeply under spinlocking.  It may not sleep.
729  *
730  * If the page has buffers, the uptodate buffers are set dirty, to preserve
731  * dirty-state coherency between the page and the buffers.  If the page does
732  * not have buffers then when they are later attached they will all be set
733  * dirty.
734  *
735  * The buffers are dirtied before the page is dirtied.  There's a small race
736  * window in which a writepage caller may see the page cleanness but not the
737  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
738  * before the buffers, a concurrent writepage caller could clear the page dirty
739  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
740  * page on the dirty page list.
741  *
742  * We use private_lock to lock against try_to_free_buffers while using the
743  * page's buffer list.  Also use this to protect against clean buffers being
744  * added to the page after it was set dirty.
745  *
746  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
747  * address_space though.
748  */
749 int __set_page_dirty_buffers(struct page *page)
750 {
751 	struct address_space *mapping = page_mapping(page);
752 
753 	if (unlikely(!mapping))
754 		return !TestSetPageDirty(page);
755 
756 	spin_lock(&mapping->private_lock);
757 	if (page_has_buffers(page)) {
758 		struct buffer_head *head = page_buffers(page);
759 		struct buffer_head *bh = head;
760 
761 		do {
762 			set_buffer_dirty(bh);
763 			bh = bh->b_this_page;
764 		} while (bh != head);
765 	}
766 	spin_unlock(&mapping->private_lock);
767 
768 	return __set_page_dirty(page, mapping, 1);
769 }
770 EXPORT_SYMBOL(__set_page_dirty_buffers);
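/*
 * Illustrative sketch (not part of the original file): buffer-backed
 * filesystems commonly publish this helper as their ->set_page_dirty
 * method in address_space_operations.  The aops name is hypothetical.
 */
static const struct address_space_operations example_aops = {
	.set_page_dirty		= __set_page_dirty_buffers,
};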
771 
772 /*
773  * Write out and wait upon a list of buffers.
774  *
775  * We have conflicting pressures: we want to make sure that all
776  * initially dirty buffers get waited on, but that any subsequently
777  * dirtied buffers don't.  After all, we don't want fsync to last
778  * forever if somebody is actively writing to the file.
779  *
780  * Do this in two main stages: first we copy dirty buffers to a
781  * temporary inode list, queueing the writes as we go.  Then we clean
782  * up, waiting for those writes to complete.
783  *
784  * During this second stage, any subsequent updates to the file may end
785  * up refiling the buffer on the original inode's dirty list again, so
786  * there is a chance we will end up with a buffer queued for write but
787  * not yet completed on that list.  So, as a final cleanup we go through
788  * the osync code to catch these locked, dirty buffers without requeuing
789  * any newly dirty buffers for write.
790  */
791 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
792 {
793 	struct buffer_head *bh;
794 	struct list_head tmp;
795 	int err = 0, err2;
796 
797 	INIT_LIST_HEAD(&tmp);
798 
799 	spin_lock(lock);
800 	while (!list_empty(list)) {
801 		bh = BH_ENTRY(list->next);
802 		__remove_assoc_queue(bh);
803 		if (buffer_dirty(bh) || buffer_locked(bh)) {
804 			list_add(&bh->b_assoc_buffers, &tmp);
805 			if (buffer_dirty(bh)) {
806 				get_bh(bh);
807 				spin_unlock(lock);
808 				/*
809 				 * Ensure any pending I/O completes so that
810 				 * ll_rw_block() actually writes the current
811 				 * contents - it is a noop if I/O is still in
812 				 * flight on potentially older contents.
813 				 */
814 				ll_rw_block(SWRITE, 1, &bh);
815 				brelse(bh);
816 				spin_lock(lock);
817 			}
818 		}
819 	}
820 
821 	while (!list_empty(&tmp)) {
822 		bh = BH_ENTRY(tmp.prev);
823 		list_del_init(&bh->b_assoc_buffers);
824 		get_bh(bh);
825 		spin_unlock(lock);
826 		wait_on_buffer(bh);
827 		if (!buffer_uptodate(bh))
828 			err = -EIO;
829 		brelse(bh);
830 		spin_lock(lock);
831 	}
832 
833 	spin_unlock(lock);
834 	err2 = osync_buffers_list(lock, list);
835 	if (err)
836 		return err;
837 	else
838 		return err2;
839 }
840 
841 /*
842  * Invalidate any and all dirty buffers on a given inode.  We are
843  * probably unmounting the fs, but that doesn't mean we have already
844  * done a sync().  Just drop the buffers from the inode list.
845  *
846  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
847  * assumes that all the buffers are against the blockdev.  Not true
848  * for reiserfs.
849  */
850 void invalidate_inode_buffers(struct inode *inode)
851 {
852 	if (inode_has_buffers(inode)) {
853 		struct address_space *mapping = &inode->i_data;
854 		struct list_head *list = &mapping->private_list;
855 		struct address_space *buffer_mapping = mapping->assoc_mapping;
856 
857 		spin_lock(&buffer_mapping->private_lock);
858 		while (!list_empty(list))
859 			__remove_assoc_queue(BH_ENTRY(list->next));
860 		spin_unlock(&buffer_mapping->private_lock);
861 	}
862 }
863 
864 /*
865  * Remove any clean buffers from the inode's buffer list.  This is called
866  * when we're trying to free the inode itself.  Those buffers can pin it.
867  *
868  * Returns true if all buffers were removed.
869  */
870 int remove_inode_buffers(struct inode *inode)
871 {
872 	int ret = 1;
873 
874 	if (inode_has_buffers(inode)) {
875 		struct address_space *mapping = &inode->i_data;
876 		struct list_head *list = &mapping->private_list;
877 		struct address_space *buffer_mapping = mapping->assoc_mapping;
878 
879 		spin_lock(&buffer_mapping->private_lock);
880 		while (!list_empty(list)) {
881 			struct buffer_head *bh = BH_ENTRY(list->next);
882 			if (buffer_dirty(bh)) {
883 				ret = 0;
884 				break;
885 			}
886 			__remove_assoc_queue(bh);
887 		}
888 		spin_unlock(&buffer_mapping->private_lock);
889 	}
890 	return ret;
891 }
892 
893 /*
894  * Create the appropriate buffers when given a page for data area and
895  * the size of each buffer.  Use the bh->b_this_page linked list to
896  * follow the buffers created.  Return NULL if unable to create more
897  * buffers.
898  *
899  * The retry flag is used to differentiate async IO (paging, swapping),
900  * which may not fail, from ordinary buffer allocations.
901  */
902 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
903 		int retry)
904 {
905 	struct buffer_head *bh, *head;
906 	long offset;
907 
908 try_again:
909 	head = NULL;
910 	offset = PAGE_SIZE;
911 	while ((offset -= size) >= 0) {
912 		bh = alloc_buffer_head(GFP_NOFS);
913 		if (!bh)
914 			goto no_grow;
915 
916 		bh->b_bdev = NULL;
917 		bh->b_this_page = head;
918 		bh->b_blocknr = -1;
919 		head = bh;
920 
921 		bh->b_state = 0;
922 		atomic_set(&bh->b_count, 0);
923 		bh->b_private = NULL;
924 		bh->b_size = size;
925 
926 		/* Link the buffer to its page */
927 		set_bh_page(bh, page, offset);
928 
929 		init_buffer(bh, NULL, NULL);
930 	}
931 	return head;
932 /*
933  * In case anything failed, we just free everything we got.
934  */
935 no_grow:
936 	if (head) {
937 		do {
938 			bh = head;
939 			head = head->b_this_page;
940 			free_buffer_head(bh);
941 		} while (head);
942 	}
943 
944 	/*
945 	 * Return failure for non-async IO requests.  Async IO requests
946 	 * are not allowed to fail, so we have to wait until buffer heads
947 	 * become available.  But we don't want tasks sleeping with
948 	 * partially complete buffers, so all were released above.
949 	 */
950 	if (!retry)
951 		return NULL;
952 
953 	/* We're _really_ low on memory. Now we just
954 	 * wait for old buffer heads to become free due to
955 	 * finishing IO.  Since this is an async request and
956 	 * the reserve list is empty, we're sure there are
957 	 * async buffer heads in use.
958 	 */
959 	free_more_memory();
960 	goto try_again;
961 }
962 EXPORT_SYMBOL_GPL(alloc_page_buffers);
963 
964 static inline void
965 link_dev_buffers(struct page *page, struct buffer_head *head)
966 {
967 	struct buffer_head *bh, *tail;
968 
969 	bh = head;
970 	do {
971 		tail = bh;
972 		bh = bh->b_this_page;
973 	} while (bh);
974 	tail->b_this_page = head;
975 	attach_page_buffers(page, head);
976 }
977 
978 /*
979  * Initialise the state of a blockdev page's buffers.
980  */
981 static void
982 init_page_buffers(struct page *page, struct block_device *bdev,
983 			sector_t block, int size)
984 {
985 	struct buffer_head *head = page_buffers(page);
986 	struct buffer_head *bh = head;
987 	int uptodate = PageUptodate(page);
988 
989 	do {
990 		if (!buffer_mapped(bh)) {
991 			init_buffer(bh, NULL, NULL);
992 			bh->b_bdev = bdev;
993 			bh->b_blocknr = block;
994 			if (uptodate)
995 				set_buffer_uptodate(bh);
996 			set_buffer_mapped(bh);
997 		}
998 		block++;
999 		bh = bh->b_this_page;
1000 	} while (bh != head);
1001 }
1002 
1003 /*
1004  * Create the page-cache page that contains the requested block.
1005  *
1006  * This is used purely for blockdev mappings.
1007  */
1008 static struct page *
1009 grow_dev_page(struct block_device *bdev, sector_t block,
1010 		pgoff_t index, int size)
1011 {
1012 	struct inode *inode = bdev->bd_inode;
1013 	struct page *page;
1014 	struct buffer_head *bh;
1015 
1016 	page = find_or_create_page(inode->i_mapping, index,
1017 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1018 	if (!page)
1019 		return NULL;
1020 
1021 	BUG_ON(!PageLocked(page));
1022 
1023 	if (page_has_buffers(page)) {
1024 		bh = page_buffers(page);
1025 		if (bh->b_size == size) {
1026 			init_page_buffers(page, bdev, block, size);
1027 			return page;
1028 		}
1029 		if (!try_to_free_buffers(page))
1030 			goto failed;
1031 	}
1032 
1033 	/*
1034 	 * Allocate some buffers for this page
1035 	 */
1036 	bh = alloc_page_buffers(page, size, 0);
1037 	if (!bh)
1038 		goto failed;
1039 
1040 	/*
1041 	 * Link the page to the buffers and initialise them.  Take the
1042 	 * lock to be atomic wrt __find_get_block(), which does not
1043 	 * run under the page lock.
1044 	 */
1045 	spin_lock(&inode->i_mapping->private_lock);
1046 	link_dev_buffers(page, bh);
1047 	init_page_buffers(page, bdev, block, size);
1048 	spin_unlock(&inode->i_mapping->private_lock);
1049 	return page;
1050 
1051 failed:
1052 	BUG();
1053 	unlock_page(page);
1054 	page_cache_release(page);
1055 	return NULL;
1056 }
1057 
1058 /*
1059  * Create buffers for the specified block device block's page.  If
1060  * that page was dirty, the buffers are set dirty also.
1061  */
1062 static int
1063 grow_buffers(struct block_device *bdev, sector_t block, int size)
1064 {
1065 	struct page *page;
1066 	pgoff_t index;
1067 	int sizebits;
1068 
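	/*
	 * sizebits ends up as log2(PAGE_SIZE / size): e.g. with 4k pages and
	 * 1k blocks the loop below yields sizebits == 2, so four blocks land
	 * in each page and index == block >> 2.
	 */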
1069 	sizebits = -1;
1070 	do {
1071 		sizebits++;
1072 	} while ((size << sizebits) < PAGE_SIZE);
1073 
1074 	index = block >> sizebits;
1075 
1076 	/*
1077 	 * Check for a block which wants to lie outside our maximum possible
1078 	 * pagecache index.  (this comparison is done using sector_t types).
1079 	 */
1080 	if (unlikely(index != block >> sizebits)) {
1081 		char b[BDEVNAME_SIZE];
1082 
1083 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1084 			"device %s\n",
1085 			__FUNCTION__, (unsigned long long)block,
1086 			bdevname(bdev, b));
1087 		return -EIO;
1088 	}
1089 	block = index << sizebits;
1090 	/* Create a page with the proper size buffers. */
1091 	page = grow_dev_page(bdev, block, index, size);
1092 	if (!page)
1093 		return 0;
1094 	unlock_page(page);
1095 	page_cache_release(page);
1096 	return 1;
1097 }
1098 
1099 static struct buffer_head *
1100 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1101 {
1102 	/* Size must be a multiple of the hard sector size */
1103 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1104 			(size < 512 || size > PAGE_SIZE))) {
1105 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1106 					size);
1107 		printk(KERN_ERR "hardsect size: %d\n",
1108 					bdev_hardsect_size(bdev));
1109 
1110 		dump_stack();
1111 		return NULL;
1112 	}
1113 
1114 	for (;;) {
1115 		struct buffer_head * bh;
1116 		int ret;
1117 
1118 		bh = __find_get_block(bdev, block, size);
1119 		if (bh)
1120 			return bh;
1121 
1122 		ret = grow_buffers(bdev, block, size);
1123 		if (ret < 0)
1124 			return NULL;
1125 		if (ret == 0)
1126 			free_more_memory();
1127 	}
1128 }
1129 
1130 /*
1131  * The relationship between dirty buffers and dirty pages:
1132  *
1133  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1134  * the page is tagged dirty in its radix tree.
1135  *
1136  * At all times, the dirtiness of the buffers represents the dirtiness of
1137  * subsections of the page.  If the page has buffers, the page dirty bit is
1138  * merely a hint about the true dirty state.
1139  *
1140  * When a page is set dirty in its entirety, all its buffers are marked dirty
1141  * (if the page has buffers).
1142  *
1143  * When a buffer is marked dirty, its page is dirtied, but the page's other
1144  * buffers are not.
1145  *
1146  * Also.  When blockdev buffers are explicitly read with bread(), they
1147  * individually become uptodate.  But their backing page remains not
1148  * uptodate - even if all of its buffers are uptodate.  A subsequent
1149  * block_read_full_page() against that page will discover all the uptodate
1150  * buffers, will set the page uptodate and will perform no I/O.
1151  */
1152 
1153 /**
1154  * mark_buffer_dirty - mark a buffer_head as needing writeout
1155  * @bh: the buffer_head to mark dirty
1156  *
1157  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1158  * backing page dirty, then tag the page as dirty in its address_space's radix
1159  * tree and then attach the address_space's inode to its superblock's dirty
1160  * inode list.
1161  *
1162  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1163  * mapping->tree_lock and the global inode_lock.
1164  */
1165 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1166 {
1167 	WARN_ON_ONCE(!buffer_uptodate(bh));
1168 	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1169 		__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1170 }
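/*
 * Illustrative sketch (not part of the original file): the usual
 * modify-then-dirty sequence for cached metadata.  The buffer is assumed
 * to have been read elsewhere (e.g. via __bread() below), and any
 * serialization against other updaters is the caller's business; the
 * function name is hypothetical.
 */
static void example_update_block(struct buffer_head *bh, unsigned offset, u8 val)
{
	((u8 *)bh->b_data)[offset] = val;	/* scribble on the cached block */
	mark_buffer_dirty(bh);			/* buffer, page, radix tree, inode */
}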
1171 
1172 /*
1173  * Decrement a buffer_head's reference count.  If all buffers against a page
1174  * have zero reference count, are clean and unlocked, and if the page is clean
1175  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1176  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1177  * a page but it ends up not being freed, and buffers may later be reattached).
1178  */
1179 void __brelse(struct buffer_head * buf)
1180 {
1181 	if (atomic_read(&buf->b_count)) {
1182 		put_bh(buf);
1183 		return;
1184 	}
1185 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1186 	WARN_ON(1);
1187 }
1188 
1189 /*
1190  * bforget() is like brelse(), except it discards any
1191  * potentially dirty data.
1192  */
1193 void __bforget(struct buffer_head *bh)
1194 {
1195 	clear_buffer_dirty(bh);
1196 	if (!list_empty(&bh->b_assoc_buffers)) {
1197 		struct address_space *buffer_mapping = bh->b_page->mapping;
1198 
1199 		spin_lock(&buffer_mapping->private_lock);
1200 		list_del_init(&bh->b_assoc_buffers);
1201 		bh->b_assoc_map = NULL;
1202 		spin_unlock(&buffer_mapping->private_lock);
1203 	}
1204 	__brelse(bh);
1205 }
1206 
1207 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1208 {
1209 	lock_buffer(bh);
1210 	if (buffer_uptodate(bh)) {
1211 		unlock_buffer(bh);
1212 		return bh;
1213 	} else {
1214 		get_bh(bh);
1215 		bh->b_end_io = end_buffer_read_sync;
1216 		submit_bh(READ, bh);
1217 		wait_on_buffer(bh);
1218 		if (buffer_uptodate(bh))
1219 			return bh;
1220 	}
1221 	brelse(bh);
1222 	return NULL;
1223 }
1224 
1225 /*
1226  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1227  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1228  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1229  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1230  * CPU's LRUs at the same time.
1231  *
1232  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1233  * sb_find_get_block().
1234  *
1235  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1236  * a local interrupt disable for that.
1237  */
1238 
1239 #define BH_LRU_SIZE	8
1240 
1241 struct bh_lru {
1242 	struct buffer_head *bhs[BH_LRU_SIZE];
1243 };
1244 
1245 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1246 
1247 #ifdef CONFIG_SMP
1248 #define bh_lru_lock()	local_irq_disable()
1249 #define bh_lru_unlock()	local_irq_enable()
1250 #else
1251 #define bh_lru_lock()	preempt_disable()
1252 #define bh_lru_unlock()	preempt_enable()
1253 #endif
1254 
1255 static inline void check_irqs_on(void)
1256 {
1257 #ifdef irqs_disabled
1258 	BUG_ON(irqs_disabled());
1259 #endif
1260 }
1261 
1262 /*
1263  * The LRU management algorithm is dopey-but-simple.  Sorry.
1264  */
1265 static void bh_lru_install(struct buffer_head *bh)
1266 {
1267 	struct buffer_head *evictee = NULL;
1268 	struct bh_lru *lru;
1269 
1270 	check_irqs_on();
1271 	bh_lru_lock();
1272 	lru = &__get_cpu_var(bh_lrus);
1273 	if (lru->bhs[0] != bh) {
1274 		struct buffer_head *bhs[BH_LRU_SIZE];
1275 		int in;
1276 		int out = 0;
1277 
1278 		get_bh(bh);
1279 		bhs[out++] = bh;
1280 		for (in = 0; in < BH_LRU_SIZE; in++) {
1281 			struct buffer_head *bh2 = lru->bhs[in];
1282 
1283 			if (bh2 == bh) {
1284 				__brelse(bh2);
1285 			} else {
1286 				if (out >= BH_LRU_SIZE) {
1287 					BUG_ON(evictee != NULL);
1288 					evictee = bh2;
1289 				} else {
1290 					bhs[out++] = bh2;
1291 				}
1292 			}
1293 		}
1294 		while (out < BH_LRU_SIZE)
1295 			bhs[out++] = NULL;
1296 		memcpy(lru->bhs, bhs, sizeof(bhs));
1297 	}
1298 	bh_lru_unlock();
1299 
1300 	if (evictee)
1301 		__brelse(evictee);
1302 }
1303 
1304 /*
1305  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1306  */
1307 static struct buffer_head *
1308 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1309 {
1310 	struct buffer_head *ret = NULL;
1311 	struct bh_lru *lru;
1312 	unsigned int i;
1313 
1314 	check_irqs_on();
1315 	bh_lru_lock();
1316 	lru = &__get_cpu_var(bh_lrus);
1317 	for (i = 0; i < BH_LRU_SIZE; i++) {
1318 		struct buffer_head *bh = lru->bhs[i];
1319 
1320 		if (bh && bh->b_bdev == bdev &&
1321 				bh->b_blocknr == block && bh->b_size == size) {
1322 			if (i) {
1323 				while (i) {
1324 					lru->bhs[i] = lru->bhs[i - 1];
1325 					i--;
1326 				}
1327 				lru->bhs[0] = bh;
1328 			}
1329 			get_bh(bh);
1330 			ret = bh;
1331 			break;
1332 		}
1333 	}
1334 	bh_lru_unlock();
1335 	return ret;
1336 }
1337 
1338 /*
1339  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1340  * it in the LRU and mark it as accessed.  If it is not present then return
1341  * NULL.
1342  */
1343 struct buffer_head *
1344 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1345 {
1346 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1347 
1348 	if (bh == NULL) {
1349 		bh = __find_get_block_slow(bdev, block);
1350 		if (bh)
1351 			bh_lru_install(bh);
1352 	}
1353 	if (bh)
1354 		touch_buffer(bh);
1355 	return bh;
1356 }
1357 EXPORT_SYMBOL(__find_get_block);
1358 
1359 /*
1360  * __getblk will locate (and, if necessary, create) the buffer_head
1361  * which corresponds to the passed block_device, block and size. The
1362  * returned buffer has its reference count incremented.
1363  *
1364  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1365  * illegal block number, __getblk() will happily return a buffer_head
1366  * which represents the non-existent block.  Very weird.
1367  *
1368  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1369  * attempt is failing.  FIXME, perhaps?
1370  */
1371 struct buffer_head *
1372 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1373 {
1374 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1375 
1376 	might_sleep();
1377 	if (bh == NULL)
1378 		bh = __getblk_slow(bdev, block, size);
1379 	return bh;
1380 }
1381 EXPORT_SYMBOL(__getblk);
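/*
 * Illustrative sketch (not part of the original file): __getblk() is the
 * right call when the block is about to be completely overwritten, so no
 * read from disk is needed.  The function name is hypothetical.
 */
static struct buffer_head *example_overwrite_block(struct block_device *bdev,
						sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* the new contents */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	return bh;				/* caller brelse()s when done */
}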
1382 
1383 /*
1384  * Do async read-ahead on a buffer..
1385  */
1386 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1387 {
1388 	struct buffer_head *bh = __getblk(bdev, block, size);
1389 	if (likely(bh)) {
1390 		ll_rw_block(READA, 1, &bh);
1391 		brelse(bh);
1392 	}
1393 }
1394 EXPORT_SYMBOL(__breadahead);
1395 
1396 /**
1397  *  __bread() - reads a specified block and returns the bh
1398  *  @bdev: the block_device to read from
1399  *  @block: number of block
1400  *  @size: size (in bytes) to read
1401  *
1402  *  Reads a specified block, and returns buffer head that contains it.
1403  *  It returns NULL if the block was unreadable.
1404  */
1405 struct buffer_head *
1406 __bread(struct block_device *bdev, sector_t block, unsigned size)
1407 {
1408 	struct buffer_head *bh = __getblk(bdev, block, size);
1409 
1410 	if (likely(bh) && !buffer_uptodate(bh))
1411 		bh = __bread_slow(bh);
1412 	return bh;
1413 }
1414 EXPORT_SYMBOL(__bread);
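/*
 * Illustrative sketch (not part of the original file): the common
 * read-and-release pattern around __bread().  Most callers go through the
 * sb_bread() wrapper; the function name here is hypothetical.
 */
static int example_read_first_byte(struct block_device *bdev, sector_t block,
				   unsigned size, u8 *out)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)			/* the block was unreadable */
		return -EIO;
	*out = ((u8 *)bh->b_data)[0];
	brelse(bh);			/* drop the reference __getblk() took */
	return 0;
}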
1415 
1416 /*
1417  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1418  * This doesn't race because it runs in each cpu either in irq
1419  * or with preempt disabled.
1420  */
1421 static void invalidate_bh_lru(void *arg)
1422 {
1423 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1424 	int i;
1425 
1426 	for (i = 0; i < BH_LRU_SIZE; i++) {
1427 		brelse(b->bhs[i]);
1428 		b->bhs[i] = NULL;
1429 	}
1430 	put_cpu_var(bh_lrus);
1431 }
1432 
1433 void invalidate_bh_lrus(void)
1434 {
1435 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1436 }
1437 
1438 void set_bh_page(struct buffer_head *bh,
1439 		struct page *page, unsigned long offset)
1440 {
1441 	bh->b_page = page;
1442 	BUG_ON(offset >= PAGE_SIZE);
1443 	if (PageHighMem(page))
1444 		/*
1445 		 * This catches illegal uses and preserves the offset:
1446 		 */
1447 		bh->b_data = (char *)(0 + offset);
1448 	else
1449 		bh->b_data = page_address(page) + offset;
1450 }
1451 EXPORT_SYMBOL(set_bh_page);
1452 
1453 /*
1454  * Called when truncating a buffer on a page completely.
1455  */
1456 static void discard_buffer(struct buffer_head * bh)
1457 {
1458 	lock_buffer(bh);
1459 	clear_buffer_dirty(bh);
1460 	bh->b_bdev = NULL;
1461 	clear_buffer_mapped(bh);
1462 	clear_buffer_req(bh);
1463 	clear_buffer_new(bh);
1464 	clear_buffer_delay(bh);
1465 	clear_buffer_unwritten(bh);
1466 	unlock_buffer(bh);
1467 }
1468 
1469 /**
1470  * block_invalidatepage - invalidate part or all of a buffer-backed page
1471  *
1472  * @page: the page which is affected
1473  * @offset: the index of the truncation point
1474  *
1475  * block_invalidatepage() is called when all or part of the page has become
1476  * invalidated by a truncate operation.
1477  *
1478  * block_invalidatepage() does not have to release all buffers, but it must
1479  * ensure that no dirty buffer is left outside @offset and that no I/O
1480  * is underway against any of the blocks which are outside the truncation
1481  * point, because the caller is about to free (and possibly reuse) those
1482  * blocks on-disk.
1483  */
1484 void block_invalidatepage(struct page *page, unsigned long offset)
1485 {
1486 	struct buffer_head *head, *bh, *next;
1487 	unsigned int curr_off = 0;
1488 
1489 	BUG_ON(!PageLocked(page));
1490 	if (!page_has_buffers(page))
1491 		goto out;
1492 
1493 	head = page_buffers(page);
1494 	bh = head;
1495 	do {
1496 		unsigned int next_off = curr_off + bh->b_size;
1497 		next = bh->b_this_page;
1498 
1499 		/*
1500 		 * is this block fully invalidated?
1501 		 */
1502 		if (offset <= curr_off)
1503 			discard_buffer(bh);
1504 		curr_off = next_off;
1505 		bh = next;
1506 	} while (bh != head);
1507 
1508 	/*
1509 	 * We release buffers only if the entire page is being invalidated.
1510 	 * The get_block cached value has been unconditionally invalidated,
1511 	 * so real IO is not possible anymore.
1512 	 */
1513 	if (offset == 0)
1514 		try_to_release_page(page, 0);
1515 out:
1516 	return;
1517 }
1518 EXPORT_SYMBOL(block_invalidatepage);
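/*
 * Illustrative sketch (not part of the original file): filesystems built on
 * these buffer helpers typically point ->invalidatepage at
 * block_invalidatepage() (or rely on truncation falling back to it when the
 * method is not set).  The aops name is hypothetical.
 */
static const struct address_space_operations example_invalidate_aops = {
	.invalidatepage		= block_invalidatepage,
};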
1519 
1520 /*
1521  * We attach and possibly dirty the buffers atomically wrt
1522  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1523  * is already excluded via the page lock.
1524  */
1525 void create_empty_buffers(struct page *page,
1526 			unsigned long blocksize, unsigned long b_state)
1527 {
1528 	struct buffer_head *bh, *head, *tail;
1529 
1530 	head = alloc_page_buffers(page, blocksize, 1);
1531 	bh = head;
1532 	do {
1533 		bh->b_state |= b_state;
1534 		tail = bh;
1535 		bh = bh->b_this_page;
1536 	} while (bh);
1537 	tail->b_this_page = head;
1538 
1539 	spin_lock(&page->mapping->private_lock);
1540 	if (PageUptodate(page) || PageDirty(page)) {
1541 		bh = head;
1542 		do {
1543 			if (PageDirty(page))
1544 				set_buffer_dirty(bh);
1545 			if (PageUptodate(page))
1546 				set_buffer_uptodate(bh);
1547 			bh = bh->b_this_page;
1548 		} while (bh != head);
1549 	}
1550 	attach_page_buffers(page, head);
1551 	spin_unlock(&page->mapping->private_lock);
1552 }
1553 EXPORT_SYMBOL(create_empty_buffers);
1554 
1555 /*
1556  * We are taking a block for data and we don't want any output from any
1557  * buffer-cache aliases starting from the return from this function and
1558  * until the moment when something explicitly marks the buffer
1559  * dirty (hopefully that will not happen until we free that block ;-)
1560  * We don't even need to mark it not-uptodate - nobody can expect
1561  * anything from a newly allocated buffer anyway. We used to use
1562  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1563  * don't want to mark the alias unmapped, for example - it would confuse
1564  * anyone who might pick it with bread() afterwards...
1565  *
1566  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1567  * be writeout I/O going on against recently-freed buffers.  We don't
1568  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1569  * only if we really need to.  That happens here.
1570  */
1571 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1572 {
1573 	struct buffer_head *old_bh;
1574 
1575 	might_sleep();
1576 
1577 	old_bh = __find_get_block_slow(bdev, block);
1578 	if (old_bh) {
1579 		clear_buffer_dirty(old_bh);
1580 		wait_on_buffer(old_bh);
1581 		clear_buffer_req(old_bh);
1582 		__brelse(old_bh);
1583 	}
1584 }
1585 EXPORT_SYMBOL(unmap_underlying_metadata);
1586 
1587 /*
1588  * NOTE! All mapped/uptodate combinations are valid:
1589  *
1590  *	Mapped	Uptodate	Meaning
1591  *
1592  *	No	No		"unknown" - must do get_block()
1593  *	No	Yes		"hole" - zero-filled
1594  *	Yes	No		"allocated" - allocated on disk, not read in
1595  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1596  *
1597  * "Dirty" is valid only with the last case (mapped+uptodate).
1598  */
1599 
1600 /*
1601  * While block_write_full_page is writing back the dirty buffers under
1602  * the page lock, whoever dirtied the buffers may decide to clean them
1603  * again at any time.  We handle that by only looking at the buffer
1604  * state inside lock_buffer().
1605  *
1606  * If block_write_full_page() is called for regular writeback
1607  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1608  * locked buffer.   This can only happen if someone has written the buffer
1609  * directly, with submit_bh().  At the address_space level PageWriteback
1610  * prevents this contention from occurring.
1611  */
1612 static int __block_write_full_page(struct inode *inode, struct page *page,
1613 			get_block_t *get_block, struct writeback_control *wbc)
1614 {
1615 	int err;
1616 	sector_t block;
1617 	sector_t last_block;
1618 	struct buffer_head *bh, *head;
1619 	const unsigned blocksize = 1 << inode->i_blkbits;
1620 	int nr_underway = 0;
1621 
1622 	BUG_ON(!PageLocked(page));
1623 
1624 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1625 
1626 	if (!page_has_buffers(page)) {
1627 		create_empty_buffers(page, blocksize,
1628 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1629 	}
1630 
1631 	/*
1632 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1633 	 * here, and the (potentially unmapped) buffers may become dirty at
1634 	 * any time.  If a buffer becomes dirty here after we've inspected it
1635 	 * then we just miss that fact, and the page stays dirty.
1636 	 *
1637 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1638 	 * handle that here by just cleaning them.
1639 	 */
1640 
1641 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1642 	head = page_buffers(page);
1643 	bh = head;
1644 
1645 	/*
1646 	 * Get all the dirty buffers mapped to disk addresses and
1647 	 * handle any aliases from the underlying blockdev's mapping.
1648 	 */
1649 	do {
1650 		if (block > last_block) {
1651 			/*
1652 			 * mapped buffers outside i_size will occur, because
1653 			 * this page can be outside i_size when there is a
1654 			 * truncate in progress.
1655 			 */
1656 			/*
1657 			 * The buffer was zeroed by block_write_full_page()
1658 			 */
1659 			clear_buffer_dirty(bh);
1660 			set_buffer_uptodate(bh);
1661 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1662 			WARN_ON(bh->b_size != blocksize);
1663 			err = get_block(inode, block, bh, 1);
1664 			if (err)
1665 				goto recover;
1666 			if (buffer_new(bh)) {
1667 				/* blockdev mappings never come here */
1668 				clear_buffer_new(bh);
1669 				unmap_underlying_metadata(bh->b_bdev,
1670 							bh->b_blocknr);
1671 			}
1672 		}
1673 		bh = bh->b_this_page;
1674 		block++;
1675 	} while (bh != head);
1676 
1677 	do {
1678 		if (!buffer_mapped(bh))
1679 			continue;
1680 		/*
1681 		 * If it's a fully non-blocking write attempt and we cannot
1682 		 * lock the buffer then redirty the page.  Note that this can
1683 		 * potentially cause a busy-wait loop from pdflush and kswapd
1684 		 * activity, but those code paths have their own higher-level
1685 		 * throttling.
1686 		 */
1687 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1688 			lock_buffer(bh);
1689 		} else if (test_set_buffer_locked(bh)) {
1690 			redirty_page_for_writepage(wbc, page);
1691 			continue;
1692 		}
1693 		if (test_clear_buffer_dirty(bh)) {
1694 			mark_buffer_async_write(bh);
1695 		} else {
1696 			unlock_buffer(bh);
1697 		}
1698 	} while ((bh = bh->b_this_page) != head);
1699 
1700 	/*
1701 	 * The page and its buffers are protected by PageWriteback(), so we can
1702 	 * drop the bh refcounts early.
1703 	 */
1704 	BUG_ON(PageWriteback(page));
1705 	set_page_writeback(page);
1706 
1707 	do {
1708 		struct buffer_head *next = bh->b_this_page;
1709 		if (buffer_async_write(bh)) {
1710 			submit_bh(WRITE, bh);
1711 			nr_underway++;
1712 		}
1713 		bh = next;
1714 	} while (bh != head);
1715 	unlock_page(page);
1716 
1717 	err = 0;
1718 done:
1719 	if (nr_underway == 0) {
1720 		/*
1721 		 * The page was marked dirty, but the buffers were
1722 		 * clean.  Someone wrote them back by hand with
1723 		 * ll_rw_block/submit_bh.  A rare case.
1724 		 */
1725 		end_page_writeback(page);
1726 
1727 		/*
1728 		 * The page and buffer_heads can be released at any time from
1729 		 * here on.
1730 		 */
1731 		wbc->pages_skipped++;	/* We didn't write this page */
1732 	}
1733 	return err;
1734 
1735 recover:
1736 	/*
1737 	 * ENOSPC, or some other error.  We may already have added some
1738 	 * blocks to the file, so we need to write these out to avoid
1739 	 * exposing stale data.
1740 	 * The page is currently locked and not marked for writeback
1741 	 */
1742 	bh = head;
1743 	/* Recovery: lock and submit the mapped buffers */
1744 	do {
1745 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
1746 			lock_buffer(bh);
1747 			mark_buffer_async_write(bh);
1748 		} else {
1749 			/*
1750 			 * The buffer may have been set dirty during
1751 			 * attachment to a dirty page.
1752 			 */
1753 			clear_buffer_dirty(bh);
1754 		}
1755 	} while ((bh = bh->b_this_page) != head);
1756 	SetPageError(page);
1757 	BUG_ON(PageWriteback(page));
1758 	mapping_set_error(page->mapping, err);
1759 	set_page_writeback(page);
1760 	do {
1761 		struct buffer_head *next = bh->b_this_page;
1762 		if (buffer_async_write(bh)) {
1763 			clear_buffer_dirty(bh);
1764 			submit_bh(WRITE, bh);
1765 			nr_underway++;
1766 		}
1767 		bh = next;
1768 	} while (bh != head);
1769 	unlock_page(page);
1770 	goto done;
1771 }
1772 
1773 static int __block_prepare_write(struct inode *inode, struct page *page,
1774 		unsigned from, unsigned to, get_block_t *get_block)
1775 {
1776 	unsigned block_start, block_end;
1777 	sector_t block;
1778 	int err = 0;
1779 	unsigned blocksize, bbits;
1780 	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1781 
1782 	BUG_ON(!PageLocked(page));
1783 	BUG_ON(from > PAGE_CACHE_SIZE);
1784 	BUG_ON(to > PAGE_CACHE_SIZE);
1785 	BUG_ON(from > to);
1786 
1787 	blocksize = 1 << inode->i_blkbits;
1788 	if (!page_has_buffers(page))
1789 		create_empty_buffers(page, blocksize, 0);
1790 	head = page_buffers(page);
1791 
1792 	bbits = inode->i_blkbits;
1793 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1794 
1795 	for (bh = head, block_start = 0; bh != head || !block_start;
1796 	    block++, block_start = block_end, bh = bh->b_this_page) {
1797 		block_end = block_start + blocksize;
1798 		if (block_end <= from || block_start >= to) {
1799 			if (PageUptodate(page)) {
1800 				if (!buffer_uptodate(bh))
1801 					set_buffer_uptodate(bh);
1802 			}
1803 			continue;
1804 		}
1805 		if (buffer_new(bh))
1806 			clear_buffer_new(bh);
1807 		if (!buffer_mapped(bh)) {
1808 			WARN_ON(bh->b_size != blocksize);
1809 			err = get_block(inode, block, bh, 1);
1810 			if (err)
1811 				break;
1812 			if (buffer_new(bh)) {
1813 				unmap_underlying_metadata(bh->b_bdev,
1814 							bh->b_blocknr);
1815 				if (PageUptodate(page)) {
1816 					clear_buffer_new(bh);
1817 					set_buffer_uptodate(bh);
1818 					mark_buffer_dirty(bh);
1819 					continue;
1820 				}
1821 				if (block_end > to || block_start < from) {
1822 					void *kaddr;
1823 
1824 					kaddr = kmap_atomic(page, KM_USER0);
1825 					if (block_end > to)
1826 						memset(kaddr+to, 0,
1827 							block_end-to);
1828 					if (block_start < from)
1829 						memset(kaddr+block_start,
1830 							0, from-block_start);
1831 					flush_dcache_page(page);
1832 					kunmap_atomic(kaddr, KM_USER0);
1833 				}
1834 				continue;
1835 			}
1836 		}
1837 		if (PageUptodate(page)) {
1838 			if (!buffer_uptodate(bh))
1839 				set_buffer_uptodate(bh);
1840 			continue;
1841 		}
1842 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1843 		    !buffer_unwritten(bh) &&
1844 		     (block_start < from || block_end > to)) {
1845 			ll_rw_block(READ, 1, &bh);
1846 			*wait_bh++ = bh;
1847 		}
1848 	}
1849 	/*
1850 	 * If we issued read requests - let them complete.
1851 	 */
1852 	while (wait_bh > wait) {
1853 		wait_on_buffer(*--wait_bh);
1854 		if (!buffer_uptodate(*wait_bh))
1855 			err = -EIO;
1856 	}
1857 	if (!err) {
1858 		bh = head;
1859 		do {
1860 			if (buffer_new(bh))
1861 				clear_buffer_new(bh);
1862 		} while ((bh = bh->b_this_page) != head);
1863 		return 0;
1864 	}
1865 	/* Error case: */
1866 	/*
1867 	 * Zero out any newly allocated blocks to avoid exposing stale
1868 	 * data.  If BH_New is set, we know that the block was newly
1869 	 * allocated in the above loop.
1870 	 */
1871 	bh = head;
1872 	block_start = 0;
1873 	do {
1874 		block_end = block_start+blocksize;
1875 		if (block_end <= from)
1876 			goto next_bh;
1877 		if (block_start >= to)
1878 			break;
1879 		if (buffer_new(bh)) {
1880 			clear_buffer_new(bh);
1881 			zero_user_page(page, block_start, bh->b_size, KM_USER0);
1882 			set_buffer_uptodate(bh);
1883 			mark_buffer_dirty(bh);
1884 		}
1885 next_bh:
1886 		block_start = block_end;
1887 		bh = bh->b_this_page;
1888 	} while (bh != head);
1889 	return err;
1890 }
1891 
1892 static int __block_commit_write(struct inode *inode, struct page *page,
1893 		unsigned from, unsigned to)
1894 {
1895 	unsigned block_start, block_end;
1896 	int partial = 0;
1897 	unsigned blocksize;
1898 	struct buffer_head *bh, *head;
1899 
1900 	blocksize = 1 << inode->i_blkbits;
1901 
1902 	for (bh = head = page_buffers(page), block_start = 0;
1903 	    bh != head || !block_start;
1904 	    block_start = block_end, bh = bh->b_this_page) {
1905 		block_end = block_start + blocksize;
1906 		if (block_end <= from || block_start >= to) {
1907 			if (!buffer_uptodate(bh))
1908 				partial = 1;
1909 		} else {
1910 			set_buffer_uptodate(bh);
1911 			mark_buffer_dirty(bh);
1912 		}
1913 	}
1914 
1915 	/*
1916 	 * If this is a partial write which happened to make all buffers
1917 	 * uptodate then we can optimize away a bogus readpage() for
1918 	 * the next read(). Here we 'discover' whether the page went
1919 	 * uptodate as a result of this (potentially partial) write.
1920 	 */
1921 	if (!partial)
1922 		SetPageUptodate(page);
1923 	return 0;
1924 }
1925 
1926 /*
1927  * Generic "read page" function for block devices that have the normal
1928  * get_block functionality. This is most of the block device filesystems.
1929  * Reads the page asynchronously --- the unlock_buffer() and
1930  * set/clear_buffer_uptodate() functions propagate buffer state into the
1931  * page struct once IO has completed.
1932  */
1933 int block_read_full_page(struct page *page, get_block_t *get_block)
1934 {
1935 	struct inode *inode = page->mapping->host;
1936 	sector_t iblock, lblock;
1937 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1938 	unsigned int blocksize;
1939 	int nr, i;
1940 	int fully_mapped = 1;
1941 
1942 	BUG_ON(!PageLocked(page));
1943 	blocksize = 1 << inode->i_blkbits;
1944 	if (!page_has_buffers(page))
1945 		create_empty_buffers(page, blocksize, 0);
1946 	head = page_buffers(page);
1947 
1948 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1949 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1950 	bh = head;
1951 	nr = 0;
1952 	i = 0;
1953 
1954 	do {
1955 		if (buffer_uptodate(bh))
1956 			continue;
1957 
1958 		if (!buffer_mapped(bh)) {
1959 			int err = 0;
1960 
1961 			fully_mapped = 0;
1962 			if (iblock < lblock) {
1963 				WARN_ON(bh->b_size != blocksize);
1964 				err = get_block(inode, iblock, bh, 0);
1965 				if (err)
1966 					SetPageError(page);
1967 			}
1968 			if (!buffer_mapped(bh)) {
1969 				zero_user_page(page, i * blocksize, blocksize,
1970 						KM_USER0);
1971 				if (!err)
1972 					set_buffer_uptodate(bh);
1973 				continue;
1974 			}
1975 			/*
1976 			 * get_block() might have updated the buffer
1977 			 * synchronously
1978 			 */
1979 			if (buffer_uptodate(bh))
1980 				continue;
1981 		}
1982 		arr[nr++] = bh;
1983 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
1984 
1985 	if (fully_mapped)
1986 		SetPageMappedToDisk(page);
1987 
1988 	if (!nr) {
1989 		/*
1990 		 * All buffers are uptodate - we can set the page uptodate
1991 		 * as well. But not if get_block() returned an error.
1992 		 */
1993 		if (!PageError(page))
1994 			SetPageUptodate(page);
1995 		unlock_page(page);
1996 		return 0;
1997 	}
1998 
1999 	/* Stage two: lock the buffers */
2000 	for (i = 0; i < nr; i++) {
2001 		bh = arr[i];
2002 		lock_buffer(bh);
2003 		mark_buffer_async_read(bh);
2004 	}
2005 
2006 	/*
2007 	 * Stage 3: start the IO.  Check for uptodateness
2008 	 * inside the buffer lock in case another process reading
2009 	 * the underlying blockdev brought it uptodate (the sct fix).
2010 	 */
2011 	for (i = 0; i < nr; i++) {
2012 		bh = arr[i];
2013 		if (buffer_uptodate(bh))
2014 			end_buffer_async_read(bh, 1);
2015 		else
2016 			submit_bh(READ, bh);
2017 	}
2018 	return 0;
2019 }
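
/*
 * A minimal sketch of how a filesystem might wire block_read_full_page()
 * into its ->readpage method.  myfs_readpage() and myfs_get_block() are
 * hypothetical; myfs_get_block() stands for whatever get_block_t the
 * filesystem implements elsewhere.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}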
2020 
2021 /* utility function for filesystems that need to do work on expanding
2022  * truncates.  Uses prepare/commit_write to allow the filesystem to
2023  * deal with the hole.
2024  */
2025 static int __generic_cont_expand(struct inode *inode, loff_t size,
2026 				 pgoff_t index, unsigned int offset)
2027 {
2028 	struct address_space *mapping = inode->i_mapping;
2029 	struct page *page;
2030 	unsigned long limit;
2031 	int err;
2032 
2033 	err = -EFBIG;
2034 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2035 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2036 		send_sig(SIGXFSZ, current, 0);
2037 		goto out;
2038 	}
2039 	if (size > inode->i_sb->s_maxbytes)
2040 		goto out;
2041 
2042 	err = -ENOMEM;
2043 	page = grab_cache_page(mapping, index);
2044 	if (!page)
2045 		goto out;
2046 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2047 	if (err) {
2048 		/*
2049 		 * ->prepare_write() may have instantiated a few blocks
2050 		 * outside i_size.  Trim these off again.
2051 		 */
2052 		unlock_page(page);
2053 		page_cache_release(page);
2054 		vmtruncate(inode, inode->i_size);
2055 		goto out;
2056 	}
2057 
2058 	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2059 
2060 	unlock_page(page);
2061 	page_cache_release(page);
2062 	if (err > 0)
2063 		err = 0;
2064 out:
2065 	return err;
2066 }
2067 
2068 int generic_cont_expand(struct inode *inode, loff_t size)
2069 {
2070 	pgoff_t index;
2071 	unsigned int offset;
2072 
2073 	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2074 
2075 	/* ugh.  in prepare/commit_write, if from==to==start of block, we
2076 	 * skip the prepare.  make sure we never send an offset for the start
2077 	 * of a block.
2078 	 */
2079 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2080 		/* caller must handle this extra byte. */
2081 		offset++;
2082 	}
2083 	index = size >> PAGE_CACHE_SHIFT;
2084 
2085 	return __generic_cont_expand(inode, size, index, offset);
2086 }
2087 
2088 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2089 {
2090 	loff_t pos = size - 1;
2091 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2092 	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2093 
2094 	/* prepare/commit_write can handle even if from==to==start of block. */
2095 	return __generic_cont_expand(inode, size, index, offset);
2096 }
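
/*
 * A minimal sketch of extending a file with generic_cont_expand_simple():
 * the extension goes through the filesystem's ->prepare_write/->commit_write
 * so i_size and the on-disk state stay consistent.  myfs_extend_file() is a
 * hypothetical helper that a filesystem might call from its ->setattr path
 * when ia_size grows.
 */
static int myfs_extend_file(struct inode *inode, loff_t new_size)
{
	/* new_size > i_size is assumed to have been checked by the caller */
	return generic_cont_expand_simple(inode, new_size);
}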
2097 
2098 /*
2099  * For moronic filesystems that do not allow holes in files.
2100  * We may have to extend the file.
2101  */
2102 
2103 int cont_prepare_write(struct page *page, unsigned offset,
2104 		unsigned to, get_block_t *get_block, loff_t *bytes)
2105 {
2106 	struct address_space *mapping = page->mapping;
2107 	struct inode *inode = mapping->host;
2108 	struct page *new_page;
2109 	pgoff_t pgpos;
2110 	long status;
2111 	unsigned zerofrom;
2112 	unsigned blocksize = 1 << inode->i_blkbits;
2113 
2114 	while (page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2115 		status = -ENOMEM;
2116 		new_page = grab_cache_page(mapping, pgpos);
2117 		if (!new_page)
2118 			goto out;
2119 		/* we might sleep */
2120 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2121 			unlock_page(new_page);
2122 			page_cache_release(new_page);
2123 			continue;
2124 		}
2125 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2126 		if (zerofrom & (blocksize-1)) {
2127 			*bytes |= (blocksize-1);
2128 			(*bytes)++;
2129 		}
2130 		status = __block_prepare_write(inode, new_page, zerofrom,
2131 						PAGE_CACHE_SIZE, get_block);
2132 		if (status)
2133 			goto out_unmap;
2134 		zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
2135 				KM_USER0);
2136 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2137 		unlock_page(new_page);
2138 		page_cache_release(new_page);
2139 	}
2140 
2141 	if (page->index < pgpos) {
2142 		/* completely inside the area */
2143 		zerofrom = offset;
2144 	} else {
2145 		/* page covers the boundary, find the boundary offset */
2146 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2147 
2148 		/* if we will expand the file, the last block will be filled */
2149 		if (to > zerofrom && (zerofrom & (blocksize-1))) {
2150 			*bytes |= (blocksize-1);
2151 			(*bytes)++;
2152 		}
2153 
2154 		/* starting below the boundary? Nothing to zero out */
2155 		if (offset <= zerofrom)
2156 			zerofrom = offset;
2157 	}
2158 	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2159 	if (status)
2160 		goto out1;
2161 	if (zerofrom < offset) {
2162 		zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
2163 		__block_commit_write(inode, page, zerofrom, offset);
2164 	}
2165 	return 0;
2166 out1:
2167 	ClearPageUptodate(page);
2168 	return status;
2169 
2170 out_unmap:
2171 	ClearPageUptodate(new_page);
2172 	unlock_page(new_page);
2173 	page_cache_release(new_page);
2174 out:
2175 	return status;
2176 }
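
/*
 * A minimal sketch of a ->prepare_write method for a filesystem that cannot
 * leave holes, built on cont_prepare_write().  MYFS_I(), its mmu_private
 * field and myfs_get_block() are hypothetical: the loff_t passed by address
 * is whatever field the filesystem uses to remember how far the file has
 * been zeroed/allocated on disk.
 */
static int myfs_cont_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return cont_prepare_write(page, from, to, myfs_get_block,
				  &MYFS_I(page->mapping->host)->mmu_private);
}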
2177 
2178 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2179 			get_block_t *get_block)
2180 {
2181 	struct inode *inode = page->mapping->host;
2182 	int err = __block_prepare_write(inode, page, from, to, get_block);
2183 	if (err)
2184 		ClearPageUptodate(page);
2185 	return err;
2186 }
2187 
2188 int block_commit_write(struct page *page, unsigned from, unsigned to)
2189 {
2190 	struct inode *inode = page->mapping->host;
2191 	__block_commit_write(inode, page, from, to);
2192 	return 0;
2193 }
2194 
2195 int generic_commit_write(struct file *file, struct page *page,
2196 		unsigned from, unsigned to)
2197 {
2198 	struct inode *inode = page->mapping->host;
2199 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2200 	__block_commit_write(inode, page, from, to);
2201 	/*
2202 	 * No need to use i_size_read() here, the i_size
2203 	 * cannot change under us because we hold i_mutex.
2204 	 */
2205 	if (pos > inode->i_size) {
2206 		i_size_write(inode, pos);
2207 		mark_inode_dirty(inode);
2208 	}
2209 	return 0;
2210 }
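
/*
 * A minimal sketch of an address_space_operations table assembled from the
 * generic helpers in this file, in the style of a simple buffer-backed
 * filesystem.  The myfs_* names are hypothetical one-line wrappers of the
 * kind sketched alongside block_read_full_page(), block_write_full_page()
 * and generic_block_bmap() in this file; generic_commit_write() and
 * block_sync_page() can be plugged in directly.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc);
static sector_t myfs_bmap(struct address_space *mapping, sector_t block);

static int myfs_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= myfs_prepare_write,
	.commit_write	= generic_commit_write,
	.bmap		= myfs_bmap,
};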
2211 
2212 /*
2213  * block_page_mkwrite() is not allowed to change the file size as it gets
2214  * called from a page fault handler when a page is first dirtied. Hence we must
2215  * be careful to check for EOF conditions here. We set the page up correctly
2216  * for a written page which means we get ENOSPC checking when writing into
2217  * holes and correct delalloc and unwritten extent mapping on filesystems that
2218  * support these features.
2219  *
2220  * We are not allowed to take the i_mutex here so we have to play games to
2221  * protect against truncate races as the page could now be beyond EOF.  Because
2222  * vmtruncate() writes the inode size before removing pages, once we have the
2223  * page lock we can determine safely if the page is beyond EOF. If it is not
2224  * beyond EOF, then the page is guaranteed safe against truncation until we
2225  * unlock the page.
2226  */
2227 int
2228 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2229 		   get_block_t get_block)
2230 {
2231 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2232 	unsigned long end;
2233 	loff_t size;
2234 	int ret = -EINVAL;
2235 
2236 	lock_page(page);
2237 	size = i_size_read(inode);
2238 	if ((page->mapping != inode->i_mapping) ||
2239 	    (page_offset(page) > size)) {
2240 		/* page got truncated out from underneath us */
2241 		goto out_unlock;
2242 	}
2243 
2244 	/* page is wholly or partially inside EOF */
2245 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2246 		end = size & ~PAGE_CACHE_MASK;
2247 	else
2248 		end = PAGE_CACHE_SIZE;
2249 
2250 	ret = block_prepare_write(page, 0, end, get_block);
2251 	if (!ret)
2252 		ret = block_commit_write(page, 0, end);
2253 
2254 out_unlock:
2255 	unlock_page(page);
2256 	return ret;
2257 }
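
/*
 * A minimal sketch of block_page_mkwrite() used from a filesystem's
 * vm_operations.  myfs_page_mkwrite(), myfs_file_vm_ops and myfs_get_block()
 * are hypothetical; filemap_fault() is the stock fault handler for
 * page-cache backed mappings.
 */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return block_page_mkwrite(vma, page, myfs_get_block);
}

static struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};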
2258 
2259 /*
2260  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2261  * immediately, while under the page lock.  So it needs a special end_io
2262  * handler which does not touch the bh after unlocking it.
2263  */
2264 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2265 {
2266 	__end_buffer_read_notouch(bh, uptodate);
2267 }
2268 
2269 /*
2270  * On entry, the page is fully not uptodate.
2271  * On exit, the page is fully uptodate in the areas outside (from, to).
2272  */
2273 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2274 			get_block_t *get_block)
2275 {
2276 	struct inode *inode = page->mapping->host;
2277 	const unsigned blkbits = inode->i_blkbits;
2278 	const unsigned blocksize = 1 << blkbits;
2279 	struct buffer_head *head, *bh;
2280 	unsigned block_in_page;
2281 	unsigned block_start, block_end;
2282 	sector_t block_in_file;
2283 	char *kaddr;
2284 	int nr_reads = 0;
2285 	int ret = 0;
2286 	int is_mapped_to_disk = 1;
2287 
2288 	if (page_has_buffers(page))
2289 		return block_prepare_write(page, from, to, get_block);
2290 
2291 	if (PageMappedToDisk(page))
2292 		return 0;
2293 
2294 	/*
2295 	 * Allocate buffers so that we can keep track of state, and potentially
2296 	 * attach them to the page if an error occurs. In the common case of
2297 	 * no error, they will just be freed again without ever being attached
2298 	 * to the page (which is all OK, because we're under the page lock).
2299 	 *
2300 	 * Be careful: the buffer linked list is a NULL terminated one, rather
2301 	 * than the circular one we're used to.
2302 	 */
2303 	head = alloc_page_buffers(page, blocksize, 0);
2304 	if (!head)
2305 		return -ENOMEM;
2306 
2307 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2308 
2309 	/*
2310 	 * We loop across all blocks in the page, whether or not they are
2311 	 * part of the affected region.  This is so we can discover if the
2312 	 * page is fully mapped-to-disk.
2313 	 */
2314 	for (block_start = 0, block_in_page = 0, bh = head;
2315 		  block_start < PAGE_CACHE_SIZE;
2316 		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2317 		int create;
2318 
2319 		block_end = block_start + blocksize;
2320 		bh->b_state = 0;
2321 		create = 1;
2322 		if (block_start >= to)
2323 			create = 0;
2324 		ret = get_block(inode, block_in_file + block_in_page,
2325 					bh, create);
2326 		if (ret)
2327 			goto failed;
2328 		if (!buffer_mapped(bh))
2329 			is_mapped_to_disk = 0;
2330 		if (buffer_new(bh))
2331 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2332 		if (PageUptodate(page)) {
2333 			set_buffer_uptodate(bh);
2334 			continue;
2335 		}
2336 		if (buffer_new(bh) || !buffer_mapped(bh)) {
2337 			kaddr = kmap_atomic(page, KM_USER0);
2338 			if (block_start < from)
2339 				memset(kaddr+block_start, 0, from-block_start);
2340 			if (block_end > to)
2341 				memset(kaddr + to, 0, block_end - to);
2342 			flush_dcache_page(page);
2343 			kunmap_atomic(kaddr, KM_USER0);
2344 			continue;
2345 		}
2346 		if (buffer_uptodate(bh))
2347 			continue;	/* reiserfs does this */
2348 		if (block_start < from || block_end > to) {
2349 			lock_buffer(bh);
2350 			bh->b_end_io = end_buffer_read_nobh;
2351 			submit_bh(READ, bh);
2352 			nr_reads++;
2353 		}
2354 	}
2355 
2356 	if (nr_reads) {
2357 		/*
2358 		 * The page is locked, so these buffers are protected from
2359 		 * any VM or truncate activity.  Hence we don't need to care
2360 		 * for the buffer_head refcounts.
2361 		 */
2362 		for (bh = head; bh; bh = bh->b_this_page) {
2363 			wait_on_buffer(bh);
2364 			if (!buffer_uptodate(bh))
2365 				ret = -EIO;
2366 		}
2367 		if (ret)
2368 			goto failed;
2369 	}
2370 
2371 	if (is_mapped_to_disk)
2372 		SetPageMappedToDisk(page);
2373 
2374 	do {
2375 		bh = head;
2376 		head = head->b_this_page;
2377 		free_buffer_head(bh);
2378 	} while (head);
2379 
2380 	return 0;
2381 
2382 failed:
2383 	/*
2384 	 * Error recovery is a bit difficult. We need to zero out blocks that
2385 	 * were newly allocated, and dirty them to ensure they get written out.
2386 	 * Buffers need to be attached to the page at this point, otherwise
2387 	 * the handling of potential IO errors during writeout would be hard
2388 	 * (could try doing synchronous writeout, but what if that fails too?)
2389 	 */
2390 	spin_lock(&page->mapping->private_lock);
2391 	bh = head;
2392 	block_start = 0;
2393 	do {
2394 		if (PageUptodate(page))
2395 			set_buffer_uptodate(bh);
2396 		if (PageDirty(page))
2397 			set_buffer_dirty(bh);
2398 
2399 		block_end = block_start+blocksize;
2400 		if (block_end <= from)
2401 			goto next;
2402 		if (block_start >= to)
2403 			goto next;
2404 
2405 		if (buffer_new(bh)) {
2406 			clear_buffer_new(bh);
2407 			if (!buffer_uptodate(bh)) {
2408 				zero_user_page(page, block_start, bh->b_size, KM_USER0);
2409 				set_buffer_uptodate(bh);
2410 			}
2411 			mark_buffer_dirty(bh);
2412 		}
2413 next:
2414 		block_start = block_end;
2415 		if (!bh->b_this_page)
2416 			bh->b_this_page = head;
2417 		bh = bh->b_this_page;
2418 	} while (bh != head);
2419 	attach_page_buffers(page, head);
2420 	spin_unlock(&page->mapping->private_lock);
2421 
2422 	return ret;
2423 }
2424 EXPORT_SYMBOL(nobh_prepare_write);
2425 
2426 /*
2427  * Make sure any changes to nobh_commit_write() are reflected in
2428  * nobh_truncate_page(), since it doesn't call commit_write().
2429  */
2430 int nobh_commit_write(struct file *file, struct page *page,
2431 		unsigned from, unsigned to)
2432 {
2433 	struct inode *inode = page->mapping->host;
2434 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2435 
2436 	if (page_has_buffers(page))
2437 		return generic_commit_write(file, page, from, to);
2438 
2439 	SetPageUptodate(page);
2440 	set_page_dirty(page);
2441 	if (pos > inode->i_size) {
2442 		i_size_write(inode, pos);
2443 		mark_inode_dirty(inode);
2444 	}
2445 	return 0;
2446 }
2447 EXPORT_SYMBOL(nobh_commit_write);
2448 
2449 /*
2450  * nobh_writepage() - based on block_write_full_page() except
2451  * that it tries to operate without attaching bufferheads to
2452  * the page.
2453  */
2454 int nobh_writepage(struct page *page, get_block_t *get_block,
2455 			struct writeback_control *wbc)
2456 {
2457 	struct inode * const inode = page->mapping->host;
2458 	loff_t i_size = i_size_read(inode);
2459 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2460 	unsigned offset;
2461 	int ret;
2462 
2463 	/* Is the page fully inside i_size? */
2464 	if (page->index < end_index)
2465 		goto out;
2466 
2467 	/* Is the page fully outside i_size? (truncate in progress) */
2468 	offset = i_size & (PAGE_CACHE_SIZE-1);
2469 	if (page->index >= end_index+1 || !offset) {
2470 		/*
2471 		 * The page may have dirty, unmapped buffers.  For example,
2472 		 * they may have been added in ext3_writepage().  Make them
2473 		 * freeable here, so the page does not leak.
2474 		 */
2475 #if 0
2476 		/* Not really sure about this - do we need this? */
2477 		if (page->mapping->a_ops->invalidatepage)
2478 			page->mapping->a_ops->invalidatepage(page, offset);
2479 #endif
2480 		unlock_page(page);
2481 		return 0; /* don't care */
2482 	}
2483 
2484 	/*
2485 	 * The page straddles i_size.  It must be zeroed out on each and every
2486 	 * writepage invocation because it may be mmapped.  "A file is mapped
2487 	 * in multiples of the page size.  For a file that is not a multiple of
2488 	 * the  page size, the remaining memory is zeroed when mapped, and
2489 	 * the page size, the remaining memory is zeroed when mapped, and
2490 	 */
2491 	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2492 out:
2493 	ret = mpage_writepage(page, get_block, wbc);
2494 	if (ret == -EAGAIN)
2495 		ret = __block_write_full_page(inode, page, get_block, wbc);
2496 	return ret;
2497 }
2498 EXPORT_SYMBOL(nobh_writepage);
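
/*
 * A minimal sketch of the nobh variants in use: a filesystem (perhaps under
 * a hypothetical "nobh" mount option) can point ->prepare_write,
 * ->commit_write and ->writepage at nobh_prepare_write(),
 * nobh_commit_write() and nobh_writepage() so that buffer_heads are only
 * attached to pages when strictly necessary.  myfs_get_block() and the
 * myfs_nobh_* wrappers are hypothetical.
 */
static int myfs_nobh_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, myfs_get_block);
}

static int myfs_nobh_writepage(struct page *page,
			       struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}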
2499 
2500 /*
2501  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2502  */
2503 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2504 {
2505 	struct inode *inode = mapping->host;
2506 	unsigned blocksize = 1 << inode->i_blkbits;
2507 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2508 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2509 	unsigned to;
2510 	struct page *page;
2511 	const struct address_space_operations *a_ops = mapping->a_ops;
2512 	int ret = 0;
2513 
2514 	if ((offset & (blocksize - 1)) == 0)
2515 		goto out;
2516 
2517 	ret = -ENOMEM;
2518 	page = grab_cache_page(mapping, index);
2519 	if (!page)
2520 		goto out;
2521 
2522 	to = (offset + blocksize) & ~(blocksize - 1);
2523 	ret = a_ops->prepare_write(NULL, page, offset, to);
2524 	if (ret == 0) {
2525 		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
2526 				KM_USER0);
2527 		/*
2528 		 * It would be more correct to call aops->commit_write()
2529 		 * here, but this is more efficient.
2530 		 */
2531 		SetPageUptodate(page);
2532 		set_page_dirty(page);
2533 	}
2534 	unlock_page(page);
2535 	page_cache_release(page);
2536 out:
2537 	return ret;
2538 }
2539 EXPORT_SYMBOL(nobh_truncate_page);
2540 
2541 int block_truncate_page(struct address_space *mapping,
2542 			loff_t from, get_block_t *get_block)
2543 {
2544 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2545 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2546 	unsigned blocksize;
2547 	sector_t iblock;
2548 	unsigned length, pos;
2549 	struct inode *inode = mapping->host;
2550 	struct page *page;
2551 	struct buffer_head *bh;
2552 	int err;
2553 
2554 	blocksize = 1 << inode->i_blkbits;
2555 	length = offset & (blocksize - 1);
2556 
2557 	/* Block boundary? Nothing to do */
2558 	if (!length)
2559 		return 0;
2560 
2561 	length = blocksize - length;
2562 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2563 
2564 	page = grab_cache_page(mapping, index);
2565 	err = -ENOMEM;
2566 	if (!page)
2567 		goto out;
2568 
2569 	if (!page_has_buffers(page))
2570 		create_empty_buffers(page, blocksize, 0);
2571 
2572 	/* Find the buffer that contains "offset" */
2573 	bh = page_buffers(page);
2574 	pos = blocksize;
2575 	while (offset >= pos) {
2576 		bh = bh->b_this_page;
2577 		iblock++;
2578 		pos += blocksize;
2579 	}
2580 
2581 	err = 0;
2582 	if (!buffer_mapped(bh)) {
2583 		WARN_ON(bh->b_size != blocksize);
2584 		err = get_block(inode, iblock, bh, 0);
2585 		if (err)
2586 			goto unlock;
2587 		/* unmapped? It's a hole - nothing to do */
2588 		if (!buffer_mapped(bh))
2589 			goto unlock;
2590 	}
2591 
2592 	/* Ok, it's mapped. Make sure it's up-to-date */
2593 	if (PageUptodate(page))
2594 		set_buffer_uptodate(bh);
2595 
2596 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2597 		err = -EIO;
2598 		ll_rw_block(READ, 1, &bh);
2599 		wait_on_buffer(bh);
2600 		/* Uhhuh. Read error. Complain and punt. */
2601 		if (!buffer_uptodate(bh))
2602 			goto unlock;
2603 	}
2604 
2605 	zero_user_page(page, offset, length, KM_USER0);
2606 	mark_buffer_dirty(bh);
2607 	err = 0;
2608 
2609 unlock:
2610 	unlock_page(page);
2611 	page_cache_release(page);
2612 out:
2613 	return err;
2614 }
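
/*
 * A minimal sketch of block_truncate_page() in a filesystem's truncate path:
 * when i_size lands inside a block, the tail of that block is zeroed so no
 * stale data is exposed if the file is later extended.  myfs_truncate() and
 * myfs_get_block() are hypothetical.
 */
static void myfs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
	/* ...freeing of the filesystem's now-unused blocks would follow... */
}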
2615 
2616 /*
2617  * The generic ->writepage function for buffer-backed address_spaces
2618  */
2619 int block_write_full_page(struct page *page, get_block_t *get_block,
2620 			struct writeback_control *wbc)
2621 {
2622 	struct inode * const inode = page->mapping->host;
2623 	loff_t i_size = i_size_read(inode);
2624 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2625 	unsigned offset;
2626 
2627 	/* Is the page fully inside i_size? */
2628 	if (page->index < end_index)
2629 		return __block_write_full_page(inode, page, get_block, wbc);
2630 
2631 	/* Is the page fully outside i_size? (truncate in progress) */
2632 	offset = i_size & (PAGE_CACHE_SIZE-1);
2633 	if (page->index >= end_index+1 || !offset) {
2634 		/*
2635 		 * The page may have dirty, unmapped buffers.  For example,
2636 		 * they may have been added in ext3_writepage().  Make them
2637 		 * freeable here, so the page does not leak.
2638 		 */
2639 		do_invalidatepage(page, 0);
2640 		unlock_page(page);
2641 		return 0; /* don't care */
2642 	}
2643 
2644 	/*
2645 	 * The page straddles i_size.  It must be zeroed out on each and every
2646 	 * writepage invokation because it may be mmapped.  "A file is mapped
2647 	 * writepage invocation because it may be mmapped.  "A file is mapped
2648 	 * in multiples of the page size.  For a file that is not a multiple of
2649 	 * the page size, the remaining memory is zeroed when mapped, and
2650 	 */
2651 	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2652 	return __block_write_full_page(inode, page, get_block, wbc);
2653 }
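
/*
 * A minimal sketch of block_write_full_page() serving as a filesystem's
 * ->writepage method.  myfs_writepage() and myfs_get_block() are
 * hypothetical.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}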
2654 
2655 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2656 			    get_block_t *get_block)
2657 {
2658 	struct buffer_head tmp;
2659 	struct inode *inode = mapping->host;
2660 	tmp.b_state = 0;
2661 	tmp.b_blocknr = 0;
2662 	tmp.b_size = 1 << inode->i_blkbits;
2663 	get_block(inode, block, &tmp, 0);
2664 	return tmp.b_blocknr;
2665 }
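
/*
 * A minimal sketch of generic_block_bmap() backing a filesystem's ->bmap
 * method (used, for example, by the FIBMAP ioctl and by swap files).
 * myfs_bmap() and myfs_get_block() are hypothetical.
 */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}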
2666 
2667 static void end_bio_bh_io_sync(struct bio *bio, int err)
2668 {
2669 	struct buffer_head *bh = bio->bi_private;
2670 
2671 	if (err == -EOPNOTSUPP) {
2672 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2673 		set_bit(BH_Eopnotsupp, &bh->b_state);
2674 	}
2675 
2676 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2677 	bio_put(bio);
2678 }
2679 
2680 int submit_bh(int rw, struct buffer_head * bh)
2681 {
2682 	struct bio *bio;
2683 	int ret = 0;
2684 
2685 	BUG_ON(!buffer_locked(bh));
2686 	BUG_ON(!buffer_mapped(bh));
2687 	BUG_ON(!bh->b_end_io);
2688 
2689 	if (buffer_ordered(bh) && (rw == WRITE))
2690 		rw = WRITE_BARRIER;
2691 
2692 	/*
2693 	 * Only clear out a write error when rewriting, should this
2694 	 * Only clear out a write error when rewriting; should this
2695 	 */
2696 	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2697 		clear_buffer_write_io_error(bh);
2698 
2699 	/*
2700 	 * from here on down, it's all bio -- do the initial mapping,
2701 	 * submit_bio -> generic_make_request may further map this bio around
2702 	 */
2703 	bio = bio_alloc(GFP_NOIO, 1);
2704 
2705 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2706 	bio->bi_bdev = bh->b_bdev;
2707 	bio->bi_io_vec[0].bv_page = bh->b_page;
2708 	bio->bi_io_vec[0].bv_len = bh->b_size;
2709 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2710 
2711 	bio->bi_vcnt = 1;
2712 	bio->bi_idx = 0;
2713 	bio->bi_size = bh->b_size;
2714 
2715 	bio->bi_end_io = end_bio_bh_io_sync;
2716 	bio->bi_private = bh;
2717 
2718 	bio_get(bio);
2719 	submit_bio(rw, bio);
2720 
2721 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2722 		ret = -EOPNOTSUPP;
2723 
2724 	bio_put(bio);
2725 	return ret;
2726 }
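
/*
 * A minimal sketch of driving submit_bh() directly for a synchronous,
 * single-buffer read, using the end_buffer_read_sync() completion handler
 * exported from this file.  myfs_read_bh_sync() is hypothetical.
 */
static int myfs_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);	/* reference is dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}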
2727 
2728 /**
2729  * ll_rw_block: low-level access to block devices (DEPRECATED)
2730  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2731  * @nr: number of &struct buffer_heads in the array
2732  * @bhs: array of pointers to &struct buffer_head
2733  *
2734  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2735  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2736  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2737  * option, %SWRITE, is like %WRITE except that it makes sure the *current*
2738  * data in the buffers is sent to disk.  The fourth %READA option is described
2739  * in the documentation for generic_make_request(), which ll_rw_block() calls.
2740  * This function drops any buffer that it cannot get a lock on (with the
2741  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2742  * clean when doing a write request, and any buffer that appears to be
2743  * up-to-date when doing a read request.  Further, it marks as clean buffers that
2744  * are processed for writing (the buffer cache won't assume that they are
2745  * actually clean until the buffer gets unlocked).
2746  *
2747  * ll_rw_block sets b_end_io to a simple completion handler that marks
2748  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2749  * any waiters.
2750  *
2751  * All of the buffers must be for the same device, and must also be a
2752  * multiple of the current approved size for the device.
2753  */
2754 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2755 {
2756 	int i;
2757 
2758 	for (i = 0; i < nr; i++) {
2759 		struct buffer_head *bh = bhs[i];
2760 
2761 		if (rw == SWRITE)
2762 			lock_buffer(bh);
2763 		else if (test_set_buffer_locked(bh))
2764 			continue;
2765 
2766 		if (rw == WRITE || rw == SWRITE) {
2767 			if (test_clear_buffer_dirty(bh)) {
2768 				bh->b_end_io = end_buffer_write_sync;
2769 				get_bh(bh);
2770 				submit_bh(WRITE, bh);
2771 				continue;
2772 			}
2773 		} else {
2774 			if (!buffer_uptodate(bh)) {
2775 				bh->b_end_io = end_buffer_read_sync;
2776 				get_bh(bh);
2777 				submit_bh(rw, bh);
2778 				continue;
2779 			}
2780 		}
2781 		unlock_buffer(bh);
2782 	}
2783 }
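
/*
 * A minimal sketch of the traditional ll_rw_block() read pattern for a
 * metadata block: get the buffer, start the read if needed, then wait and
 * check the result.  myfs_bread() is hypothetical; sb_bread() already
 * provides essentially this sequence.
 */
static struct buffer_head *myfs_bread(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (!bh)
		return NULL;
	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			brelse(bh);
			return NULL;
		}
	}
	return bh;
}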
2784 
2785 /*
2786  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2787  * and then start new I/O and then wait upon it.  The caller must have a ref on
2788  * the buffer_head.
2789  */
2790 int sync_dirty_buffer(struct buffer_head *bh)
2791 {
2792 	int ret = 0;
2793 
2794 	WARN_ON(atomic_read(&bh->b_count) < 1);
2795 	lock_buffer(bh);
2796 	if (test_clear_buffer_dirty(bh)) {
2797 		get_bh(bh);
2798 		bh->b_end_io = end_buffer_write_sync;
2799 		ret = submit_bh(WRITE, bh);
2800 		wait_on_buffer(bh);
2801 		if (buffer_eopnotsupp(bh)) {
2802 			clear_buffer_eopnotsupp(bh);
2803 			ret = -EOPNOTSUPP;
2804 		}
2805 		if (!ret && !buffer_uptodate(bh))
2806 			ret = -EIO;
2807 	} else {
2808 		unlock_buffer(bh);
2809 	}
2810 	return ret;
2811 }
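
/*
 * A minimal sketch of the common sync_dirty_buffer() pattern: dirty a
 * metadata buffer (here, a hypothetical on-disk superblock held in sbh)
 * and then write it out and wait for completion.
 */
static int myfs_write_super_block(struct buffer_head *sbh)
{
	mark_buffer_dirty(sbh);
	return sync_dirty_buffer(sbh);	/* 0 on success, else -EIO/-EOPNOTSUPP */
}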
2812 
2813 /*
2814  * try_to_free_buffers() checks if all the buffers on this particular page
2815  * are unused, and releases them if so.
2816  *
2817  * Exclusion against try_to_free_buffers may be obtained by either
2818  * locking the page or by holding its mapping's private_lock.
2819  *
2820  * If the page is dirty but all the buffers are clean then we need to
2821  * be sure to mark the page clean as well.  This is because the page
2822  * may be against a block device, and a later reattachment of buffers
2823  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2824  * filesystem data on the same device.
2825  *
2826  * The same applies to regular filesystem pages: if all the buffers are
2827  * clean then we set the page clean and proceed.  To do that, we require
2828  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2829  * private_lock.
2830  *
2831  * try_to_free_buffers() is non-blocking.
2832  */
2833 static inline int buffer_busy(struct buffer_head *bh)
2834 {
2835 	return atomic_read(&bh->b_count) |
2836 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2837 }
2838 
2839 static int
2840 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2841 {
2842 	struct buffer_head *head = page_buffers(page);
2843 	struct buffer_head *bh;
2844 
2845 	bh = head;
2846 	do {
2847 		if (buffer_write_io_error(bh) && page->mapping)
2848 			set_bit(AS_EIO, &page->mapping->flags);
2849 		if (buffer_busy(bh))
2850 			goto failed;
2851 		bh = bh->b_this_page;
2852 	} while (bh != head);
2853 
2854 	do {
2855 		struct buffer_head *next = bh->b_this_page;
2856 
2857 		if (!list_empty(&bh->b_assoc_buffers))
2858 			__remove_assoc_queue(bh);
2859 		bh = next;
2860 	} while (bh != head);
2861 	*buffers_to_free = head;
2862 	__clear_page_buffers(page);
2863 	return 1;
2864 failed:
2865 	return 0;
2866 }
2867 
2868 int try_to_free_buffers(struct page *page)
2869 {
2870 	struct address_space * const mapping = page->mapping;
2871 	struct buffer_head *buffers_to_free = NULL;
2872 	int ret = 0;
2873 
2874 	BUG_ON(!PageLocked(page));
2875 	if (PageWriteback(page))
2876 		return 0;
2877 
2878 	if (mapping == NULL) {		/* can this still happen? */
2879 		ret = drop_buffers(page, &buffers_to_free);
2880 		goto out;
2881 	}
2882 
2883 	spin_lock(&mapping->private_lock);
2884 	ret = drop_buffers(page, &buffers_to_free);
2885 
2886 	/*
2887 	 * If the filesystem writes its buffers by hand (eg ext3)
2888 	 * then we can have clean buffers against a dirty page.  We
2889 	 * clean the page here; otherwise the VM will never notice
2890 	 * that the filesystem did any IO at all.
2891 	 *
2892 	 * Also, during truncate, discard_buffer will have marked all
2893 	 * the page's buffers clean.  We discover that here and clean
2894 	 * the page also.
2895 	 *
2896 	 * private_lock must be held over this entire operation in order
2897 	 * to synchronise against __set_page_dirty_buffers and prevent the
2898 	 * dirty bit from being lost.
2899 	 */
2900 	if (ret)
2901 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
2902 	spin_unlock(&mapping->private_lock);
2903 out:
2904 	if (buffers_to_free) {
2905 		struct buffer_head *bh = buffers_to_free;
2906 
2907 		do {
2908 			struct buffer_head *next = bh->b_this_page;
2909 			free_buffer_head(bh);
2910 			bh = next;
2911 		} while (bh != buffers_to_free);
2912 	}
2913 	return ret;
2914 }
2915 EXPORT_SYMBOL(try_to_free_buffers);
2916 
2917 void block_sync_page(struct page *page)
2918 {
2919 	struct address_space *mapping;
2920 
2921 	smp_mb();
2922 	mapping = page_mapping(page);
2923 	if (mapping)
2924 		blk_run_backing_dev(mapping->backing_dev_info, page);
2925 }
2926 
2927 /*
2928  * There are no bdflush tunables left.  But distributions are
2929  * still running obsolete flush daemons, so we terminate them here.
2930  *
2931  * Use of bdflush() is deprecated and will be removed in a future kernel.
2932  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2933  */
2934 asmlinkage long sys_bdflush(int func, long data)
2935 {
2936 	static int msg_count;
2937 
2938 	if (!capable(CAP_SYS_ADMIN))
2939 		return -EPERM;
2940 
2941 	if (msg_count < 5) {
2942 		msg_count++;
2943 		printk(KERN_INFO
2944 			"warning: process `%s' used the obsolete bdflush"
2945 			" system call\n", current->comm);
2946 		printk(KERN_INFO "Fix your initscripts?\n");
2947 	}
2948 
2949 	if (func == 1)
2950 		do_exit(0);
2951 	return 0;
2952 }
2953 
2954 /*
2955  * Buffer-head allocation
2956  */
2957 static struct kmem_cache *bh_cachep;
2958 
2959 /*
2960  * Once the number of bh's in the machine exceeds this level, we start
2961  * stripping them in writeback.
2962  */
2963 static int max_buffer_heads;
2964 
2965 int buffer_heads_over_limit;
2966 
2967 struct bh_accounting {
2968 	int nr;			/* Number of live bh's */
2969 	int ratelimit;		/* Limit cacheline bouncing */
2970 };
2971 
2972 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2973 
2974 static void recalc_bh_state(void)
2975 {
2976 	int i;
2977 	int tot = 0;
2978 
2979 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2980 		return;
2981 	__get_cpu_var(bh_accounting).ratelimit = 0;
2982 	for_each_online_cpu(i)
2983 		tot += per_cpu(bh_accounting, i).nr;
2984 	buffer_heads_over_limit = (tot > max_buffer_heads);
2985 }
2986 
2987 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2988 {
2989 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2990 	if (ret) {
2991 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
2992 		get_cpu_var(bh_accounting).nr++;
2993 		recalc_bh_state();
2994 		put_cpu_var(bh_accounting);
2995 	}
2996 	return ret;
2997 }
2998 EXPORT_SYMBOL(alloc_buffer_head);
2999 
3000 void free_buffer_head(struct buffer_head *bh)
3001 {
3002 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3003 	kmem_cache_free(bh_cachep, bh);
3004 	get_cpu_var(bh_accounting).nr--;
3005 	recalc_bh_state();
3006 	put_cpu_var(bh_accounting);
3007 }
3008 EXPORT_SYMBOL(free_buffer_head);
3009 
3010 static void buffer_exit_cpu(int cpu)
3011 {
3012 	int i;
3013 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3014 
3015 	for (i = 0; i < BH_LRU_SIZE; i++) {
3016 		brelse(b->bhs[i]);
3017 		b->bhs[i] = NULL;
3018 	}
3019 	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3020 	per_cpu(bh_accounting, cpu).nr = 0;
3021 	put_cpu_var(bh_accounting);
3022 }
3023 
3024 static int buffer_cpu_notify(struct notifier_block *self,
3025 			      unsigned long action, void *hcpu)
3026 {
3027 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3028 		buffer_exit_cpu((unsigned long)hcpu);
3029 	return NOTIFY_OK;
3030 }
3031 
3032 void __init buffer_init(void)
3033 {
3034 	int nrpages;
3035 
3036 	bh_cachep = KMEM_CACHE(buffer_head,
3037 			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
3038 
3039 	/*
3040 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3041 	 */
3042 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3043 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3044 	hotcpu_notifier(buffer_cpu_notify, 0);
3045 }
3046 
3047 EXPORT_SYMBOL(__bforget);
3048 EXPORT_SYMBOL(__brelse);
3049 EXPORT_SYMBOL(__wait_on_buffer);
3050 EXPORT_SYMBOL(block_commit_write);
3051 EXPORT_SYMBOL(block_prepare_write);
3052 EXPORT_SYMBOL(block_page_mkwrite);
3053 EXPORT_SYMBOL(block_read_full_page);
3054 EXPORT_SYMBOL(block_sync_page);
3055 EXPORT_SYMBOL(block_truncate_page);
3056 EXPORT_SYMBOL(block_write_full_page);
3057 EXPORT_SYMBOL(cont_prepare_write);
3058 EXPORT_SYMBOL(end_buffer_read_sync);
3059 EXPORT_SYMBOL(end_buffer_write_sync);
3060 EXPORT_SYMBOL(file_fsync);
3061 EXPORT_SYMBOL(fsync_bdev);
3062 EXPORT_SYMBOL(generic_block_bmap);
3063 EXPORT_SYMBOL(generic_commit_write);
3064 EXPORT_SYMBOL(generic_cont_expand);
3065 EXPORT_SYMBOL(generic_cont_expand_simple);
3066 EXPORT_SYMBOL(init_buffer);
3067 EXPORT_SYMBOL(invalidate_bdev);
3068 EXPORT_SYMBOL(ll_rw_block);
3069 EXPORT_SYMBOL(mark_buffer_dirty);
3070 EXPORT_SYMBOL(submit_bh);
3071 EXPORT_SYMBOL(sync_dirty_buffer);
3072 EXPORT_SYMBOL(unlock_buffer);
3073