xref: /openbmc/linux/fs/buffer.c (revision 367b8112)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52 	bh->b_end_io = handler;
53 	bh->b_private = private;
54 }
55 
56 static int sync_buffer(void *word)
57 {
58 	struct block_device *bd;
59 	struct buffer_head *bh
60 		= container_of(word, struct buffer_head, b_state);
61 
62 	smp_mb();
63 	bd = bh->b_bdev;
64 	if (bd)
65 		blk_run_address_space(bd->bd_inode->i_mapping);
66 	io_schedule();
67 	return 0;
68 }
69 
70 void __lock_buffer(struct buffer_head *bh)
71 {
72 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 							TASK_UNINTERRUPTIBLE);
74 }
75 EXPORT_SYMBOL(__lock_buffer);
76 
77 void unlock_buffer(struct buffer_head *bh)
78 {
79 	clear_bit_unlock(BH_Lock, &bh->b_state);
80 	smp_mb__after_clear_bit();
81 	wake_up_bit(&bh->b_state, BH_Lock);
82 }
83 
84 /*
85  * Block until a buffer comes unlocked.  This doesn't stop it
86  * from becoming locked again - you have to lock it yourself
87  * if you want to preserve its state.
88  */
89 void __wait_on_buffer(struct buffer_head * bh)
90 {
91 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92 }
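
/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * caller-side pattern built on the locking/waiting primitives above.  The
 * helper name example_read_locked() is hypothetical; everything it calls
 * (lock_buffer, submit_bh, end_buffer_read_sync, ...) is declared by the
 * headers included at the top of this file.
 */
static int example_read_locked(struct buffer_head *bh)
{
	lock_buffer(bh);			/* may sleep in __lock_buffer() */
	if (buffer_uptodate(bh)) {		/* someone read it meanwhile */
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);				/* ref dropped by the end_io handler */
	bh->b_end_io = end_buffer_read_sync;	/* unlocks bh on completion */
	submit_bh(READ, bh);
	wait_on_buffer(bh);			/* __wait_on_buffer() above */
	return buffer_uptodate(bh) ? 0 : -EIO;
}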
93 
94 static void
95 __clear_page_buffers(struct page *page)
96 {
97 	ClearPagePrivate(page);
98 	set_page_private(page, 0);
99 	page_cache_release(page);
100 }
101 
102 static void buffer_io_error(struct buffer_head *bh)
103 {
104 	char b[BDEVNAME_SIZE];
105 
106 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 			bdevname(bh->b_bdev, b),
108 			(unsigned long long)bh->b_blocknr);
109 }
110 
111 /*
112  * End-of-IO handler helper function which does not touch the bh after
113  * unlocking it.
114  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
115  * a race there is benign: unlock_buffer() only uses the bh's address for
116  * hashing after unlocking the buffer, so it doesn't actually touch the bh
117  * itself.
118  */
119 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
120 {
121 	if (uptodate) {
122 		set_buffer_uptodate(bh);
123 	} else {
124 		/* This happens due to failed READA attempts. */
125 		clear_buffer_uptodate(bh);
126 	}
127 	unlock_buffer(bh);
128 }
129 
130 /*
131  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
132  * unlock the buffer. This is what ll_rw_block uses too.
133  */
134 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
135 {
136 	__end_buffer_read_notouch(bh, uptodate);
137 	put_bh(bh);
138 }
139 
140 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
141 {
142 	char b[BDEVNAME_SIZE];
143 
144 	if (uptodate) {
145 		set_buffer_uptodate(bh);
146 	} else {
147 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
148 			buffer_io_error(bh);
149 			printk(KERN_WARNING "lost page write due to "
150 					"I/O error on %s\n",
151 				       bdevname(bh->b_bdev, b));
152 		}
153 		set_buffer_write_io_error(bh);
154 		clear_buffer_uptodate(bh);
155 	}
156 	unlock_buffer(bh);
157 	put_bh(bh);
158 }
159 
160 /*
161  * Write out and wait upon all the dirty data associated with a block
162  * device via its mapping.  Does not take the superblock lock.
163  */
164 int sync_blockdev(struct block_device *bdev)
165 {
166 	int ret = 0;
167 
168 	if (bdev)
169 		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
170 	return ret;
171 }
172 EXPORT_SYMBOL(sync_blockdev);
173 
174 /*
175  * Write out and wait upon all dirty data associated with this
176  * device.   Filesystem data as well as the underlying block
177  * device.  Takes the superblock lock.
178  */
179 int fsync_bdev(struct block_device *bdev)
180 {
181 	struct super_block *sb = get_super(bdev);
182 	if (sb) {
183 		int res = fsync_super(sb);
184 		drop_super(sb);
185 		return res;
186 	}
187 	return sync_blockdev(bdev);
188 }
189 
190 /**
191  * freeze_bdev  --  lock a filesystem and force it into a consistent state
192  * @bdev:	blockdevice to lock
193  *
194  * This takes the block device bd_mount_sem to make sure no new mounts
195  * happen on bdev until thaw_bdev() is called.
196  * If a superblock is found on this device, we take the s_umount semaphore
197  * on it to make sure nobody unmounts until the snapshot creation is done.
198  */
199 struct super_block *freeze_bdev(struct block_device *bdev)
200 {
201 	struct super_block *sb;
202 
203 	down(&bdev->bd_mount_sem);
204 	sb = get_super(bdev);
205 	if (sb && !(sb->s_flags & MS_RDONLY)) {
206 		sb->s_frozen = SB_FREEZE_WRITE;
207 		smp_wmb();
208 
209 		__fsync_super(sb);
210 
211 		sb->s_frozen = SB_FREEZE_TRANS;
212 		smp_wmb();
213 
214 		sync_blockdev(sb->s_bdev);
215 
216 		if (sb->s_op->write_super_lockfs)
217 			sb->s_op->write_super_lockfs(sb);
218 	}
219 
220 	sync_blockdev(bdev);
221 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
222 }
223 EXPORT_SYMBOL(freeze_bdev);
224 
225 /**
226  * thaw_bdev  -- unlock filesystem
227  * @bdev:	blockdevice to unlock
228  * @sb:		associated superblock
229  *
230  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
231  */
232 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
233 {
234 	if (sb) {
235 		BUG_ON(sb->s_bdev != bdev);
236 
237 		if (sb->s_op->unlockfs)
238 			sb->s_op->unlockfs(sb);
239 		sb->s_frozen = SB_UNFROZEN;
240 		smp_wmb();
241 		wake_up(&sb->s_wait_unfrozen);
242 		drop_super(sb);
243 	}
244 
245 	up(&bdev->bd_mount_sem);
246 }
247 EXPORT_SYMBOL(thaw_bdev);
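
/*
 * Editor's illustrative sketch: how a snapshot-style user might bracket its
 * work with freeze_bdev()/thaw_bdev() above.  example_snapshot() is a
 * hypothetical helper; the copy step is supplied by the caller so the sketch
 * stays self-contained.
 */
static int example_snapshot(struct block_device *bdev,
			    int (*do_copy)(struct block_device *))
{
	struct super_block *sb;
	int err;

	sb = freeze_bdev(bdev);		/* sync the fs and block new writes */
	err = do_copy(bdev);		/* caller-supplied copy of the now-stable device */
	thaw_bdev(bdev, sb);		/* unfreeze; sb may be NULL if nothing was mounted */
	return err;
}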
248 
249 /*
250  * Various filesystems appear to want __find_get_block to be non-blocking.
251  * But it's the page lock which protects the buffers.  To get around this,
252  * we get exclusion from try_to_free_buffers with the blockdev mapping's
253  * private_lock.
254  *
255  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256  * may be quite high.  This code could TryLock the page, and if that
257  * succeeds, there is no need to take private_lock. (But if
258  * private_lock is contended then so is mapping->tree_lock).
259  */
260 static struct buffer_head *
261 __find_get_block_slow(struct block_device *bdev, sector_t block)
262 {
263 	struct inode *bd_inode = bdev->bd_inode;
264 	struct address_space *bd_mapping = bd_inode->i_mapping;
265 	struct buffer_head *ret = NULL;
266 	pgoff_t index;
267 	struct buffer_head *bh;
268 	struct buffer_head *head;
269 	struct page *page;
270 	int all_mapped = 1;
271 
272 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 	page = find_get_page(bd_mapping, index);
274 	if (!page)
275 		goto out;
276 
277 	spin_lock(&bd_mapping->private_lock);
278 	if (!page_has_buffers(page))
279 		goto out_unlock;
280 	head = page_buffers(page);
281 	bh = head;
282 	do {
283 		if (bh->b_blocknr == block) {
284 			ret = bh;
285 			get_bh(bh);
286 			goto out_unlock;
287 		}
288 		if (!buffer_mapped(bh))
289 			all_mapped = 0;
290 		bh = bh->b_this_page;
291 	} while (bh != head);
292 
293 	/* We might be here because some of the buffers on this page are
294 	 * not mapped.  This is due to various races between
295 	 * file I/O on the block device and getblk().  It gets dealt with
296 	 * elsewhere; don't report an error if we had some unmapped buffers.
297 	 */
298 	if (all_mapped) {
299 		printk("__find_get_block_slow() failed. "
300 			"block=%llu, b_blocknr=%llu\n",
301 			(unsigned long long)block,
302 			(unsigned long long)bh->b_blocknr);
303 		printk("b_state=0x%08lx, b_size=%zu\n",
304 			bh->b_state, bh->b_size);
305 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
306 	}
307 out_unlock:
308 	spin_unlock(&bd_mapping->private_lock);
309 	page_cache_release(page);
310 out:
311 	return ret;
312 }
313 
314 /* If invalidate_buffers() trashes dirty buffers, it means some kind
315    of fs corruption is going on. Trashing dirty data always implies losing
316    information that was supposed to have been stored on the physical layer
317    by the user.
318 
319    Thus invalidate_buffers in general usage is not allowed to trash
320    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
321    be preserved.  These buffers are simply skipped.
322 
323    We also skip buffers which are still in use.  For example this can
324    happen if a userspace program is reading the block device.
325 
326    NOTE: If the user removes a removable-media disk while there is
327    still dirty data that has not been synced to disk (due to a bug in the
328    device driver or to a user error), then by not destroying the dirty
329    buffers we could also corrupt the next media inserted; thus a parameter
330    is necessary to handle this case in the safest way possible (trying
331    not to corrupt the newly inserted disk with data belonging to the
332    old, now-corrupted disk). Also, for the ramdisk the natural way
333    to release the ramdisk memory is to destroy its dirty buffers.
334 
335    These are two special cases. Normal usage implies that the device driver
336    issues a sync on the device (without waiting for I/O completion) and
337    then an invalidate_buffers call that doesn't trash dirty buffers.
338 
339    For handling cache coherency with the blkdev pagecache, the 'update' case
340    has been introduced. It is needed to re-read any pinned buffer from
341    disk. NOTE: re-reading from disk is destructive, so we can do it only
342    when we assume nobody is changing the buffercache under our I/O and when
343    we think the disk contains more recent information than the buffercache.
344    The update == 1 pass marks the buffers we need to update; the update == 2
345    pass does the actual I/O. */
346 void invalidate_bdev(struct block_device *bdev)
347 {
348 	struct address_space *mapping = bdev->bd_inode->i_mapping;
349 
350 	if (mapping->nrpages == 0)
351 		return;
352 
353 	invalidate_bh_lrus();
354 	invalidate_mapping_pages(mapping, 0, -1);
355 }
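
/*
 * Editor's illustrative sketch of the "normal usage" described above: the
 * driver first pushes dirty data out, then drops clean, unused buffers.
 * example_media_change() is a hypothetical helper.
 */
static void example_media_change(struct block_device *bdev)
{
	sync_blockdev(bdev);	/* write out and wait on dirty pagecache */
	invalidate_bdev(bdev);	/* then discard clean, unpinned pages/buffers */
}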
356 
357 /*
358  * Kick pdflush then try to free up some ZONE_NORMAL memory.
359  */
360 static void free_more_memory(void)
361 {
362 	struct zone *zone;
363 	int nid;
364 
365 	wakeup_pdflush(1024);
366 	yield();
367 
368 	for_each_online_node(nid) {
369 		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 						gfp_zone(GFP_NOFS), NULL,
371 						&zone);
372 		if (zone)
373 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 						GFP_NOFS);
375 	}
376 }
377 
378 /*
379  * I/O completion handler for block_read_full_page() - pages
380  * which come unlocked at the end of I/O.
381  */
382 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
383 {
384 	unsigned long flags;
385 	struct buffer_head *first;
386 	struct buffer_head *tmp;
387 	struct page *page;
388 	int page_uptodate = 1;
389 
390 	BUG_ON(!buffer_async_read(bh));
391 
392 	page = bh->b_page;
393 	if (uptodate) {
394 		set_buffer_uptodate(bh);
395 	} else {
396 		clear_buffer_uptodate(bh);
397 		if (printk_ratelimit())
398 			buffer_io_error(bh);
399 		SetPageError(page);
400 	}
401 
402 	/*
403 	 * Be _very_ careful from here on. Bad things can happen if
404 	 * two buffer heads end IO at almost the same time and both
405 	 * decide that the page is now completely done.
406 	 */
407 	first = page_buffers(page);
408 	local_irq_save(flags);
409 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
410 	clear_buffer_async_read(bh);
411 	unlock_buffer(bh);
412 	tmp = bh;
413 	do {
414 		if (!buffer_uptodate(tmp))
415 			page_uptodate = 0;
416 		if (buffer_async_read(tmp)) {
417 			BUG_ON(!buffer_locked(tmp));
418 			goto still_busy;
419 		}
420 		tmp = tmp->b_this_page;
421 	} while (tmp != bh);
422 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 	local_irq_restore(flags);
424 
425 	/*
426 	 * If none of the buffers had errors and they are all
427 	 * uptodate then we can set the page uptodate.
428 	 */
429 	if (page_uptodate && !PageError(page))
430 		SetPageUptodate(page);
431 	unlock_page(page);
432 	return;
433 
434 still_busy:
435 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 	local_irq_restore(flags);
437 	return;
438 }
439 
440 /*
441  * Completion handler for block_write_full_page() - pages which are unlocked
442  * during I/O, and which have PageWriteback cleared upon I/O completion.
443  */
444 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
445 {
446 	char b[BDEVNAME_SIZE];
447 	unsigned long flags;
448 	struct buffer_head *first;
449 	struct buffer_head *tmp;
450 	struct page *page;
451 
452 	BUG_ON(!buffer_async_write(bh));
453 
454 	page = bh->b_page;
455 	if (uptodate) {
456 		set_buffer_uptodate(bh);
457 	} else {
458 		if (printk_ratelimit()) {
459 			buffer_io_error(bh);
460 			printk(KERN_WARNING "lost page write due to "
461 					"I/O error on %s\n",
462 			       bdevname(bh->b_bdev, b));
463 		}
464 		set_bit(AS_EIO, &page->mapping->flags);
465 		set_buffer_write_io_error(bh);
466 		clear_buffer_uptodate(bh);
467 		SetPageError(page);
468 	}
469 
470 	first = page_buffers(page);
471 	local_irq_save(flags);
472 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
473 
474 	clear_buffer_async_write(bh);
475 	unlock_buffer(bh);
476 	tmp = bh->b_this_page;
477 	while (tmp != bh) {
478 		if (buffer_async_write(tmp)) {
479 			BUG_ON(!buffer_locked(tmp));
480 			goto still_busy;
481 		}
482 		tmp = tmp->b_this_page;
483 	}
484 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 	local_irq_restore(flags);
486 	end_page_writeback(page);
487 	return;
488 
489 still_busy:
490 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 	local_irq_restore(flags);
492 	return;
493 }
494 
495 /*
496  * If a page's buffers are under async read (end_buffer_async_read
497  * completion) then there is a possibility that another thread of
498  * control could lock one of the buffers after it has completed
499  * but while some of the other buffers have not completed.  This
500  * locked buffer would confuse end_buffer_async_read() into not unlocking
501  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
502  * that this buffer is not under async I/O.
503  *
504  * The page comes unlocked when it has no locked buffer_async buffers
505  * left.
506  *
507  * PageLocked prevents anyone from starting new async I/O reads against
508  * any of the buffers.
509  *
510  * PageWriteback is used to prevent simultaneous writeout of the same
511  * page.
512  *
513  * PageLocked prevents anyone from starting writeback of a page which is
514  * under read I/O (PageWriteback is only ever set against a locked page).
515  */
516 static void mark_buffer_async_read(struct buffer_head *bh)
517 {
518 	bh->b_end_io = end_buffer_async_read;
519 	set_buffer_async_read(bh);
520 }
521 
522 void mark_buffer_async_write(struct buffer_head *bh)
523 {
524 	bh->b_end_io = end_buffer_async_write;
525 	set_buffer_async_write(bh);
526 }
527 EXPORT_SYMBOL(mark_buffer_async_write);
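
/*
 * Editor's illustrative sketch: the core pattern a writepage-style path uses
 * with mark_buffer_async_write() - each dirty, mapped buffer is locked,
 * marked async and submitted, and end_buffer_async_write() above runs as
 * each I/O completes.  example_submit_page() is a hypothetical helper; it
 * omits the PageWriteback/redirty handling that the real
 * __block_write_full_page() below performs.
 */
static void example_submit_page(struct page *page)
{
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	do {
		lock_buffer(bh);
		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);	/* b_end_io = end_buffer_async_write */
			submit_bh(WRITE, bh);
		} else {
			unlock_buffer(bh);
		}
		bh = bh->b_this_page;
	} while (bh != head);
}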
528 
529 
530 /*
531  * fs/buffer.c contains helper functions for buffer-backed address space's
532  * fsync functions.  A common requirement for buffer-based filesystems is
533  * that certain data from the backing blockdev needs to be written out for
534  * a successful fsync().  For example, ext2 indirect blocks need to be
535  * written back and waited upon before fsync() returns.
536  *
537  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539  * management of a list of dependent buffers at ->i_mapping->private_list.
540  *
541  * Locking is a little subtle: try_to_free_buffers() will remove buffers
542  * from their controlling inode's queue when they are being freed.  But
543  * try_to_free_buffers() will be operating against the *blockdev* mapping
544  * at the time, not against the S_ISREG file which depends on those buffers.
545  * So the locking for private_list is via the private_lock in the address_space
546  * which backs the buffers.  Which is different from the address_space
547  * against which the buffers are listed.  So for a particular address_space,
548  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
549  * mapping->private_list will always be protected by the backing blockdev's
550  * ->private_lock.
551  *
552  * Which introduces a requirement: all buffers on an address_space's
553  * ->private_list must be from the same address_space: the blockdev's.
554  *
555  * address_spaces which do not place buffers at ->private_list via these
556  * utility functions are free to use private_lock and private_list for
557  * whatever they want.  The only requirement is that list_empty(private_list)
558  * be true at clear_inode() time.
559  *
560  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
561  * filesystems should do that.  invalidate_inode_buffers() should just go
562  * BUG_ON(!list_empty).
563  *
564  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
565  * take an address_space, not an inode.  And it should be called
566  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
567  * queued up.
568  *
569  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570  * list if it is already on a list.  Because if the buffer is on a list,
571  * it *must* already be on the right one.  If not, the filesystem is being
572  * silly.  This will save a ton of locking.  But first we have to ensure
573  * that buffers are taken *off* the old inode's list when they are freed
574  * (presumably in truncate).  That requires careful auditing of all
575  * filesystems (do it inside bforget()).  It could also be done by bringing
576  * b_inode back.
577  */
578 
579 /*
580  * The buffer's backing address_space's private_lock must be held
581  */
582 static void __remove_assoc_queue(struct buffer_head *bh)
583 {
584 	list_del_init(&bh->b_assoc_buffers);
585 	WARN_ON(!bh->b_assoc_map);
586 	if (buffer_write_io_error(bh))
587 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 	bh->b_assoc_map = NULL;
589 }
590 
591 int inode_has_buffers(struct inode *inode)
592 {
593 	return !list_empty(&inode->i_data.private_list);
594 }
595 
596 /*
597  * osync is designed to support O_SYNC io.  It waits synchronously for
598  * all already-submitted IO to complete, but does not queue any new
599  * writes to the disk.
600  *
601  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602  * you dirty the buffers, and then use osync_inode_buffers to wait for
603  * completion.  Any other dirty buffers which are not yet queued for
604  * write will not be flushed to disk by the osync.
605  */
606 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
607 {
608 	struct buffer_head *bh;
609 	struct list_head *p;
610 	int err = 0;
611 
612 	spin_lock(lock);
613 repeat:
614 	list_for_each_prev(p, list) {
615 		bh = BH_ENTRY(p);
616 		if (buffer_locked(bh)) {
617 			get_bh(bh);
618 			spin_unlock(lock);
619 			wait_on_buffer(bh);
620 			if (!buffer_uptodate(bh))
621 				err = -EIO;
622 			brelse(bh);
623 			spin_lock(lock);
624 			goto repeat;
625 		}
626 	}
627 	spin_unlock(lock);
628 	return err;
629 }
630 
631 /**
632  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
633  * @mapping: the mapping which wants those buffers written
634  *
635  * Starts I/O against the buffers at mapping->private_list, and waits upon
636  * that I/O.
637  *
638  * Basically, this is a convenience function for fsync().
639  * @mapping is a file or directory which needs those buffers to be written for
640  * a successful fsync().
641  */
642 int sync_mapping_buffers(struct address_space *mapping)
643 {
644 	struct address_space *buffer_mapping = mapping->assoc_mapping;
645 
646 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
647 		return 0;
648 
649 	return fsync_buffers_list(&buffer_mapping->private_lock,
650 					&mapping->private_list);
651 }
652 EXPORT_SYMBOL(sync_mapping_buffers);
653 
654 /*
655  * Called when we've recently written block `bblock', and it is known that
656  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
657  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
658  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
659  */
660 void write_boundary_block(struct block_device *bdev,
661 			sector_t bblock, unsigned blocksize)
662 {
663 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
664 	if (bh) {
665 		if (buffer_dirty(bh))
666 			ll_rw_block(WRITE, 1, &bh);
667 		put_bh(bh);
668 	}
669 }
670 
671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
672 {
673 	struct address_space *mapping = inode->i_mapping;
674 	struct address_space *buffer_mapping = bh->b_page->mapping;
675 
676 	mark_buffer_dirty(bh);
677 	if (!mapping->assoc_mapping) {
678 		mapping->assoc_mapping = buffer_mapping;
679 	} else {
680 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
681 	}
682 	if (!bh->b_assoc_map) {
683 		spin_lock(&buffer_mapping->private_lock);
684 		list_move_tail(&bh->b_assoc_buffers,
685 				&mapping->private_list);
686 		bh->b_assoc_map = mapping;
687 		spin_unlock(&buffer_mapping->private_lock);
688 	}
689 }
690 EXPORT_SYMBOL(mark_buffer_dirty_inode);
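
/*
 * Editor's illustrative sketch of the scheme described in the big comment
 * above: a filesystem queues dependent metadata buffers on the inode's
 * ->private_list with mark_buffer_dirty_inode() as it dirties them, and its
 * ->fsync method then writes and waits on that list.  example_fsync() is a
 * hypothetical ->fsync implementation (using this kernel generation's
 * prototype), not part of this file.
 */
static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* write out and wait on everything on inode->i_mapping->private_list */
	return sync_mapping_buffers(inode->i_mapping);
}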
691 
692 /*
693  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
694  * dirty.
695  *
696  * If warn is true, then emit a warning if the page is not uptodate and has
697  * not been truncated.
698  */
699 static int __set_page_dirty(struct page *page,
700 		struct address_space *mapping, int warn)
701 {
702 	if (unlikely(!mapping))
703 		return !TestSetPageDirty(page);
704 
705 	if (TestSetPageDirty(page))
706 		return 0;
707 
708 	spin_lock_irq(&mapping->tree_lock);
709 	if (page->mapping) {	/* Race with truncate? */
710 		WARN_ON_ONCE(warn && !PageUptodate(page));
711 
712 		if (mapping_cap_account_dirty(mapping)) {
713 			__inc_zone_page_state(page, NR_FILE_DIRTY);
714 			__inc_bdi_stat(mapping->backing_dev_info,
715 					BDI_RECLAIMABLE);
716 			task_io_account_write(PAGE_CACHE_SIZE);
717 		}
718 		radix_tree_tag_set(&mapping->page_tree,
719 				page_index(page), PAGECACHE_TAG_DIRTY);
720 	}
721 	spin_unlock_irq(&mapping->tree_lock);
722 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
723 
724 	return 1;
725 }
726 
727 /*
728  * Add a page to the dirty page list.
729  *
730  * It is a sad fact of life that this function is called from several places
731  * deeply under spinlocking.  It may not sleep.
732  *
733  * If the page has buffers, the uptodate buffers are set dirty, to preserve
734  * dirty-state coherency between the page and the buffers.  If the page does
735  * not have buffers then when they are later attached they will all be set
736  * dirty.
737  *
738  * The buffers are dirtied before the page is dirtied.  There's a small race
739  * window in which a writepage caller may see the page cleanness but not the
740  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
741  * before the buffers, a concurrent writepage caller could clear the page dirty
742  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743  * page on the dirty page list.
744  *
745  * We use private_lock to lock against try_to_free_buffers while using the
746  * page's buffer list.  Also use this to protect against clean buffers being
747  * added to the page after it was set dirty.
748  *
749  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
750  * address_space though.
751  */
752 int __set_page_dirty_buffers(struct page *page)
753 {
754 	struct address_space *mapping = page_mapping(page);
755 
756 	if (unlikely(!mapping))
757 		return !TestSetPageDirty(page);
758 
759 	spin_lock(&mapping->private_lock);
760 	if (page_has_buffers(page)) {
761 		struct buffer_head *head = page_buffers(page);
762 		struct buffer_head *bh = head;
763 
764 		do {
765 			set_buffer_dirty(bh);
766 			bh = bh->b_this_page;
767 		} while (bh != head);
768 	}
769 	spin_unlock(&mapping->private_lock);
770 
771 	return __set_page_dirty(page, mapping, 1);
772 }
773 EXPORT_SYMBOL(__set_page_dirty_buffers);
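
/*
 * Editor's illustrative sketch: buffer-backed filesystems commonly wire their
 * ->set_page_dirty address_space operation straight to the helper above.
 * example_dirty_aops is a hypothetical, partially filled-in table.
 */
static const struct address_space_operations example_dirty_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
};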
774 
775 /*
776  * Write out and wait upon a list of buffers.
777  *
778  * We have conflicting pressures: we want to make sure that all
779  * initially dirty buffers get waited on, but that any subsequently
780  * dirtied buffers don't.  After all, we don't want fsync to last
781  * forever if somebody is actively writing to the file.
782  *
783  * Do this in two main stages: first we copy dirty buffers to a
784  * temporary inode list, queueing the writes as we go.  Then we clean
785  * up, waiting for those writes to complete.
786  *
787  * During this second stage, any subsequent updates to the file may end
788  * up refiling the buffer on the original inode's dirty list again, so
789  * there is a chance we will end up with a buffer queued for write but
790  * not yet completed on that list.  So, as a final cleanup we go through
791  * the osync code to catch these locked, dirty buffers without requeuing
792  * any newly dirty buffers for write.
793  */
794 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
795 {
796 	struct buffer_head *bh;
797 	struct list_head tmp;
798 	struct address_space *mapping;
799 	int err = 0, err2;
800 
801 	INIT_LIST_HEAD(&tmp);
802 
803 	spin_lock(lock);
804 	while (!list_empty(list)) {
805 		bh = BH_ENTRY(list->next);
806 		mapping = bh->b_assoc_map;
807 		__remove_assoc_queue(bh);
808 		/* Avoid race with mark_buffer_dirty_inode() which does
809 		 * a lockless check and we rely on seeing the dirty bit */
810 		smp_mb();
811 		if (buffer_dirty(bh) || buffer_locked(bh)) {
812 			list_add(&bh->b_assoc_buffers, &tmp);
813 			bh->b_assoc_map = mapping;
814 			if (buffer_dirty(bh)) {
815 				get_bh(bh);
816 				spin_unlock(lock);
817 				/*
818 				 * Ensure any pending I/O completes so that
819 				 * ll_rw_block() actually writes the current
820 				 * contents - it is a noop if I/O is still in
821 				 * flight on potentially older contents.
822 				 */
823 				ll_rw_block(SWRITE_SYNC, 1, &bh);
824 				brelse(bh);
825 				spin_lock(lock);
826 			}
827 		}
828 	}
829 
830 	while (!list_empty(&tmp)) {
831 		bh = BH_ENTRY(tmp.prev);
832 		get_bh(bh);
833 		mapping = bh->b_assoc_map;
834 		__remove_assoc_queue(bh);
835 		/* Avoid race with mark_buffer_dirty_inode() which does
836 		 * a lockless check and we rely on seeing the dirty bit */
837 		smp_mb();
838 		if (buffer_dirty(bh)) {
839 			list_add(&bh->b_assoc_buffers,
840 				 &mapping->private_list);
841 			bh->b_assoc_map = mapping;
842 		}
843 		spin_unlock(lock);
844 		wait_on_buffer(bh);
845 		if (!buffer_uptodate(bh))
846 			err = -EIO;
847 		brelse(bh);
848 		spin_lock(lock);
849 	}
850 
851 	spin_unlock(lock);
852 	err2 = osync_buffers_list(lock, list);
853 	if (err)
854 		return err;
855 	else
856 		return err2;
857 }
858 
859 /*
860  * Invalidate any and all dirty buffers on a given inode.  We are
861  * probably unmounting the fs, but that doesn't mean we have already
862  * done a sync().  Just drop the buffers from the inode list.
863  *
864  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
865  * assumes that all the buffers are against the blockdev.  Not true
866  * for reiserfs.
867  */
868 void invalidate_inode_buffers(struct inode *inode)
869 {
870 	if (inode_has_buffers(inode)) {
871 		struct address_space *mapping = &inode->i_data;
872 		struct list_head *list = &mapping->private_list;
873 		struct address_space *buffer_mapping = mapping->assoc_mapping;
874 
875 		spin_lock(&buffer_mapping->private_lock);
876 		while (!list_empty(list))
877 			__remove_assoc_queue(BH_ENTRY(list->next));
878 		spin_unlock(&buffer_mapping->private_lock);
879 	}
880 }
881 
882 /*
883  * Remove any clean buffers from the inode's buffer list.  This is called
884  * when we're trying to free the inode itself.  Those buffers can pin it.
885  *
886  * Returns true if all buffers were removed.
887  */
888 int remove_inode_buffers(struct inode *inode)
889 {
890 	int ret = 1;
891 
892 	if (inode_has_buffers(inode)) {
893 		struct address_space *mapping = &inode->i_data;
894 		struct list_head *list = &mapping->private_list;
895 		struct address_space *buffer_mapping = mapping->assoc_mapping;
896 
897 		spin_lock(&buffer_mapping->private_lock);
898 		while (!list_empty(list)) {
899 			struct buffer_head *bh = BH_ENTRY(list->next);
900 			if (buffer_dirty(bh)) {
901 				ret = 0;
902 				break;
903 			}
904 			__remove_assoc_queue(bh);
905 		}
906 		spin_unlock(&buffer_mapping->private_lock);
907 	}
908 	return ret;
909 }
910 
911 /*
912  * Create the appropriate buffers when given a page for the data area and
913  * the size of each buffer.  Use the bh->b_this_page linked list to
914  * follow the buffers created.  Return NULL if unable to create more
915  * buffers.
916  *
917  * The retry flag is used to differentiate async IO (paging, swapping)
918  * which may not fail from ordinary buffer allocations.
919  */
920 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
921 		int retry)
922 {
923 	struct buffer_head *bh, *head;
924 	long offset;
925 
926 try_again:
927 	head = NULL;
928 	offset = PAGE_SIZE;
929 	while ((offset -= size) >= 0) {
930 		bh = alloc_buffer_head(GFP_NOFS);
931 		if (!bh)
932 			goto no_grow;
933 
934 		bh->b_bdev = NULL;
935 		bh->b_this_page = head;
936 		bh->b_blocknr = -1;
937 		head = bh;
938 
939 		bh->b_state = 0;
940 		atomic_set(&bh->b_count, 0);
941 		bh->b_private = NULL;
942 		bh->b_size = size;
943 
944 		/* Link the buffer to its page */
945 		set_bh_page(bh, page, offset);
946 
947 		init_buffer(bh, NULL, NULL);
948 	}
949 	return head;
950 /*
951  * In case anything failed, we just free everything we got.
952  */
953 no_grow:
954 	if (head) {
955 		do {
956 			bh = head;
957 			head = head->b_this_page;
958 			free_buffer_head(bh);
959 		} while (head);
960 	}
961 
962 	/*
963 	 * Return failure for non-async IO requests.  Async IO requests
964 	 * are not allowed to fail, so we have to wait until buffer heads
965 	 * become available.  But we don't want tasks sleeping with
966 	 * partially complete buffers, so all were released above.
967 	 */
968 	if (!retry)
969 		return NULL;
970 
971 	/* We're _really_ low on memory. Now we just
972 	 * wait for old buffer heads to become free due to
973 	 * finishing IO.  Since this is an async request and
974 	 * the reserve list is empty, we're sure there are
975 	 * async buffer heads in use.
976 	 */
977 	free_more_memory();
978 	goto try_again;
979 }
980 EXPORT_SYMBOL_GPL(alloc_page_buffers);
981 
982 static inline void
983 link_dev_buffers(struct page *page, struct buffer_head *head)
984 {
985 	struct buffer_head *bh, *tail;
986 
987 	bh = head;
988 	do {
989 		tail = bh;
990 		bh = bh->b_this_page;
991 	} while (bh);
992 	tail->b_this_page = head;
993 	attach_page_buffers(page, head);
994 }
995 
996 /*
997  * Initialise the state of a blockdev page's buffers.
998  */
999 static void
1000 init_page_buffers(struct page *page, struct block_device *bdev,
1001 			sector_t block, int size)
1002 {
1003 	struct buffer_head *head = page_buffers(page);
1004 	struct buffer_head *bh = head;
1005 	int uptodate = PageUptodate(page);
1006 
1007 	do {
1008 		if (!buffer_mapped(bh)) {
1009 			init_buffer(bh, NULL, NULL);
1010 			bh->b_bdev = bdev;
1011 			bh->b_blocknr = block;
1012 			if (uptodate)
1013 				set_buffer_uptodate(bh);
1014 			set_buffer_mapped(bh);
1015 		}
1016 		block++;
1017 		bh = bh->b_this_page;
1018 	} while (bh != head);
1019 }
1020 
1021 /*
1022  * Create the page-cache page that contains the requested block.
1023  *
1024  * This is used purely for blockdev mappings.
1025  */
1026 static struct page *
1027 grow_dev_page(struct block_device *bdev, sector_t block,
1028 		pgoff_t index, int size)
1029 {
1030 	struct inode *inode = bdev->bd_inode;
1031 	struct page *page;
1032 	struct buffer_head *bh;
1033 
1034 	page = find_or_create_page(inode->i_mapping, index,
1035 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1036 	if (!page)
1037 		return NULL;
1038 
1039 	BUG_ON(!PageLocked(page));
1040 
1041 	if (page_has_buffers(page)) {
1042 		bh = page_buffers(page);
1043 		if (bh->b_size == size) {
1044 			init_page_buffers(page, bdev, block, size);
1045 			return page;
1046 		}
1047 		if (!try_to_free_buffers(page))
1048 			goto failed;
1049 	}
1050 
1051 	/*
1052 	 * Allocate some buffers for this page
1053 	 */
1054 	bh = alloc_page_buffers(page, size, 0);
1055 	if (!bh)
1056 		goto failed;
1057 
1058 	/*
1059 	 * Link the page to the buffers and initialise them.  Take the
1060 	 * lock to be atomic wrt __find_get_block(), which does not
1061 	 * run under the page lock.
1062 	 */
1063 	spin_lock(&inode->i_mapping->private_lock);
1064 	link_dev_buffers(page, bh);
1065 	init_page_buffers(page, bdev, block, size);
1066 	spin_unlock(&inode->i_mapping->private_lock);
1067 	return page;
1068 
1069 failed:
1070 	BUG();
1071 	unlock_page(page);
1072 	page_cache_release(page);
1073 	return NULL;
1074 }
1075 
1076 /*
1077  * Create buffers for the specified block device block's page.  If
1078  * that page was dirty, the buffers are set dirty also.
1079  */
1080 static int
1081 grow_buffers(struct block_device *bdev, sector_t block, int size)
1082 {
1083 	struct page *page;
1084 	pgoff_t index;
1085 	int sizebits;
1086 
1087 	sizebits = -1;
1088 	do {
1089 		sizebits++;
1090 	} while ((size << sizebits) < PAGE_SIZE);
1091 
1092 	index = block >> sizebits;
1093 
1094 	/*
1095 	 * Check for a block which wants to lie outside our maximum possible
1096 	 * pagecache index.  (this comparison is done using sector_t types).
1097 	 */
1098 	if (unlikely(index != block >> sizebits)) {
1099 		char b[BDEVNAME_SIZE];
1100 
1101 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1102 			"device %s\n",
1103 			__func__, (unsigned long long)block,
1104 			bdevname(bdev, b));
1105 		return -EIO;
1106 	}
1107 	block = index << sizebits;
1108 	/* Create a page with the proper size buffers.. */
1109 	page = grow_dev_page(bdev, block, index, size);
1110 	if (!page)
1111 		return 0;
1112 	unlock_page(page);
1113 	page_cache_release(page);
1114 	return 1;
1115 }
1116 
1117 static struct buffer_head *
1118 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1119 {
1120 	/* Size must be multiple of hard sectorsize */
1121 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1122 			(size < 512 || size > PAGE_SIZE))) {
1123 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1124 					size);
1125 		printk(KERN_ERR "hardsect size: %d\n",
1126 					bdev_hardsect_size(bdev));
1127 
1128 		dump_stack();
1129 		return NULL;
1130 	}
1131  * Default synchronous end-of-IO handler.  Just mark it up-to-date and
1132 	for (;;) {
1133 		struct buffer_head * bh;
1134 		int ret;
1135 
1136 		bh = __find_get_block(bdev, block, size);
1137 		if (bh)
1138 			return bh;
1139 
1140 		ret = grow_buffers(bdev, block, size);
1141 		if (ret < 0)
1142 			return NULL;
1143 		if (ret == 0)
1144 			free_more_memory();
1145 	}
1146 }
1147 
1148 /*
1149  * The relationship between dirty buffers and dirty pages:
1150  *
1151  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1152  * the page is tagged dirty in its radix tree.
1153  *
1154  * At all times, the dirtiness of the buffers represents the dirtiness of
1155  * subsections of the page.  If the page has buffers, the page dirty bit is
1156  * merely a hint about the true dirty state.
1157  *
1158  * When a page is set dirty in its entirety, all its buffers are marked dirty
1159  * (if the page has buffers).
1160  *
1161  * When a buffer is marked dirty, its page is dirtied, but the page's other
1162  * buffers are not.
1163  *
1164  * Also.  When blockdev buffers are explicitly read with bread(), they
1165  * individually become uptodate.  But their backing page remains not
1166  * uptodate - even if all of its buffers are uptodate.  A subsequent
1167  * block_read_full_page() against that page will discover all the uptodate
1168  * buffers, will set the page uptodate and will perform no I/O.
1169  */
1170 
1171 /**
1172  * mark_buffer_dirty - mark a buffer_head as needing writeout
1173  * @bh: the buffer_head to mark dirty
1174  *
1175  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1176  * backing page dirty, then tag the page as dirty in its address_space's radix
1177  * tree and then attach the address_space's inode to its superblock's dirty
1178  * inode list.
1179  *
1180  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1181  * mapping->tree_lock and the global inode_lock.
1182  */
1183 void mark_buffer_dirty(struct buffer_head *bh)
1184 {
1185 	WARN_ON_ONCE(!buffer_uptodate(bh));
1186 
1187 	/*
1188 	 * Very *carefully* optimize the it-is-already-dirty case.
1189 	 *
1190 	 * Don't let the final "is it dirty" escape to before we
1191 	 * perhaps modified the buffer.
1192 	 */
1193 	if (buffer_dirty(bh)) {
1194 		smp_mb();
1195 		if (buffer_dirty(bh))
1196 			return;
1197 	}
1198 
1199 	if (!test_set_buffer_dirty(bh))
1200 		__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1201 }
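
/*
 * Editor's illustrative sketch: the usual read-modify-mark-dirty cycle for a
 * metadata block, ending in mark_buffer_dirty() above so writeback picks the
 * change up later.  example_update_u32() and the on-disk layout (a
 * little-endian u32 at 'offset') are hypothetical.
 */
static int example_update_u32(struct super_block *sb, sector_t block,
			      unsigned offset, u32 val)
{
	struct buffer_head *bh = sb_bread(sb, block);	/* read it, NULL on error */

	if (!bh)
		return -EIO;
	*(__le32 *)(bh->b_data + offset) = cpu_to_le32(val);
	mark_buffer_dirty(bh);		/* dirty the bh, its page and the inode */
	brelse(bh);			/* drop our ref; writeback will handle it */
	return 0;
}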
1202 
1203 /*
1204  * Decrement a buffer_head's reference count.  If all buffers against a page
1205  * have zero reference count, are clean and unlocked, and if the page is clean
1206  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1207  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1208  * a page but it ends up not being freed, and buffers may later be reattached).
1209  */
1210 void __brelse(struct buffer_head * buf)
1211 {
1212 	if (atomic_read(&buf->b_count)) {
1213 		put_bh(buf);
1214 		return;
1215 	}
1216 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1217 }
1218 
1219 /*
1220  * bforget() is like brelse(), except it discards any
1221  * potentially dirty data.
1222  */
1223 void __bforget(struct buffer_head *bh)
1224 {
1225 	clear_buffer_dirty(bh);
1226 	if (bh->b_assoc_map) {
1227 		struct address_space *buffer_mapping = bh->b_page->mapping;
1228 
1229 		spin_lock(&buffer_mapping->private_lock);
1230 		list_del_init(&bh->b_assoc_buffers);
1231 		bh->b_assoc_map = NULL;
1232 		spin_unlock(&buffer_mapping->private_lock);
1233 	}
1234 	__brelse(bh);
1235 }
1236 
1237 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1238 {
1239 	lock_buffer(bh);
1240 	if (buffer_uptodate(bh)) {
1241 		unlock_buffer(bh);
1242 		return bh;
1243 	} else {
1244 		get_bh(bh);
1245 		bh->b_end_io = end_buffer_read_sync;
1246 		submit_bh(READ, bh);
1247 		wait_on_buffer(bh);
1248 		if (buffer_uptodate(bh))
1249 			return bh;
1250 	}
1251 	brelse(bh);
1252 	return NULL;
1253 }
1254 
1255 /*
1256  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1257  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1258  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1259  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1260  * CPU's LRUs at the same time.
1261  *
1262  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1263  * sb_find_get_block().
1264  *
1265  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1266  * a local interrupt disable for that.
1267  */
1268 
1269 #define BH_LRU_SIZE	8
1270 
1271 struct bh_lru {
1272 	struct buffer_head *bhs[BH_LRU_SIZE];
1273 };
1274 
1275 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1276 
1277 #ifdef CONFIG_SMP
1278 #define bh_lru_lock()	local_irq_disable()
1279 #define bh_lru_unlock()	local_irq_enable()
1280 #else
1281 #define bh_lru_lock()	preempt_disable()
1282 #define bh_lru_unlock()	preempt_enable()
1283 #endif
1284 
1285 static inline void check_irqs_on(void)
1286 {
1287 #ifdef irqs_disabled
1288 	BUG_ON(irqs_disabled());
1289 #endif
1290 }
1291 
1292 /*
1293  * The LRU management algorithm is dopey-but-simple.  Sorry.
1294  */
1295 static void bh_lru_install(struct buffer_head *bh)
1296 {
1297 	struct buffer_head *evictee = NULL;
1298 	struct bh_lru *lru;
1299 
1300 	check_irqs_on();
1301 	bh_lru_lock();
1302 	lru = &__get_cpu_var(bh_lrus);
1303 	if (lru->bhs[0] != bh) {
1304 		struct buffer_head *bhs[BH_LRU_SIZE];
1305 		int in;
1306 		int out = 0;
1307 
1308 		get_bh(bh);
1309 		bhs[out++] = bh;
1310 		for (in = 0; in < BH_LRU_SIZE; in++) {
1311 			struct buffer_head *bh2 = lru->bhs[in];
1312 
1313 			if (bh2 == bh) {
1314 				__brelse(bh2);
1315 			} else {
1316 				if (out >= BH_LRU_SIZE) {
1317 					BUG_ON(evictee != NULL);
1318 					evictee = bh2;
1319 				} else {
1320 					bhs[out++] = bh2;
1321 				}
1322 			}
1323 		}
1324 		while (out < BH_LRU_SIZE)
1325 			bhs[out++] = NULL;
1326 		memcpy(lru->bhs, bhs, sizeof(bhs));
1327 	}
1328 	bh_lru_unlock();
1329 
1330 	if (evictee)
1331 		__brelse(evictee);
1332 }
1333 
1334 /*
1335  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1336  */
1337 static struct buffer_head *
1338 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1339 {
1340 	struct buffer_head *ret = NULL;
1341 	struct bh_lru *lru;
1342 	unsigned int i;
1343 
1344 	check_irqs_on();
1345 	bh_lru_lock();
1346 	lru = &__get_cpu_var(bh_lrus);
1347 	for (i = 0; i < BH_LRU_SIZE; i++) {
1348 		struct buffer_head *bh = lru->bhs[i];
1349 
1350 		if (bh && bh->b_bdev == bdev &&
1351 				bh->b_blocknr == block && bh->b_size == size) {
1352 			if (i) {
1353 				while (i) {
1354 					lru->bhs[i] = lru->bhs[i - 1];
1355 					i--;
1356 				}
1357 				lru->bhs[0] = bh;
1358 			}
1359 			get_bh(bh);
1360 			ret = bh;
1361 			break;
1362 		}
1363 	}
1364 	bh_lru_unlock();
1365 	return ret;
1366 }
1367 
1368 /*
1369  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1370  * it in the LRU and mark it as accessed.  If it is not present then return
1371  * NULL
1372  */
1373 struct buffer_head *
1374 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1375 {
1376 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1377 
1378 	if (bh == NULL) {
1379 		bh = __find_get_block_slow(bdev, block);
1380 		if (bh)
1381 			bh_lru_install(bh);
1382 	}
1383 	if (bh)
1384 		touch_buffer(bh);
1385 	return bh;
1386 }
1387 EXPORT_SYMBOL(__find_get_block);
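
/*
 * Editor's illustrative sketch: probing the cache for a block without
 * creating it, using __find_get_block() above.  Returns non-zero when the
 * block is cached and uptodate.  example_block_is_cached() is hypothetical.
 */
static int example_block_is_cached(struct block_device *bdev, sector_t block,
				   unsigned size)
{
	struct buffer_head *bh = __find_get_block(bdev, block, size);
	int ret = 0;

	if (bh) {
		ret = buffer_uptodate(bh);
		put_bh(bh);		/* drop the reference the lookup took */
	}
	return ret;
}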
1388 
1389 /*
1390  * __getblk will locate (and, if necessary, create) the buffer_head
1391  * which corresponds to the passed block_device, block and size. The
1392  * returned buffer has its reference count incremented.
1393  *
1394  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1395  * illegal block number, __getblk() will happily return a buffer_head
1396  * which represents the non-existent block.  Very weird.
1397  *
1398  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1399  * attempt is failing.  FIXME, perhaps?
1400  */
1401 struct buffer_head *
1402 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1403 {
1404 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1405 
1406 	might_sleep();
1407 	if (bh == NULL)
1408 		bh = __getblk_slow(bdev, block, size);
1409 	return bh;
1410 }
1411 EXPORT_SYMBOL(__getblk);
1412 
1413 /*
1414  * Do async read-ahead on a buffer.
1415  */
1416 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1417 {
1418 	struct buffer_head *bh = __getblk(bdev, block, size);
1419 	if (likely(bh)) {
1420 		ll_rw_block(READA, 1, &bh);
1421 		brelse(bh);
1422 	}
1423 }
1424 EXPORT_SYMBOL(__breadahead);
1425 
1426 /**
1427  *  __bread() - reads a specified block and returns the bh
1428  *  @bdev: the block_device to read from
1429  *  @block: number of block
1430  *  @size: size (in bytes) to read
1431  *
1432  *  Reads a specified block, and returns buffer head that contains it.
1433  *  It returns NULL if the block was unreadable.
1434  */
1435 struct buffer_head *
1436 __bread(struct block_device *bdev, sector_t block, unsigned size)
1437 {
1438 	struct buffer_head *bh = __getblk(bdev, block, size);
1439 
1440 	if (likely(bh) && !buffer_uptodate(bh))
1441 		bh = __bread_slow(bh);
1442 	return bh;
1443 }
1444 EXPORT_SYMBOL(__bread);
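
/*
 * Editor's illustrative sketch: read block N synchronously while kicking off
 * asynchronous read-ahead of block N+1, using __breadahead()/__bread() above.
 * example_read_pair() is a hypothetical helper; 'size' must match the block
 * size used for other access to this device.
 */
static struct buffer_head *example_read_pair(struct block_device *bdev,
					     sector_t block, unsigned size)
{
	__breadahead(bdev, block + 1, size);	/* async; its ref is dropped inside */
	return __bread(bdev, block, size);	/* NULL if the block was unreadable */
}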
1445 
1446 /*
1447  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1448  * This doesn't race because it runs in each cpu either in irq
1449  * or with preempt disabled.
1450  */
1451 static void invalidate_bh_lru(void *arg)
1452 {
1453 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1454 	int i;
1455 
1456 	for (i = 0; i < BH_LRU_SIZE; i++) {
1457 		brelse(b->bhs[i]);
1458 		b->bhs[i] = NULL;
1459 	}
1460 	put_cpu_var(bh_lrus);
1461 }
1462 
1463 void invalidate_bh_lrus(void)
1464 {
1465 	on_each_cpu(invalidate_bh_lru, NULL, 1);
1466 }
1467 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1468 
1469 void set_bh_page(struct buffer_head *bh,
1470 		struct page *page, unsigned long offset)
1471 {
1472 	bh->b_page = page;
1473 	BUG_ON(offset >= PAGE_SIZE);
1474 	if (PageHighMem(page))
1475 		/*
1476 		 * This catches illegal uses and preserves the offset:
1477 		 */
1478 		bh->b_data = (char *)(0 + offset);
1479 	else
1480 		bh->b_data = page_address(page) + offset;
1481 }
1482 EXPORT_SYMBOL(set_bh_page);
1483 
1484 /*
1485  * Called when truncating a buffer on a page completely.
1486  */
1487 static void discard_buffer(struct buffer_head * bh)
1488 {
1489 	lock_buffer(bh);
1490 	clear_buffer_dirty(bh);
1491 	bh->b_bdev = NULL;
1492 	clear_buffer_mapped(bh);
1493 	clear_buffer_req(bh);
1494 	clear_buffer_new(bh);
1495 	clear_buffer_delay(bh);
1496 	clear_buffer_unwritten(bh);
1497 	unlock_buffer(bh);
1498 }
1499 
1500 /**
1501  * block_invalidatepage - invalidate part or all of a buffer-backed page
1502  *
1503  * @page: the page which is affected
1504  * @offset: the index of the truncation point
1505  *
1506  * block_invalidatepage() is called when all or part of the page has become
1507  * invalidated by a truncate operation.
1508  *
1509  * block_invalidatepage() does not have to release all buffers, but it must
1510  * ensure that no dirty buffer is left outside @offset and that no I/O
1511  * is underway against any of the blocks which are outside the truncation
1512  * point, because the caller is about to free (and possibly reuse) those
1513  * blocks on-disk.
1514  */
1515 void block_invalidatepage(struct page *page, unsigned long offset)
1516 {
1517 	struct buffer_head *head, *bh, *next;
1518 	unsigned int curr_off = 0;
1519 
1520 	BUG_ON(!PageLocked(page));
1521 	if (!page_has_buffers(page))
1522 		goto out;
1523 
1524 	head = page_buffers(page);
1525 	bh = head;
1526 	do {
1527 		unsigned int next_off = curr_off + bh->b_size;
1528 		next = bh->b_this_page;
1529 
1530 		/*
1531 		 * is this block fully invalidated?
1532 		 */
1533 		if (offset <= curr_off)
1534 			discard_buffer(bh);
1535 		curr_off = next_off;
1536 		bh = next;
1537 	} while (bh != head);
1538 
1539 	/*
1540 	 * We release buffers only if the entire page is being invalidated.
1541 	 * The get_block cached value has been unconditionally invalidated,
1542 	 * so real IO is not possible anymore.
1543 	 */
1544 	if (offset == 0)
1545 		try_to_release_page(page, 0);
1546 out:
1547 	return;
1548 }
1549 EXPORT_SYMBOL(block_invalidatepage);
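
/*
 * Editor's illustrative sketch: a filesystem with no private per-page state
 * can point its ->invalidatepage address_space operation straight at the
 * helper above.  example_invalidate_aops is a hypothetical, partial table.
 */
static const struct address_space_operations example_invalidate_aops = {
	.invalidatepage	= block_invalidatepage,
};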
1550 
1551 /*
1552  * We attach and possibly dirty the buffers atomically wrt
1553  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1554  * is already excluded via the page lock.
1555  */
1556 void create_empty_buffers(struct page *page,
1557 			unsigned long blocksize, unsigned long b_state)
1558 {
1559 	struct buffer_head *bh, *head, *tail;
1560 
1561 	head = alloc_page_buffers(page, blocksize, 1);
1562 	bh = head;
1563 	do {
1564 		bh->b_state |= b_state;
1565 		tail = bh;
1566 		bh = bh->b_this_page;
1567 	} while (bh);
1568 	tail->b_this_page = head;
1569 
1570 	spin_lock(&page->mapping->private_lock);
1571 	if (PageUptodate(page) || PageDirty(page)) {
1572 		bh = head;
1573 		do {
1574 			if (PageDirty(page))
1575 				set_buffer_dirty(bh);
1576 			if (PageUptodate(page))
1577 				set_buffer_uptodate(bh);
1578 			bh = bh->b_this_page;
1579 		} while (bh != head);
1580 	}
1581 	attach_page_buffers(page, head);
1582 	spin_unlock(&page->mapping->private_lock);
1583 }
1584 EXPORT_SYMBOL(create_empty_buffers);
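
/*
 * Editor's illustrative sketch: per-block helpers typically make sure a page
 * has buffer_heads of the inode's block size before walking them, the way
 * __block_write_full_page() below does.  example_ensure_buffers() is a
 * hypothetical helper.
 */
static struct buffer_head *example_ensure_buffers(struct inode *inode,
						  struct page *page)
{
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
	return page_buffers(page);
}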
1585 
1586 /*
1587  * We are taking a block for data and we don't want any output from any
1588  * buffer-cache aliases from the moment this function returns and
1589  * until the moment when something explicitly marks the buffer
1590  * dirty (hopefully that will not happen until we free that block ;-)
1591  * We don't even need to mark it not-uptodate - nobody can expect
1592  * anything from a newly allocated buffer anyway. We used to use
1593  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594  * don't want to mark the alias unmapped, for example - it would confuse
1595  * anyone who might pick it up with bread() afterwards...
1596  *
1597  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1598  * be writeout I/O going on against recently-freed buffers.  We don't
1599  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600  * only if we really need to.  That happens here.
1601  */
1602 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1603 {
1604 	struct buffer_head *old_bh;
1605 
1606 	might_sleep();
1607 
1608 	old_bh = __find_get_block_slow(bdev, block);
1609 	if (old_bh) {
1610 		clear_buffer_dirty(old_bh);
1611 		wait_on_buffer(old_bh);
1612 		clear_buffer_req(old_bh);
1613 		__brelse(old_bh);
1614 	}
1615 }
1616 EXPORT_SYMBOL(unmap_underlying_metadata);
1617 
1618 /*
1619  * NOTE! All mapped/uptodate combinations are valid:
1620  *
1621  *	Mapped	Uptodate	Meaning
1622  *
1623  *	No	No		"unknown" - must do get_block()
1624  *	No	Yes		"hole" - zero-filled
1625  *	Yes	No		"allocated" - allocated on disk, not read in
1626  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1627  *
1628  * "Dirty" is valid only with the last case (mapped+uptodate).
1629  */
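
/*
 * Editor's illustrative sketch of the state table above: classify a buffer
 * and decide whether get_block() and/or a read is still needed before its
 * contents can be trusted.  example_classify() and its return codes are
 * hypothetical.
 */
static int example_classify(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return buffer_uptodate(bh) ? 1	/* "hole" - zero-filled */
					   : 0;	/* "unknown" - needs get_block() */
	return buffer_uptodate(bh) ? 3		/* "valid" - usable in memory */
				   : 2;		/* "allocated" - needs a read */
}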
1630 
1631 /*
1632  * While block_write_full_page is writing back the dirty buffers under
1633  * the page lock, whoever dirtied the buffers may decide to clean them
1634  * again at any time.  We handle that by only looking at the buffer
1635  * state inside lock_buffer().
1636  *
1637  * If block_write_full_page() is called for regular writeback
1638  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1639  * locked buffer.   This only can happen if someone has written the buffer
1640  * directly, with submit_bh().  At the address_space level PageWriteback
1641  * prevents this contention from occurring.
1642  */
1643 static int __block_write_full_page(struct inode *inode, struct page *page,
1644 			get_block_t *get_block, struct writeback_control *wbc)
1645 {
1646 	int err;
1647 	sector_t block;
1648 	sector_t last_block;
1649 	struct buffer_head *bh, *head;
1650 	const unsigned blocksize = 1 << inode->i_blkbits;
1651 	int nr_underway = 0;
1652 
1653 	BUG_ON(!PageLocked(page));
1654 
1655 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1656 
1657 	if (!page_has_buffers(page)) {
1658 		create_empty_buffers(page, blocksize,
1659 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1660 	}
1661 
1662 	/*
1663 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1664 	 * here, and the (potentially unmapped) buffers may become dirty at
1665 	 * any time.  If a buffer becomes dirty here after we've inspected it
1666 	 * then we just miss that fact, and the page stays dirty.
1667 	 *
1668 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1669 	 * handle that here by just cleaning them.
1670 	 */
1671 
1672 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1673 	head = page_buffers(page);
1674 	bh = head;
1675 
1676 	/*
1677 	 * Get all the dirty buffers mapped to disk addresses and
1678 	 * handle any aliases from the underlying blockdev's mapping.
1679 	 */
1680 	do {
1681 		if (block > last_block) {
1682 			/*
1683 			 * mapped buffers outside i_size will occur, because
1684 			 * this page can be outside i_size when there is a
1685 			 * truncate in progress.
1686 			 */
1687 			/*
1688 			 * The buffer was zeroed by block_write_full_page()
1689 			 */
1690 			clear_buffer_dirty(bh);
1691 			set_buffer_uptodate(bh);
1692 		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1693 			   buffer_dirty(bh)) {
1694 			WARN_ON(bh->b_size != blocksize);
1695 			err = get_block(inode, block, bh, 1);
1696 			if (err)
1697 				goto recover;
1698 			clear_buffer_delay(bh);
1699 			if (buffer_new(bh)) {
1700 				/* blockdev mappings never come here */
1701 				clear_buffer_new(bh);
1702 				unmap_underlying_metadata(bh->b_bdev,
1703 							bh->b_blocknr);
1704 			}
1705 		}
1706 		bh = bh->b_this_page;
1707 		block++;
1708 	} while (bh != head);
1709 
1710 	do {
1711 		if (!buffer_mapped(bh))
1712 			continue;
1713 		/*
1714 		 * If it's a fully non-blocking write attempt and we cannot
1715 		 * lock the buffer then redirty the page.  Note that this can
1716 		 * potentially cause a busy-wait loop from pdflush and kswapd
1717 		 * activity, but those code paths have their own higher-level
1718 		 * throttling.
1719 		 */
1720 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 			lock_buffer(bh);
1722 		} else if (!trylock_buffer(bh)) {
1723 			redirty_page_for_writepage(wbc, page);
1724 			continue;
1725 		}
1726 		if (test_clear_buffer_dirty(bh)) {
1727 			mark_buffer_async_write(bh);
1728 		} else {
1729 			unlock_buffer(bh);
1730 		}
1731 	} while ((bh = bh->b_this_page) != head);
1732 
1733 	/*
1734 	 * The page and its buffers are protected by PageWriteback(), so we can
1735 	 * drop the bh refcounts early.
1736 	 */
1737 	BUG_ON(PageWriteback(page));
1738 	set_page_writeback(page);
1739 
1740 	do {
1741 		struct buffer_head *next = bh->b_this_page;
1742 		if (buffer_async_write(bh)) {
1743 			submit_bh(WRITE, bh);
1744 			nr_underway++;
1745 		}
1746 		bh = next;
1747 	} while (bh != head);
1748 	unlock_page(page);
1749 
1750 	err = 0;
1751 done:
1752 	if (nr_underway == 0) {
1753 		/*
1754 		 * The page was marked dirty, but the buffers were
1755 		 * clean.  Someone wrote them back by hand with
1756 		 * ll_rw_block/submit_bh.  A rare case.
1757 		 */
1758 		end_page_writeback(page);
1759 
1760 		/*
1761 		 * The page and buffer_heads can be released at any time from
1762 		 * here on.
1763 		 */
1764 	}
1765 	return err;
1766 
1767 recover:
1768 	/*
1769 	 * ENOSPC, or some other error.  We may already have added some
1770 	 * blocks to the file, so we need to write these out to avoid
1771 	 * exposing stale data.
1772 	 * The page is currently locked and not marked for writeback
1773 	 */
1774 	bh = head;
1775 	/* Recovery: lock and submit the mapped buffers */
1776 	do {
1777 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1778 		    !buffer_delay(bh)) {
1779 			lock_buffer(bh);
1780 			mark_buffer_async_write(bh);
1781 		} else {
1782 			/*
1783 			 * The buffer may have been set dirty during
1784 			 * attachment to a dirty page.
1785 			 */
1786 			clear_buffer_dirty(bh);
1787 		}
1788 	} while ((bh = bh->b_this_page) != head);
1789 	SetPageError(page);
1790 	BUG_ON(PageWriteback(page));
1791 	mapping_set_error(page->mapping, err);
1792 	set_page_writeback(page);
1793 	do {
1794 		struct buffer_head *next = bh->b_this_page;
1795 		if (buffer_async_write(bh)) {
1796 			clear_buffer_dirty(bh);
1797 			submit_bh(WRITE, bh);
1798 			nr_underway++;
1799 		}
1800 		bh = next;
1801 	} while (bh != head);
1802 	unlock_page(page);
1803 	goto done;
1804 }
1805 
1806 /*
1807  * If a page has any new buffers, zero them out here, and mark them uptodate
1808  * and dirty so they'll be written out (in order to prevent uninitialised
1809  * block data from leaking). And clear the new bit.
1810  */
1811 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1812 {
1813 	unsigned int block_start, block_end;
1814 	struct buffer_head *head, *bh;
1815 
1816 	BUG_ON(!PageLocked(page));
1817 	if (!page_has_buffers(page))
1818 		return;
1819 
1820 	bh = head = page_buffers(page);
1821 	block_start = 0;
1822 	do {
1823 		block_end = block_start + bh->b_size;
1824 
1825 		if (buffer_new(bh)) {
1826 			if (block_end > from && block_start < to) {
1827 				if (!PageUptodate(page)) {
1828 					unsigned start, size;
1829 
1830 					start = max(from, block_start);
1831 					size = min(to, block_end) - start;
1832 
1833 					zero_user(page, start, size);
1834 					set_buffer_uptodate(bh);
1835 				}
1836 
1837 				clear_buffer_new(bh);
1838 				mark_buffer_dirty(bh);
1839 			}
1840 		}
1841 
1842 		block_start = block_end;
1843 		bh = bh->b_this_page;
1844 	} while (bh != head);
1845 }
1846 EXPORT_SYMBOL(page_zero_new_buffers);
1847 
1848 static int __block_prepare_write(struct inode *inode, struct page *page,
1849 		unsigned from, unsigned to, get_block_t *get_block)
1850 {
1851 	unsigned block_start, block_end;
1852 	sector_t block;
1853 	int err = 0;
1854 	unsigned blocksize, bbits;
1855 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1856 
1857 	BUG_ON(!PageLocked(page));
1858 	BUG_ON(from > PAGE_CACHE_SIZE);
1859 	BUG_ON(to > PAGE_CACHE_SIZE);
1860 	BUG_ON(from > to);
1861 
1862 	blocksize = 1 << inode->i_blkbits;
1863 	if (!page_has_buffers(page))
1864 		create_empty_buffers(page, blocksize, 0);
1865 	head = page_buffers(page);
1866 
1867 	bbits = inode->i_blkbits;
1868 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1869 
1870 	for(bh = head, block_start = 0; bh != head || !block_start;
1871 	    block++, block_start=block_end, bh = bh->b_this_page) {
1872 		block_end = block_start + blocksize;
1873 		if (block_end <= from || block_start >= to) {
1874 			if (PageUptodate(page)) {
1875 				if (!buffer_uptodate(bh))
1876 					set_buffer_uptodate(bh);
1877 			}
1878 			continue;
1879 		}
1880 		if (buffer_new(bh))
1881 			clear_buffer_new(bh);
1882 		if (!buffer_mapped(bh)) {
1883 			WARN_ON(bh->b_size != blocksize);
1884 			err = get_block(inode, block, bh, 1);
1885 			if (err)
1886 				break;
1887 			if (buffer_new(bh)) {
1888 				unmap_underlying_metadata(bh->b_bdev,
1889 							bh->b_blocknr);
1890 				if (PageUptodate(page)) {
1891 					clear_buffer_new(bh);
1892 					set_buffer_uptodate(bh);
1893 					mark_buffer_dirty(bh);
1894 					continue;
1895 				}
1896 				if (block_end > to || block_start < from)
1897 					zero_user_segments(page,
1898 						to, block_end,
1899 						block_start, from);
1900 				continue;
1901 			}
1902 		}
1903 		if (PageUptodate(page)) {
1904 			if (!buffer_uptodate(bh))
1905 				set_buffer_uptodate(bh);
1906 			continue;
1907 		}
1908 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1909 		    !buffer_unwritten(bh) &&
1910 		     (block_start < from || block_end > to)) {
1911 			ll_rw_block(READ, 1, &bh);
1912 			*wait_bh++=bh;
1913 		}
1914 	}
1915 	/*
1916 	 * If we issued read requests - let them complete.
1917 	 */
1918 	while(wait_bh > wait) {
1919 		wait_on_buffer(*--wait_bh);
1920 		if (!buffer_uptodate(*wait_bh))
1921 			err = -EIO;
1922 	}
1923 	if (unlikely(err))
1924 		page_zero_new_buffers(page, from, to);
1925 	return err;
1926 }
1927 
1928 static int __block_commit_write(struct inode *inode, struct page *page,
1929 		unsigned from, unsigned to)
1930 {
1931 	unsigned block_start, block_end;
1932 	int partial = 0;
1933 	unsigned blocksize;
1934 	struct buffer_head *bh, *head;
1935 
1936 	blocksize = 1 << inode->i_blkbits;
1937 
1938 	for(bh = head = page_buffers(page), block_start = 0;
1939 	    bh != head || !block_start;
1940 	    block_start=block_end, bh = bh->b_this_page) {
1941 		block_end = block_start + blocksize;
1942 		if (block_end <= from || block_start >= to) {
1943 			if (!buffer_uptodate(bh))
1944 				partial = 1;
1945 		} else {
1946 			set_buffer_uptodate(bh);
1947 			mark_buffer_dirty(bh);
1948 		}
1949 		clear_buffer_new(bh);
1950 	}
1951 
1952 	/*
1953 	 * If this is a partial write which happened to make all buffers
1954 	 * uptodate then we can optimize away a bogus readpage() for
1955 	 * the next read(). Here we 'discover' whether the page went
1956 	 * uptodate as a result of this (potentially partial) write.
1957 	 */
1958 	if (!partial)
1959 		SetPageUptodate(page);
1960 	return 0;
1961 }
1962 
1963 /*
1964  * block_write_begin takes care of the basic task of block allocation and
1965  * bringing partial write blocks uptodate first.
1966  *
1967  * If *pagep is not NULL, then block_write_begin uses the locked page
1968  * at *pagep rather than allocating its own. In this case, the page will
1969  * not be unlocked or deallocated on failure.
1970  */
1971 int block_write_begin(struct file *file, struct address_space *mapping,
1972 			loff_t pos, unsigned len, unsigned flags,
1973 			struct page **pagep, void **fsdata,
1974 			get_block_t *get_block)
1975 {
1976 	struct inode *inode = mapping->host;
1977 	int status = 0;
1978 	struct page *page;
1979 	pgoff_t index;
1980 	unsigned start, end;
1981 	int ownpage = 0;
1982 
1983 	index = pos >> PAGE_CACHE_SHIFT;
1984 	start = pos & (PAGE_CACHE_SIZE - 1);
1985 	end = start + len;
1986 
1987 	page = *pagep;
1988 	if (page == NULL) {
1989 		ownpage = 1;
1990 		page = __grab_cache_page(mapping, index);
1991 		if (!page) {
1992 			status = -ENOMEM;
1993 			goto out;
1994 		}
1995 		*pagep = page;
1996 	} else
1997 		BUG_ON(!PageLocked(page));
1998 
1999 	status = __block_prepare_write(inode, page, start, end, get_block);
2000 	if (unlikely(status)) {
2001 		ClearPageUptodate(page);
2002 
2003 		if (ownpage) {
2004 			unlock_page(page);
2005 			page_cache_release(page);
2006 			*pagep = NULL;
2007 
2008 			/*
2009 			 * prepare_write() may have instantiated a few blocks
2010 			 * outside i_size.  Trim these off again. Don't need
2011 			 * i_size_read because we hold i_mutex.
2012 			 */
2013 			if (pos + len > inode->i_size)
2014 				vmtruncate(inode, inode->i_size);
2015 		}
2016 		goto out;
2017 	}
2018 
2019 out:
2020 	return status;
2021 }
2022 EXPORT_SYMBOL(block_write_begin);
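
/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * ->write_begin commonly just forwards here with its own get_block
 * routine.  "myfs_write_begin" and "myfs_get_block" are hypothetical
 * names; myfs_get_block is assumed to be a get_block_t implementation.
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		*pagep = NULL;
 *		return block_write_begin(file, mapping, pos, len, flags,
 *					 pagep, fsdata, myfs_get_block);
 *	}
 */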
2023 
2024 int block_write_end(struct file *file, struct address_space *mapping,
2025 			loff_t pos, unsigned len, unsigned copied,
2026 			struct page *page, void *fsdata)
2027 {
2028 	struct inode *inode = mapping->host;
2029 	unsigned start;
2030 
2031 	start = pos & (PAGE_CACHE_SIZE - 1);
2032 
2033 	if (unlikely(copied < len)) {
2034 		/*
2035 		 * The buffers that were written will now be uptodate, so we
2036 		 * don't have to worry about a readpage reading them and
2037 		 * overwriting a partial write. However if we have encountered
2038 		 * a short write and only partially written into a buffer, it
2039 		 * will not be marked uptodate, so a readpage might come in and
2040 		 * destroy our partial write.
2041 		 *
2042 		 * Do the simplest thing, and just treat any short write to a
2043 		 * non uptodate page as a zero-length write, and force the
2044 		 * caller to redo the whole thing.
2045 		 */
2046 		if (!PageUptodate(page))
2047 			copied = 0;
2048 
2049 		page_zero_new_buffers(page, start+copied, start+len);
2050 	}
2051 	flush_dcache_page(page);
2052 
2053 	/* This could be a short (even 0-length) commit */
2054 	__block_commit_write(inode, page, start, start+copied);
2055 
2056 	return copied;
2057 }
2058 EXPORT_SYMBOL(block_write_end);
2059 
2060 int generic_write_end(struct file *file, struct address_space *mapping,
2061 			loff_t pos, unsigned len, unsigned copied,
2062 			struct page *page, void *fsdata)
2063 {
2064 	struct inode *inode = mapping->host;
2065 	int i_size_changed = 0;
2066 
2067 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2068 
2069 	/*
2070 	 * No need to use i_size_read() here, the i_size
2071 	 * cannot change under us because we hold i_mutex.
2072 	 *
2073 	 * But it's important to update i_size while still holding page lock:
2074 	 * page writeout could otherwise come in and zero beyond i_size.
2075 	 */
2076 	if (pos+copied > inode->i_size) {
2077 		i_size_write(inode, pos+copied);
2078 		i_size_changed = 1;
2079 	}
2080 
2081 	unlock_page(page);
2082 	page_cache_release(page);
2083 
2084 	/*
2085 	 * Don't mark the inode dirty under the page lock. First, it unnecessarily
2086 	 * lengthens the time the page lock is held. Second, it would force a lock
2087 	 * ordering of page lock before transaction start for journaling
2088 	 * filesystems.
2089 	 */
2090 	if (i_size_changed)
2091 		mark_inode_dirty(inode);
2092 
2093 	return copied;
2094 }
2095 EXPORT_SYMBOL(generic_write_end);
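
/*
 * Illustrative sketch (not part of the original source): generic_write_end()
 * is normally paired with a block_write_begin()-based ->write_begin in the
 * filesystem's address_space_operations.  All "myfs_*" names below are
 * hypothetical.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.write_begin	= myfs_write_begin,
 *		.write_end	= generic_write_end,
 *		.bmap		= myfs_bmap,
 *	};
 */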
2096 
2097 /*
2098  * block_is_partially_uptodate checks whether buffers within a page are
2099  * uptodate or not.
2100  *
2101  * Returns true if all buffers which correspond to a file portion
2102  * we want to read are uptodate.
2103  */
2104 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2105 					unsigned long from)
2106 {
2107 	struct inode *inode = page->mapping->host;
2108 	unsigned block_start, block_end, blocksize;
2109 	unsigned to;
2110 	struct buffer_head *bh, *head;
2111 	int ret = 1;
2112 
2113 	if (!page_has_buffers(page))
2114 		return 0;
2115 
2116 	blocksize = 1 << inode->i_blkbits;
2117 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2118 	to = from + to;
2119 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2120 		return 0;
2121 
2122 	head = page_buffers(page);
2123 	bh = head;
2124 	block_start = 0;
2125 	do {
2126 		block_end = block_start + blocksize;
2127 		if (block_end > from && block_start < to) {
2128 			if (!buffer_uptodate(bh)) {
2129 				ret = 0;
2130 				break;
2131 			}
2132 			if (block_end >= to)
2133 				break;
2134 		}
2135 		block_start = block_end;
2136 		bh = bh->b_this_page;
2137 	} while (bh != head);
2138 
2139 	return ret;
2140 }
2141 EXPORT_SYMBOL(block_is_partially_uptodate);
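
/*
 * Illustrative sketch (not part of the original source): filesystems opt in
 * to this check by pointing the ->is_partially_uptodate hook of their
 * address_space_operations at it, e.g.
 *
 *	.is_partially_uptodate	= block_is_partially_uptodate,
 */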
2142 
2143 /*
2144  * Generic "read page" function for block devices that have the normal
2145  * get_block functionality, which covers most block-device-backed filesystems.
2146  * Reads the page asynchronously --- the unlock_buffer() and
2147  * set/clear_buffer_uptodate() functions propagate buffer state into the
2148  * page struct once IO has completed.
2149  */
2150 int block_read_full_page(struct page *page, get_block_t *get_block)
2151 {
2152 	struct inode *inode = page->mapping->host;
2153 	sector_t iblock, lblock;
2154 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2155 	unsigned int blocksize;
2156 	int nr, i;
2157 	int fully_mapped = 1;
2158 
2159 	BUG_ON(!PageLocked(page));
2160 	blocksize = 1 << inode->i_blkbits;
2161 	if (!page_has_buffers(page))
2162 		create_empty_buffers(page, blocksize, 0);
2163 	head = page_buffers(page);
2164 
2165 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2166 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2167 	bh = head;
2168 	nr = 0;
2169 	i = 0;
2170 
2171 	do {
2172 		if (buffer_uptodate(bh))
2173 			continue;
2174 
2175 		if (!buffer_mapped(bh)) {
2176 			int err = 0;
2177 
2178 			fully_mapped = 0;
2179 			if (iblock < lblock) {
2180 				WARN_ON(bh->b_size != blocksize);
2181 				err = get_block(inode, iblock, bh, 0);
2182 				if (err)
2183 					SetPageError(page);
2184 			}
2185 			if (!buffer_mapped(bh)) {
2186 				zero_user(page, i * blocksize, blocksize);
2187 				if (!err)
2188 					set_buffer_uptodate(bh);
2189 				continue;
2190 			}
2191 			/*
2192 			 * get_block() might have updated the buffer
2193 			 * synchronously
2194 			 */
2195 			if (buffer_uptodate(bh))
2196 				continue;
2197 		}
2198 		arr[nr++] = bh;
2199 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2200 
2201 	if (fully_mapped)
2202 		SetPageMappedToDisk(page);
2203 
2204 	if (!nr) {
2205 		/*
2206 		 * All buffers are uptodate - we can set the page uptodate
2207 		 * as well. But not if get_block() returned an error.
2208 		 */
2209 		if (!PageError(page))
2210 			SetPageUptodate(page);
2211 		unlock_page(page);
2212 		return 0;
2213 	}
2214 
2215 	/* Stage two: lock the buffers */
2216 	for (i = 0; i < nr; i++) {
2217 		bh = arr[i];
2218 		lock_buffer(bh);
2219 		mark_buffer_async_read(bh);
2220 	}
2221 
2222 	/*
2223 	 * Stage 3: start the IO.  Check for uptodateness
2224 	 * inside the buffer lock in case another process reading
2225 	 * the underlying blockdev brought it uptodate (the sct fix).
2226 	 */
2227 	for (i = 0; i < nr; i++) {
2228 		bh = arr[i];
2229 		if (buffer_uptodate(bh))
2230 			end_buffer_async_read(bh, 1);
2231 		else
2232 			submit_bh(READ, bh);
2233 	}
2234 	return 0;
2235 }
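
/*
 * Illustrative sketch (not part of the original source): a ->readpage built
 * on block_read_full_page() is typically a one-line wrapper, here with a
 * hypothetical myfs_get_block:
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 */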
2236 
2237 /* utility function for filesystems that need to do work on expanding
2238  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2239  * deal with the hole.
2240  */
2241 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2242 {
2243 	struct address_space *mapping = inode->i_mapping;
2244 	struct page *page;
2245 	void *fsdata;
2246 	unsigned long limit;
2247 	int err;
2248 
2249 	err = -EFBIG;
2250 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2251 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2252 		send_sig(SIGXFSZ, current, 0);
2253 		goto out;
2254 	}
2255 	if (size > inode->i_sb->s_maxbytes)
2256 		goto out;
2257 
2258 	err = pagecache_write_begin(NULL, mapping, size, 0,
2259 				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2260 				&page, &fsdata);
2261 	if (err)
2262 		goto out;
2263 
2264 	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2265 	BUG_ON(err > 0);
2266 
2267 out:
2268 	return err;
2269 }
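
/*
 * Illustrative sketch (not part of the original source): an expanding
 * truncate in a filesystem's ->setattr might use this helper before doing
 * the rest of its size change, roughly:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (err)
 *			return err;
 *	}
 */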
2270 
2271 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2272 			    loff_t pos, loff_t *bytes)
2273 {
2274 	struct inode *inode = mapping->host;
2275 	unsigned blocksize = 1 << inode->i_blkbits;
2276 	struct page *page;
2277 	void *fsdata;
2278 	pgoff_t index, curidx;
2279 	loff_t curpos;
2280 	unsigned zerofrom, offset, len;
2281 	int err = 0;
2282 
2283 	index = pos >> PAGE_CACHE_SHIFT;
2284 	offset = pos & ~PAGE_CACHE_MASK;
2285 
2286 	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2287 		zerofrom = curpos & ~PAGE_CACHE_MASK;
2288 		if (zerofrom & (blocksize-1)) {
2289 			*bytes |= (blocksize-1);
2290 			(*bytes)++;
2291 		}
2292 		len = PAGE_CACHE_SIZE - zerofrom;
2293 
2294 		err = pagecache_write_begin(file, mapping, curpos, len,
2295 						AOP_FLAG_UNINTERRUPTIBLE,
2296 						&page, &fsdata);
2297 		if (err)
2298 			goto out;
2299 		zero_user(page, zerofrom, len);
2300 		err = pagecache_write_end(file, mapping, curpos, len, len,
2301 						page, fsdata);
2302 		if (err < 0)
2303 			goto out;
2304 		BUG_ON(err != len);
2305 		err = 0;
2306 
2307 		balance_dirty_pages_ratelimited(mapping);
2308 	}
2309 
2310 	/* page covers the boundary, find the boundary offset */
2311 	if (index == curidx) {
2312 		zerofrom = curpos & ~PAGE_CACHE_MASK;
2313 		/* if we will expand the thing, the last block will be filled */
2314 		if (offset <= zerofrom) {
2315 			goto out;
2316 		}
2317 		if (zerofrom & (blocksize-1)) {
2318 			*bytes |= (blocksize-1);
2319 			(*bytes)++;
2320 		}
2321 		len = offset - zerofrom;
2322 
2323 		err = pagecache_write_begin(file, mapping, curpos, len,
2324 						AOP_FLAG_UNINTERRUPTIBLE,
2325 						&page, &fsdata);
2326 		if (err)
2327 			goto out;
2328 		zero_user(page, zerofrom, len);
2329 		err = pagecache_write_end(file, mapping, curpos, len, len,
2330 						page, fsdata);
2331 		if (err < 0)
2332 			goto out;
2333 		BUG_ON(err != len);
2334 		err = 0;
2335 	}
2336 out:
2337 	return err;
2338 }
2339 
2340 /*
2341  * For moronic filesystems that do not allow holes in files.
2342  * We may have to extend the file.
2343  */
2344 int cont_write_begin(struct file *file, struct address_space *mapping,
2345 			loff_t pos, unsigned len, unsigned flags,
2346 			struct page **pagep, void **fsdata,
2347 			get_block_t *get_block, loff_t *bytes)
2348 {
2349 	struct inode *inode = mapping->host;
2350 	unsigned blocksize = 1 << inode->i_blkbits;
2351 	unsigned zerofrom;
2352 	int err;
2353 
2354 	err = cont_expand_zero(file, mapping, pos, bytes);
2355 	if (err)
2356 		goto out;
2357 
2358 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2359 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360 		*bytes |= (blocksize-1);
2361 		(*bytes)++;
2362 	}
2363 
2364 	*pagep = NULL;
2365 	err = block_write_begin(file, mapping, pos, len,
2366 				flags, pagep, fsdata, get_block);
2367 out:
2368 	return err;
2369 }
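
/*
 * Illustrative sketch (not part of the original source): a hole-less
 * filesystem keeps its "initialised up to here" byte count in per-inode
 * private data and passes a pointer to it as @bytes.  "MYFS_I",
 * "mmu_private" and "myfs_get_block" are hypothetical names.
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return cont_write_begin(file, mapping, pos, len, flags,
 *					pagep, fsdata, myfs_get_block,
 *					&MYFS_I(mapping->host)->mmu_private);
 *	}
 */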
2370 
2371 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2372 			get_block_t *get_block)
2373 {
2374 	struct inode *inode = page->mapping->host;
2375 	int err = __block_prepare_write(inode, page, from, to, get_block);
2376 	if (err)
2377 		ClearPageUptodate(page);
2378 	return err;
2379 }
2380 
2381 int block_commit_write(struct page *page, unsigned from, unsigned to)
2382 {
2383 	struct inode *inode = page->mapping->host;
2384 	__block_commit_write(inode,page,from,to);
2385 	return 0;
2386 }
2387 
2388 /*
2389  * block_page_mkwrite() is not allowed to change the file size as it gets
2390  * called from a page fault handler when a page is first dirtied. Hence we must
2391  * be careful to check for EOF conditions here. We set the page up correctly
2392  * for a written page which means we get ENOSPC checking when writing into
2393  * holes and correct delalloc and unwritten extent mapping on filesystems that
2394  * support these features.
2395  *
2396  * We are not allowed to take the i_mutex here so we have to play games to
2397  * protect against truncate races as the page could now be beyond EOF.  Because
2398  * vmtruncate() writes the inode size before removing pages, once we have the
2399  * page lock we can determine safely if the page is beyond EOF. If it is not
2400  * beyond EOF, then the page is guaranteed safe against truncation until we
2401  * unlock the page.
2402  */
2403 int
2404 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2405 		   get_block_t get_block)
2406 {
2407 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2408 	unsigned long end;
2409 	loff_t size;
2410 	int ret = -EINVAL;
2411 
2412 	lock_page(page);
2413 	size = i_size_read(inode);
2414 	if ((page->mapping != inode->i_mapping) ||
2415 	    (page_offset(page) > size)) {
2416 		/* page got truncated out from underneath us */
2417 		goto out_unlock;
2418 	}
2419 
2420 	/* page is wholly or partially inside EOF */
2421 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422 		end = size & ~PAGE_CACHE_MASK;
2423 	else
2424 		end = PAGE_CACHE_SIZE;
2425 
2426 	ret = block_prepare_write(page, 0, end, get_block);
2427 	if (!ret)
2428 		ret = block_commit_write(page, 0, end);
2429 
2430 out_unlock:
2431 	unlock_page(page);
2432 	return ret;
2433 }
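
/*
 * Illustrative sketch (not part of the original source): a ->page_mkwrite
 * handler in a filesystem's vm_operations_struct usually just forwards
 * here, with a hypothetical myfs_get_block:
 *
 *	static int myfs_page_mkwrite(struct vm_area_struct *vma,
 *				     struct page *page)
 *	{
 *		return block_page_mkwrite(vma, page, myfs_get_block);
 *	}
 */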
2434 
2435 /*
2436  * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2437  * immediately, while under the page lock.  So it needs a special end_io
2438  * handler which does not touch the bh after unlocking it.
2439  */
2440 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2441 {
2442 	__end_buffer_read_notouch(bh, uptodate);
2443 }
2444 
2445 /*
2446  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2447  * the page (converting it to circular linked list and taking care of page
2448  * dirty races).
2449  */
2450 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2451 {
2452 	struct buffer_head *bh;
2453 
2454 	BUG_ON(!PageLocked(page));
2455 
2456 	spin_lock(&page->mapping->private_lock);
2457 	bh = head;
2458 	do {
2459 		if (PageDirty(page))
2460 			set_buffer_dirty(bh);
2461 		if (!bh->b_this_page)
2462 			bh->b_this_page = head;
2463 		bh = bh->b_this_page;
2464 	} while (bh != head);
2465 	attach_page_buffers(page, head);
2466 	spin_unlock(&page->mapping->private_lock);
2467 }
2468 
2469 /*
2470  * On entry, the page is fully not uptodate.
2471  * On exit the page is fully uptodate in the areas outside (from,to)
2472  */
2473 int nobh_write_begin(struct file *file, struct address_space *mapping,
2474 			loff_t pos, unsigned len, unsigned flags,
2475 			struct page **pagep, void **fsdata,
2476 			get_block_t *get_block)
2477 {
2478 	struct inode *inode = mapping->host;
2479 	const unsigned blkbits = inode->i_blkbits;
2480 	const unsigned blocksize = 1 << blkbits;
2481 	struct buffer_head *head, *bh;
2482 	struct page *page;
2483 	pgoff_t index;
2484 	unsigned from, to;
2485 	unsigned block_in_page;
2486 	unsigned block_start, block_end;
2487 	sector_t block_in_file;
2488 	int nr_reads = 0;
2489 	int ret = 0;
2490 	int is_mapped_to_disk = 1;
2491 
2492 	index = pos >> PAGE_CACHE_SHIFT;
2493 	from = pos & (PAGE_CACHE_SIZE - 1);
2494 	to = from + len;
2495 
2496 	page = __grab_cache_page(mapping, index);
2497 	if (!page)
2498 		return -ENOMEM;
2499 	*pagep = page;
2500 	*fsdata = NULL;
2501 
2502 	if (page_has_buffers(page)) {
2503 		unlock_page(page);
2504 		page_cache_release(page);
2505 		*pagep = NULL;
2506 		return block_write_begin(file, mapping, pos, len, flags, pagep,
2507 					fsdata, get_block);
2508 	}
2509 
2510 	if (PageMappedToDisk(page))
2511 		return 0;
2512 
2513 	/*
2514 	 * Allocate buffers so that we can keep track of state, and potentially
2515 	 * attach them to the page if an error occurs. In the common case of
2516 	 * no error, they will just be freed again without ever being attached
2517 	 * to the page (which is all OK, because we're under the page lock).
2518 	 *
2519 	 * Be careful: the buffer linked list is a NULL terminated one, rather
2520 	 * than the circular one we're used to.
2521 	 */
2522 	head = alloc_page_buffers(page, blocksize, 0);
2523 	if (!head) {
2524 		ret = -ENOMEM;
2525 		goto out_release;
2526 	}
2527 
2528 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2529 
2530 	/*
2531 	 * We loop across all blocks in the page, whether or not they are
2532 	 * part of the affected region.  This is so we can discover if the
2533 	 * page is fully mapped-to-disk.
2534 	 */
2535 	for (block_start = 0, block_in_page = 0, bh = head;
2536 		  block_start < PAGE_CACHE_SIZE;
2537 		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2538 		int create;
2539 
2540 		block_end = block_start + blocksize;
2541 		bh->b_state = 0;
2542 		create = 1;
2543 		if (block_start >= to)
2544 			create = 0;
2545 		ret = get_block(inode, block_in_file + block_in_page,
2546 					bh, create);
2547 		if (ret)
2548 			goto failed;
2549 		if (!buffer_mapped(bh))
2550 			is_mapped_to_disk = 0;
2551 		if (buffer_new(bh))
2552 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2553 		if (PageUptodate(page)) {
2554 			set_buffer_uptodate(bh);
2555 			continue;
2556 		}
2557 		if (buffer_new(bh) || !buffer_mapped(bh)) {
2558 			zero_user_segments(page, block_start, from,
2559 							to, block_end);
2560 			continue;
2561 		}
2562 		if (buffer_uptodate(bh))
2563 			continue;	/* reiserfs does this */
2564 		if (block_start < from || block_end > to) {
2565 			lock_buffer(bh);
2566 			bh->b_end_io = end_buffer_read_nobh;
2567 			submit_bh(READ, bh);
2568 			nr_reads++;
2569 		}
2570 	}
2571 
2572 	if (nr_reads) {
2573 		/*
2574 		 * The page is locked, so these buffers are protected from
2575 		 * any VM or truncate activity.  Hence we don't need to care
2576 		 * for the buffer_head refcounts.
2577 		 */
2578 		for (bh = head; bh; bh = bh->b_this_page) {
2579 			wait_on_buffer(bh);
2580 			if (!buffer_uptodate(bh))
2581 				ret = -EIO;
2582 		}
2583 		if (ret)
2584 			goto failed;
2585 	}
2586 
2587 	if (is_mapped_to_disk)
2588 		SetPageMappedToDisk(page);
2589 
2590 	*fsdata = head; /* to be released by nobh_write_end */
2591 
2592 	return 0;
2593 
2594 failed:
2595 	BUG_ON(!ret);
2596 	/*
2597 	 * Error recovery is a bit difficult. We need to zero out blocks that
2598 	 * were newly allocated, and dirty them to ensure they get written out.
2599 	 * Buffers need to be attached to the page at this point, otherwise
2600 	 * the handling of potential IO errors during writeout would be hard
2601 	 * (could try doing synchronous writeout, but what if that fails too?)
2602 	 */
2603 	attach_nobh_buffers(page, head);
2604 	page_zero_new_buffers(page, from, to);
2605 
2606 out_release:
2607 	unlock_page(page);
2608 	page_cache_release(page);
2609 	*pagep = NULL;
2610 
2611 	if (pos + len > inode->i_size)
2612 		vmtruncate(inode, inode->i_size);
2613 
2614 	return ret;
2615 }
2616 EXPORT_SYMBOL(nobh_write_begin);
2617 
2618 int nobh_write_end(struct file *file, struct address_space *mapping,
2619 			loff_t pos, unsigned len, unsigned copied,
2620 			struct page *page, void *fsdata)
2621 {
2622 	struct inode *inode = page->mapping->host;
2623 	struct buffer_head *head = fsdata;
2624 	struct buffer_head *bh;
2625 	BUG_ON(fsdata != NULL && page_has_buffers(page));
2626 
2627 	if (unlikely(copied < len) && !page_has_buffers(page))
2628 		attach_nobh_buffers(page, head);
2629 	if (page_has_buffers(page))
2630 		return generic_write_end(file, mapping, pos, len,
2631 					copied, page, fsdata);
2632 
2633 	SetPageUptodate(page);
2634 	set_page_dirty(page);
2635 	if (pos+copied > inode->i_size) {
2636 		i_size_write(inode, pos+copied);
2637 		mark_inode_dirty(inode);
2638 	}
2639 
2640 	unlock_page(page);
2641 	page_cache_release(page);
2642 
2643 	while (head) {
2644 		bh = head;
2645 		head = head->b_this_page;
2646 		free_buffer_head(bh);
2647 	}
2648 
2649 	return copied;
2650 }
2651 EXPORT_SYMBOL(nobh_write_end);
2652 
2653 /*
2654  * nobh_writepage() - based on block_write_full_page() except
2655  * that it tries to operate without attaching bufferheads to
2656  * the page.
2657  */
2658 int nobh_writepage(struct page *page, get_block_t *get_block,
2659 			struct writeback_control *wbc)
2660 {
2661 	struct inode * const inode = page->mapping->host;
2662 	loff_t i_size = i_size_read(inode);
2663 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2664 	unsigned offset;
2665 	int ret;
2666 
2667 	/* Is the page fully inside i_size? */
2668 	if (page->index < end_index)
2669 		goto out;
2670 
2671 	/* Is the page fully outside i_size? (truncate in progress) */
2672 	offset = i_size & (PAGE_CACHE_SIZE-1);
2673 	if (page->index >= end_index+1 || !offset) {
2674 		/*
2675 		 * The page may have dirty, unmapped buffers.  For example,
2676 		 * they may have been added in ext3_writepage().  Make them
2677 		 * freeable here, so the page does not leak.
2678 		 */
2679 #if 0
2680 		/* Not really sure about this  - do we need this ? */
2681 		if (page->mapping->a_ops->invalidatepage)
2682 			page->mapping->a_ops->invalidatepage(page, offset);
2683 #endif
2684 		unlock_page(page);
2685 		return 0; /* don't care */
2686 	}
2687 
2688 	/*
2689 	 * The page straddles i_size.  It must be zeroed out on each and every
2690 	 * writepage invocation because it may be mmapped.  "A file is mapped
2691 	 * in multiples of the page size.  For a file that is not a multiple of
2692 	 * the  page size, the remaining memory is zeroed when mapped, and
2693 	 * writes to that region are not written out to the file."
2694 	 */
2695 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2696 out:
2697 	ret = mpage_writepage(page, get_block, wbc);
2698 	if (ret == -EAGAIN)
2699 		ret = __block_write_full_page(inode, page, get_block, wbc);
2700 	return ret;
2701 }
2702 EXPORT_SYMBOL(nobh_writepage);
2703 
2704 int nobh_truncate_page(struct address_space *mapping,
2705 			loff_t from, get_block_t *get_block)
2706 {
2707 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2708 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2709 	unsigned blocksize;
2710 	sector_t iblock;
2711 	unsigned length, pos;
2712 	struct inode *inode = mapping->host;
2713 	struct page *page;
2714 	struct buffer_head map_bh;
2715 	int err;
2716 
2717 	blocksize = 1 << inode->i_blkbits;
2718 	length = offset & (blocksize - 1);
2719 
2720 	/* Block boundary? Nothing to do */
2721 	if (!length)
2722 		return 0;
2723 
2724 	length = blocksize - length;
2725 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2726 
2727 	page = grab_cache_page(mapping, index);
2728 	err = -ENOMEM;
2729 	if (!page)
2730 		goto out;
2731 
2732 	if (page_has_buffers(page)) {
2733 has_buffers:
2734 		unlock_page(page);
2735 		page_cache_release(page);
2736 		return block_truncate_page(mapping, from, get_block);
2737 	}
2738 
2739 	/* Find the buffer that contains "offset" */
2740 	pos = blocksize;
2741 	while (offset >= pos) {
2742 		iblock++;
2743 		pos += blocksize;
2744 	}
2745 
	/* Don't pass uninitialised stack state to get_block() */
	map_bh.b_size = blocksize;
	map_bh.b_state = 0;
2746 	err = get_block(inode, iblock, &map_bh, 0);
2747 	if (err)
2748 		goto unlock;
2749 	/* unmapped? It's a hole - nothing to do */
2750 	if (!buffer_mapped(&map_bh))
2751 		goto unlock;
2752 
2753 	/* Ok, it's mapped. Make sure it's up-to-date */
2754 	if (!PageUptodate(page)) {
2755 		err = mapping->a_ops->readpage(NULL, page);
2756 		if (err) {
2757 			page_cache_release(page);
2758 			goto out;
2759 		}
2760 		lock_page(page);
2761 		if (!PageUptodate(page)) {
2762 			err = -EIO;
2763 			goto unlock;
2764 		}
2765 		if (page_has_buffers(page))
2766 			goto has_buffers;
2767 	}
2768 	zero_user(page, offset, length);
2769 	set_page_dirty(page);
2770 	err = 0;
2771 
2772 unlock:
2773 	unlock_page(page);
2774 	page_cache_release(page);
2775 out:
2776 	return err;
2777 }
2778 EXPORT_SYMBOL(nobh_truncate_page);
2779 
2780 int block_truncate_page(struct address_space *mapping,
2781 			loff_t from, get_block_t *get_block)
2782 {
2783 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2784 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2785 	unsigned blocksize;
2786 	sector_t iblock;
2787 	unsigned length, pos;
2788 	struct inode *inode = mapping->host;
2789 	struct page *page;
2790 	struct buffer_head *bh;
2791 	int err;
2792 
2793 	blocksize = 1 << inode->i_blkbits;
2794 	length = offset & (blocksize - 1);
2795 
2796 	/* Block boundary? Nothing to do */
2797 	if (!length)
2798 		return 0;
2799 
2800 	length = blocksize - length;
2801 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2802 
2803 	page = grab_cache_page(mapping, index);
2804 	err = -ENOMEM;
2805 	if (!page)
2806 		goto out;
2807 
2808 	if (!page_has_buffers(page))
2809 		create_empty_buffers(page, blocksize, 0);
2810 
2811 	/* Find the buffer that contains "offset" */
2812 	bh = page_buffers(page);
2813 	pos = blocksize;
2814 	while (offset >= pos) {
2815 		bh = bh->b_this_page;
2816 		iblock++;
2817 		pos += blocksize;
2818 	}
2819 
2820 	err = 0;
2821 	if (!buffer_mapped(bh)) {
2822 		WARN_ON(bh->b_size != blocksize);
2823 		err = get_block(inode, iblock, bh, 0);
2824 		if (err)
2825 			goto unlock;
2826 		/* unmapped? It's a hole - nothing to do */
2827 		if (!buffer_mapped(bh))
2828 			goto unlock;
2829 	}
2830 
2831 	/* Ok, it's mapped. Make sure it's up-to-date */
2832 	if (PageUptodate(page))
2833 		set_buffer_uptodate(bh);
2834 
2835 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2836 		err = -EIO;
2837 		ll_rw_block(READ, 1, &bh);
2838 		wait_on_buffer(bh);
2839 		/* Uhhuh. Read error. Complain and punt. */
2840 		if (!buffer_uptodate(bh))
2841 			goto unlock;
2842 	}
2843 
2844 	zero_user(page, offset, length);
2845 	mark_buffer_dirty(bh);
2846 	err = 0;
2847 
2848 unlock:
2849 	unlock_page(page);
2850 	page_cache_release(page);
2851 out:
2852 	return err;
2853 }
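
/*
 * Illustrative sketch (not part of the original source): truncate paths use
 * this to zero the tail of the final, partial block, e.g. from a
 * hypothetical myfs_truncate():
 *
 *	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
 */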
2854 
2855 /*
2856  * The generic ->writepage function for buffer-backed address_spaces
2857  */
2858 int block_write_full_page(struct page *page, get_block_t *get_block,
2859 			struct writeback_control *wbc)
2860 {
2861 	struct inode * const inode = page->mapping->host;
2862 	loff_t i_size = i_size_read(inode);
2863 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2864 	unsigned offset;
2865 
2866 	/* Is the page fully inside i_size? */
2867 	if (page->index < end_index)
2868 		return __block_write_full_page(inode, page, get_block, wbc);
2869 
2870 	/* Is the page fully outside i_size? (truncate in progress) */
2871 	offset = i_size & (PAGE_CACHE_SIZE-1);
2872 	if (page->index >= end_index+1 || !offset) {
2873 		/*
2874 		 * The page may have dirty, unmapped buffers.  For example,
2875 		 * they may have been added in ext3_writepage().  Make them
2876 		 * freeable here, so the page does not leak.
2877 		 */
2878 		do_invalidatepage(page, 0);
2879 		unlock_page(page);
2880 		return 0; /* don't care */
2881 	}
2882 
2883 	/*
2884 	 * The page straddles i_size.  It must be zeroed out on each and every
2885 	 * writepage invocation because it may be mmapped.  "A file is mapped
2886 	 * in multiples of the page size.  For a file that is not a multiple of
2887 	 * the  page size, the remaining memory is zeroed when mapped, and
2888 	 * writes to that region are not written out to the file."
2889 	 */
2890 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 	return __block_write_full_page(inode, page, get_block, wbc);
2892 }
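
/*
 * Illustrative sketch (not part of the original source): the usual
 * buffer-backed ->writepage is a thin wrapper, with a hypothetical
 * myfs_get_block:
 *
 *	static int myfs_writepage(struct page *page,
 *				  struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 */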
2893 
2894 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2895 			    get_block_t *get_block)
2896 {
2897 	struct buffer_head tmp;
2898 	struct inode *inode = mapping->host;
2899 	tmp.b_state = 0;
2900 	tmp.b_blocknr = 0;
2901 	tmp.b_size = 1 << inode->i_blkbits;
2902 	get_block(inode, block, &tmp, 0);
2903 	return tmp.b_blocknr;
2904 }
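
/*
 * Illustrative sketch (not part of the original source): ->bmap is normally
 * a one-liner over this helper, again with a hypothetical myfs_get_block:
 *
 *	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 */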
2905 
2906 static void end_bio_bh_io_sync(struct bio *bio, int err)
2907 {
2908 	struct buffer_head *bh = bio->bi_private;
2909 
2910 	if (err == -EOPNOTSUPP) {
2911 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2912 		set_bit(BH_Eopnotsupp, &bh->b_state);
2913 	}
2914 
2915 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2916 	bio_put(bio);
2917 }
2918 
2919 int submit_bh(int rw, struct buffer_head * bh)
2920 {
2921 	struct bio *bio;
2922 	int ret = 0;
2923 
2924 	BUG_ON(!buffer_locked(bh));
2925 	BUG_ON(!buffer_mapped(bh));
2926 	BUG_ON(!bh->b_end_io);
2927 
2928 	/*
2929 	 * Mask in barrier bit for a write (could be either a WRITE or a
2930 	 * WRITE_SYNC
2931 	 */
2932 	if (buffer_ordered(bh) && (rw & WRITE))
2933 		rw |= WRITE_BARRIER;
2934 
2935 	/*
2936 	 * Only clear out a write error when rewriting
2937 	 */
2938 	if (test_set_buffer_req(bh) && (rw & WRITE))
2939 		clear_buffer_write_io_error(bh);
2940 
2941 	/*
2942 	 * from here on down, it's all bio -- do the initial mapping,
2943 	 * submit_bio -> generic_make_request may further map this bio around
2944 	 */
2945 	bio = bio_alloc(GFP_NOIO, 1);
2946 
2947 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2948 	bio->bi_bdev = bh->b_bdev;
2949 	bio->bi_io_vec[0].bv_page = bh->b_page;
2950 	bio->bi_io_vec[0].bv_len = bh->b_size;
2951 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2952 
2953 	bio->bi_vcnt = 1;
2954 	bio->bi_idx = 0;
2955 	bio->bi_size = bh->b_size;
2956 
2957 	bio->bi_end_io = end_bio_bh_io_sync;
2958 	bio->bi_private = bh;
2959 
2960 	bio_get(bio);
2961 	submit_bio(rw, bio);
2962 
2963 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2964 		ret = -EOPNOTSUPP;
2965 
2966 	bio_put(bio);
2967 	return ret;
2968 }
2969 
2970 /**
2971  * ll_rw_block: low-level access to block devices (DEPRECATED)
2972  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2973  * @nr: number of &struct buffer_heads in the array
2974  * @bhs: array of pointers to &struct buffer_head
2975  *
2976  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2977  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2978  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2979  * are sent to disk. The fourth %READA option is described in the documentation
2980  * for generic_make_request() which ll_rw_block() calls.
2981  *
2982  * This function drops any buffer that it cannot get a lock on (with the
2983  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2984  * clean when doing a write request, and any buffer that appears to be
2985  * up-to-date when doing read request.  Further it marks as clean buffers that
2986  * are processed for writing (the buffer cache won't assume that they are
2987  * actually clean until the buffer gets unlocked).
2988  *
2989  * ll_rw_block sets b_end_io to a simple completion handler that marks
2990  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2991  * any waiters.
2992  *
2993  * All of the buffers must be for the same device, and must also be a
2994  * multiple of the current approved size for the device.
2995  */
2996 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2997 {
2998 	int i;
2999 
3000 	for (i = 0; i < nr; i++) {
3001 		struct buffer_head *bh = bhs[i];
3002 
3003 		if (rw == SWRITE || rw == SWRITE_SYNC)
3004 			lock_buffer(bh);
3005 		else if (!trylock_buffer(bh))
3006 			continue;
3007 
3008 		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
3009 			if (test_clear_buffer_dirty(bh)) {
3010 				bh->b_end_io = end_buffer_write_sync;
3011 				get_bh(bh);
3012 				if (rw == SWRITE_SYNC)
3013 					submit_bh(WRITE_SYNC, bh);
3014 				else
3015 					submit_bh(WRITE, bh);
3016 				continue;
3017 			}
3018 		} else {
3019 			if (!buffer_uptodate(bh)) {
3020 				bh->b_end_io = end_buffer_read_sync;
3021 				get_bh(bh);
3022 				submit_bh(rw, bh);
3023 				continue;
3024 			}
3025 		}
3026 		unlock_buffer(bh);
3027 	}
3028 }
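
/*
 * Illustrative sketch (not part of the original source): a common pattern is
 * to start reads on a batch of metadata buffers and then wait for the ones
 * that are actually needed.  "bhs" is assumed to be an array of nr
 * referenced buffer_heads:
 *
 *	ll_rw_block(READ, nr, bhs);
 *	for (i = 0; i < nr; i++) {
 *		wait_on_buffer(bhs[i]);
 *		if (!buffer_uptodate(bhs[i]))
 *			err = -EIO;
 *	}
 */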
3029 
3030 /*
3031  * For a data-integrity writeout, we need to wait upon any in-progress I/O
3032  * and then start new I/O and then wait upon it.  The caller must have a ref on
3033  * the buffer_head.
3034  */
3035 int sync_dirty_buffer(struct buffer_head *bh)
3036 {
3037 	int ret = 0;
3038 
3039 	WARN_ON(atomic_read(&bh->b_count) < 1);
3040 	lock_buffer(bh);
3041 	if (test_clear_buffer_dirty(bh)) {
3042 		get_bh(bh);
3043 		bh->b_end_io = end_buffer_write_sync;
3044 		ret = submit_bh(WRITE_SYNC, bh);
3045 		wait_on_buffer(bh);
3046 		if (buffer_eopnotsupp(bh)) {
3047 			clear_buffer_eopnotsupp(bh);
3048 			ret = -EOPNOTSUPP;
3049 		}
3050 		if (!ret && !buffer_uptodate(bh))
3051 			ret = -EIO;
3052 	} else {
3053 		unlock_buffer(bh);
3054 	}
3055 	return ret;
3056 }
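
/*
 * Illustrative sketch (not part of the original source): filesystems use
 * this for synchronous metadata updates, e.g. flushing a modified
 * superblock buffer "sbh" on which the caller holds a reference:
 *
 *	mark_buffer_dirty(sbh);
 *	err = sync_dirty_buffer(sbh);
 *	if (err)
 *		printk(KERN_ERR "myfs: superblock write failed\n");
 */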
3057 
3058 /*
3059  * try_to_free_buffers() checks if all the buffers on this particular page
3060  * are unused, and releases them if so.
3061  *
3062  * Exclusion against try_to_free_buffers may be obtained by either
3063  * locking the page or by holding its mapping's private_lock.
3064  *
3065  * If the page is dirty but all the buffers are clean then we need to
3066  * be sure to mark the page clean as well.  This is because the page
3067  * may be against a block device, and a later reattachment of buffers
3068  * to a dirty page will set *all* buffers dirty.  Which would corrupt
3069  * filesystem data on the same device.
3070  *
3071  * The same applies to regular filesystem pages: if all the buffers are
3072  * clean then we set the page clean and proceed.  To do that, we require
3073  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3074  * private_lock.
3075  *
3076  * try_to_free_buffers() is non-blocking.
3077  */
3078 static inline int buffer_busy(struct buffer_head *bh)
3079 {
3080 	return atomic_read(&bh->b_count) |
3081 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3082 }
3083 
3084 static int
3085 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3086 {
3087 	struct buffer_head *head = page_buffers(page);
3088 	struct buffer_head *bh;
3089 
3090 	bh = head;
3091 	do {
3092 		if (buffer_write_io_error(bh) && page->mapping)
3093 			set_bit(AS_EIO, &page->mapping->flags);
3094 		if (buffer_busy(bh))
3095 			goto failed;
3096 		bh = bh->b_this_page;
3097 	} while (bh != head);
3098 
3099 	do {
3100 		struct buffer_head *next = bh->b_this_page;
3101 
3102 		if (bh->b_assoc_map)
3103 			__remove_assoc_queue(bh);
3104 		bh = next;
3105 	} while (bh != head);
3106 	*buffers_to_free = head;
3107 	__clear_page_buffers(page);
3108 	return 1;
3109 failed:
3110 	return 0;
3111 }
3112 
3113 int try_to_free_buffers(struct page *page)
3114 {
3115 	struct address_space * const mapping = page->mapping;
3116 	struct buffer_head *buffers_to_free = NULL;
3117 	int ret = 0;
3118 
3119 	BUG_ON(!PageLocked(page));
3120 	if (PageWriteback(page))
3121 		return 0;
3122 
3123 	if (mapping == NULL) {		/* can this still happen? */
3124 		ret = drop_buffers(page, &buffers_to_free);
3125 		goto out;
3126 	}
3127 
3128 	spin_lock(&mapping->private_lock);
3129 	ret = drop_buffers(page, &buffers_to_free);
3130 
3131 	/*
3132 	 * If the filesystem writes its buffers by hand (eg ext3)
3133 	 * then we can have clean buffers against a dirty page.  We
3134 	 * clean the page here; otherwise the VM will never notice
3135 	 * that the filesystem did any IO at all.
3136 	 *
3137 	 * Also, during truncate, discard_buffer will have marked all
3138 	 * the page's buffers clean.  We discover that here and clean
3139 	 * the page also.
3140 	 *
3141 	 * private_lock must be held over this entire operation in order
3142 	 * to synchronise against __set_page_dirty_buffers and prevent the
3143 	 * dirty bit from being lost.
3144 	 */
3145 	if (ret)
3146 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3147 	spin_unlock(&mapping->private_lock);
3148 out:
3149 	if (buffers_to_free) {
3150 		struct buffer_head *bh = buffers_to_free;
3151 
3152 		do {
3153 			struct buffer_head *next = bh->b_this_page;
3154 			free_buffer_head(bh);
3155 			bh = next;
3156 		} while (bh != buffers_to_free);
3157 	}
3158 	return ret;
3159 }
3160 EXPORT_SYMBOL(try_to_free_buffers);
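
/*
 * Illustrative sketch (not part of the original source): when a filesystem
 * supplies no ->releasepage, try_to_release_page() falls back to this
 * function; a minimal ->releasepage that ignores the gfp mask can simply
 * forward to it:
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */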
3161 
3162 void block_sync_page(struct page *page)
3163 {
3164 	struct address_space *mapping;
3165 
3166 	smp_mb();
3167 	mapping = page_mapping(page);
3168 	if (mapping)
3169 		blk_run_backing_dev(mapping->backing_dev_info, page);
3170 }
3171 
3172 /*
3173  * There are no bdflush tunables left.  But distributions are
3174  * still running obsolete flush daemons, so we terminate them here.
3175  *
3176  * Use of bdflush() is deprecated and will be removed in a future kernel.
3177  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3178  */
3179 asmlinkage long sys_bdflush(int func, long data)
3180 {
3181 	static int msg_count;
3182 
3183 	if (!capable(CAP_SYS_ADMIN))
3184 		return -EPERM;
3185 
3186 	if (msg_count < 5) {
3187 		msg_count++;
3188 		printk(KERN_INFO
3189 			"warning: process `%s' used the obsolete bdflush"
3190 			" system call\n", current->comm);
3191 		printk(KERN_INFO "Fix your initscripts?\n");
3192 	}
3193 
3194 	if (func == 1)
3195 		do_exit(0);
3196 	return 0;
3197 }
3198 
3199 /*
3200  * Buffer-head allocation
3201  */
3202 static struct kmem_cache *bh_cachep;
3203 
3204 /*
3205  * Once the number of bh's in the machine exceeds this level, we start
3206  * stripping them in writeback.
3207  */
3208 static int max_buffer_heads;
3209 
3210 int buffer_heads_over_limit;
3211 
3212 struct bh_accounting {
3213 	int nr;			/* Number of live bh's */
3214 	int ratelimit;		/* Limit cacheline bouncing */
3215 };
3216 
3217 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3218 
3219 static void recalc_bh_state(void)
3220 {
3221 	int i;
3222 	int tot = 0;
3223 
3224 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3225 		return;
3226 	__get_cpu_var(bh_accounting).ratelimit = 0;
3227 	for_each_online_cpu(i)
3228 		tot += per_cpu(bh_accounting, i).nr;
3229 	buffer_heads_over_limit = (tot > max_buffer_heads);
3230 }
3231 
3232 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3233 {
3234 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3235 	if (ret) {
3236 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3237 		get_cpu_var(bh_accounting).nr++;
3238 		recalc_bh_state();
3239 		put_cpu_var(bh_accounting);
3240 	}
3241 	return ret;
3242 }
3243 EXPORT_SYMBOL(alloc_buffer_head);
3244 
3245 void free_buffer_head(struct buffer_head *bh)
3246 {
3247 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3248 	kmem_cache_free(bh_cachep, bh);
3249 	get_cpu_var(bh_accounting).nr--;
3250 	recalc_bh_state();
3251 	put_cpu_var(bh_accounting);
3252 }
3253 EXPORT_SYMBOL(free_buffer_head);
3254 
3255 static void buffer_exit_cpu(int cpu)
3256 {
3257 	int i;
3258 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3259 
3260 	for (i = 0; i < BH_LRU_SIZE; i++) {
3261 		brelse(b->bhs[i]);
3262 		b->bhs[i] = NULL;
3263 	}
3264 	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3265 	per_cpu(bh_accounting, cpu).nr = 0;
3266 	put_cpu_var(bh_accounting);
3267 }
3268 
3269 static int buffer_cpu_notify(struct notifier_block *self,
3270 			      unsigned long action, void *hcpu)
3271 {
3272 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3273 		buffer_exit_cpu((unsigned long)hcpu);
3274 	return NOTIFY_OK;
3275 }
3276 
3277 /**
3278  * bh_uptodate_or_lock - Test whether the buffer is uptodate
3279  * @bh: struct buffer_head
3280  *
3281  * Returns 1 if the buffer is up-to-date, or 0 with the
3282  * buffer locked if it is not.
3283  */
3284 int bh_uptodate_or_lock(struct buffer_head *bh)
3285 {
3286 	if (!buffer_uptodate(bh)) {
3287 		lock_buffer(bh);
3288 		if (!buffer_uptodate(bh))
3289 			return 0;
3290 		unlock_buffer(bh);
3291 	}
3292 	return 1;
3293 }
3294 EXPORT_SYMBOL(bh_uptodate_or_lock);
3295 
3296 /**
3297  * bh_submit_read - Submit a locked buffer for reading
3298  * @bh: struct buffer_head
3299  *
3300  * Returns zero on success and -EIO on error.
3301  */
3302 int bh_submit_read(struct buffer_head *bh)
3303 {
3304 	BUG_ON(!buffer_locked(bh));
3305 
3306 	if (buffer_uptodate(bh)) {
3307 		unlock_buffer(bh);
3308 		return 0;
3309 	}
3310 
3311 	get_bh(bh);
3312 	bh->b_end_io = end_buffer_read_sync;
3313 	submit_bh(READ, bh);
3314 	wait_on_buffer(bh);
3315 	if (buffer_uptodate(bh))
3316 		return 0;
3317 	return -EIO;
3318 }
3319 EXPORT_SYMBOL(bh_submit_read);
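
/*
 * Illustrative sketch (not part of the original source): combined with
 * bh_uptodate_or_lock(), this gives a simple "read it in if needed" pattern
 * for a buffer the caller already holds:
 *
 *	if (!bh_uptodate_or_lock(bh)) {
 *		if (bh_submit_read(bh))
 *			return -EIO;
 *	}
 *
 * bh_uptodate_or_lock() returns 0 with the buffer locked when it is not
 * up-to-date, which is exactly the state bh_submit_read() expects.
 */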
3320 
3321 static void
3322 init_buffer_head(void *data)
3323 {
3324 	struct buffer_head *bh = data;
3325 
3326 	memset(bh, 0, sizeof(*bh));
3327 	INIT_LIST_HEAD(&bh->b_assoc_buffers);
3328 }
3329 
3330 void __init buffer_init(void)
3331 {
3332 	int nrpages;
3333 
3334 	bh_cachep = kmem_cache_create("buffer_head",
3335 			sizeof(struct buffer_head), 0,
3336 				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3337 				SLAB_MEM_SPREAD),
3338 				init_buffer_head);
3339 
3340 	/*
3341 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3342 	 */
3343 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3344 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3345 	hotcpu_notifier(buffer_cpu_notify, 0);
3346 }
3347 
3348 EXPORT_SYMBOL(__bforget);
3349 EXPORT_SYMBOL(__brelse);
3350 EXPORT_SYMBOL(__wait_on_buffer);
3351 EXPORT_SYMBOL(block_commit_write);
3352 EXPORT_SYMBOL(block_prepare_write);
3353 EXPORT_SYMBOL(block_page_mkwrite);
3354 EXPORT_SYMBOL(block_read_full_page);
3355 EXPORT_SYMBOL(block_sync_page);
3356 EXPORT_SYMBOL(block_truncate_page);
3357 EXPORT_SYMBOL(block_write_full_page);
3358 EXPORT_SYMBOL(cont_write_begin);
3359 EXPORT_SYMBOL(end_buffer_read_sync);
3360 EXPORT_SYMBOL(end_buffer_write_sync);
3361 EXPORT_SYMBOL(file_fsync);
3362 EXPORT_SYMBOL(fsync_bdev);
3363 EXPORT_SYMBOL(generic_block_bmap);
3364 EXPORT_SYMBOL(generic_cont_expand_simple);
3365 EXPORT_SYMBOL(init_buffer);
3366 EXPORT_SYMBOL(invalidate_bdev);
3367 EXPORT_SYMBOL(ll_rw_block);
3368 EXPORT_SYMBOL(mark_buffer_dirty);
3369 EXPORT_SYMBOL(submit_bh);
3370 EXPORT_SYMBOL(sync_dirty_buffer);
3371 EXPORT_SYMBOL(unlock_buffer);
3372