xref: /openbmc/linux/fs/nfs/direct.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1*1da177e4SLinus Torvalds /*
2*1da177e4SLinus Torvalds  * linux/fs/nfs/direct.c
3*1da177e4SLinus Torvalds  *
4*1da177e4SLinus Torvalds  * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
5*1da177e4SLinus Torvalds  *
6*1da177e4SLinus Torvalds  * High-performance uncached I/O for the Linux NFS client
7*1da177e4SLinus Torvalds  *
8*1da177e4SLinus Torvalds  * There are important applications whose performance or correctness
9*1da177e4SLinus Torvalds  * depends on uncached access to file data.  Database clusters
10*1da177e4SLinus Torvalds  * (multiple copies of the same instance running on separate hosts)
11*1da177e4SLinus Torvalds  * implement their own cache coherency protocol that subsumes file
12*1da177e4SLinus Torvalds  * system cache protocols.  Applications that process datasets
13*1da177e4SLinus Torvalds  * considerably larger than the client's memory do not always benefit
14*1da177e4SLinus Torvalds  * from a local cache.  A streaming video server, for instance, has no
15*1da177e4SLinus Torvalds  * need to cache the contents of a file.
16*1da177e4SLinus Torvalds  *
17*1da177e4SLinus Torvalds  * When an application requests uncached I/O, all read and write requests
18*1da177e4SLinus Torvalds  * are made directly to the server; data stored or fetched via these
19*1da177e4SLinus Torvalds  * requests is not cached in the Linux page cache.  The client does not
20*1da177e4SLinus Torvalds  * correct unaligned requests from applications.  All requested bytes are
21*1da177e4SLinus Torvalds  * held on permanent storage before a direct write system call returns to
22*1da177e4SLinus Torvalds  * an application.
23*1da177e4SLinus Torvalds  *
24*1da177e4SLinus Torvalds  * Solaris implements an uncached I/O facility called directio() that
25*1da177e4SLinus Torvalds  * is used for backups and sequential I/O to very large files.  Solaris
26*1da177e4SLinus Torvalds  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
27*1da177e4SLinus Torvalds  * an undocumented mount option.
28*1da177e4SLinus Torvalds  *
29*1da177e4SLinus Torvalds  * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
30*1da177e4SLinus Torvalds  * help from Andrew Morton.
31*1da177e4SLinus Torvalds  *
32*1da177e4SLinus Torvalds  * 18 Dec 2001	Initial implementation for 2.4  --cel
33*1da177e4SLinus Torvalds  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
34*1da177e4SLinus Torvalds  * 08 Jun 2003	Port to 2.5 APIs  --cel
35*1da177e4SLinus Torvalds  * 31 Mar 2004	Handle direct I/O without VFS support  --cel
36*1da177e4SLinus Torvalds  * 15 Sep 2004	Parallel async reads  --cel
37*1da177e4SLinus Torvalds  *
38*1da177e4SLinus Torvalds  */
39*1da177e4SLinus Torvalds 
40*1da177e4SLinus Torvalds #include <linux/config.h>
41*1da177e4SLinus Torvalds #include <linux/errno.h>
42*1da177e4SLinus Torvalds #include <linux/sched.h>
43*1da177e4SLinus Torvalds #include <linux/kernel.h>
44*1da177e4SLinus Torvalds #include <linux/smp_lock.h>
45*1da177e4SLinus Torvalds #include <linux/file.h>
46*1da177e4SLinus Torvalds #include <linux/pagemap.h>
47*1da177e4SLinus Torvalds #include <linux/kref.h>
48*1da177e4SLinus Torvalds 
49*1da177e4SLinus Torvalds #include <linux/nfs_fs.h>
50*1da177e4SLinus Torvalds #include <linux/nfs_page.h>
51*1da177e4SLinus Torvalds #include <linux/sunrpc/clnt.h>
52*1da177e4SLinus Torvalds 
53*1da177e4SLinus Torvalds #include <asm/system.h>
54*1da177e4SLinus Torvalds #include <asm/uaccess.h>
55*1da177e4SLinus Torvalds #include <asm/atomic.h>
56*1da177e4SLinus Torvalds 
57*1da177e4SLinus Torvalds #define NFSDBG_FACILITY		NFSDBG_VFS
58*1da177e4SLinus Torvalds #define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
59*1da177e4SLinus Torvalds 
60*1da177e4SLinus Torvalds static kmem_cache_t *nfs_direct_cachep;
61*1da177e4SLinus Torvalds 
62*1da177e4SLinus Torvalds /*
63*1da177e4SLinus Torvalds  * This represents a set of asynchronous requests that we're waiting on
64*1da177e4SLinus Torvalds  */
65*1da177e4SLinus Torvalds struct nfs_direct_req {
66*1da177e4SLinus Torvalds 	struct kref		kref;		/* release manager */
67*1da177e4SLinus Torvalds 	struct list_head	list;		/* nfs_read_data structs */
68*1da177e4SLinus Torvalds 	wait_queue_head_t	wait;		/* wait for i/o completion */
69*1da177e4SLinus Torvalds 	struct page **		pages;		/* pages in our buffer */
70*1da177e4SLinus Torvalds 	unsigned int		npages;		/* count of pages */
71*1da177e4SLinus Torvalds 	atomic_t		complete,	/* i/os we're waiting for */
72*1da177e4SLinus Torvalds 				count,		/* bytes actually processed */
73*1da177e4SLinus Torvalds 				error;		/* any reported error */
74*1da177e4SLinus Torvalds };
75*1da177e4SLinus Torvalds 
76*1da177e4SLinus Torvalds 
77*1da177e4SLinus Torvalds /**
78*1da177e4SLinus Torvalds  * nfs_get_user_pages - find and set up pages underlying user's buffer
79*1da177e4SLinus Torvalds  * rw: direction (read or write)
80*1da177e4SLinus Torvalds  * user_addr: starting address of this segment of user's buffer
81*1da177e4SLinus Torvalds  * count: size of this segment
82*1da177e4SLinus Torvalds  * @pages: returned array of page struct pointers underlying user's buffer
83*1da177e4SLinus Torvalds  */
84*1da177e4SLinus Torvalds static inline int
85*1da177e4SLinus Torvalds nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
86*1da177e4SLinus Torvalds 		struct page ***pages)
87*1da177e4SLinus Torvalds {
88*1da177e4SLinus Torvalds 	int result = -ENOMEM;
89*1da177e4SLinus Torvalds 	unsigned long page_count;
90*1da177e4SLinus Torvalds 	size_t array_size;
91*1da177e4SLinus Torvalds 
92*1da177e4SLinus Torvalds 	/* set an arbitrary limit to prevent type overflow */
93*1da177e4SLinus Torvalds 	/* XXX: this can probably be as large as INT_MAX */
94*1da177e4SLinus Torvalds 	if (size > MAX_DIRECTIO_SIZE) {
95*1da177e4SLinus Torvalds 		*pages = NULL;
96*1da177e4SLinus Torvalds 		return -EFBIG;
97*1da177e4SLinus Torvalds 	}
98*1da177e4SLinus Torvalds 
99*1da177e4SLinus Torvalds 	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
100*1da177e4SLinus Torvalds 	page_count -= user_addr >> PAGE_SHIFT;
101*1da177e4SLinus Torvalds 
102*1da177e4SLinus Torvalds 	array_size = (page_count * sizeof(struct page *));
103*1da177e4SLinus Torvalds 	*pages = kmalloc(array_size, GFP_KERNEL);
104*1da177e4SLinus Torvalds 	if (*pages) {
105*1da177e4SLinus Torvalds 		down_read(&current->mm->mmap_sem);
106*1da177e4SLinus Torvalds 		result = get_user_pages(current, current->mm, user_addr,
107*1da177e4SLinus Torvalds 					page_count, (rw == READ), 0,
108*1da177e4SLinus Torvalds 					*pages, NULL);
109*1da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
110*1da177e4SLinus Torvalds 	}
111*1da177e4SLinus Torvalds 	return result;
112*1da177e4SLinus Torvalds }
113*1da177e4SLinus Torvalds 
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @npages: number of pages in the array that hold a reference
 * @do_dirty: non-zero to mark each page dirty before releasing it
 *
 * Drops one reference on each of the first @npages entries, then frees
 * the array itself.  With @npages == 0 only the array is freed.
 */
static void
nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *page = pages[idx++];

		if (do_dirty)
			set_page_dirty_lock(page);
		page_cache_release(page);
	}
	kfree(pages);
}
131*1da177e4SLinus Torvalds 
132*1da177e4SLinus Torvalds /**
133*1da177e4SLinus Torvalds  * nfs_direct_req_release - release  nfs_direct_req structure for direct read
134*1da177e4SLinus Torvalds  * @kref: kref object embedded in an nfs_direct_req structure
135*1da177e4SLinus Torvalds  *
136*1da177e4SLinus Torvalds  */
137*1da177e4SLinus Torvalds static void nfs_direct_req_release(struct kref *kref)
138*1da177e4SLinus Torvalds {
139*1da177e4SLinus Torvalds 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
140*1da177e4SLinus Torvalds 	kmem_cache_free(nfs_direct_cachep, dreq);
141*1da177e4SLinus Torvalds }
142*1da177e4SLinus Torvalds 
143*1da177e4SLinus Torvalds /**
144*1da177e4SLinus Torvalds  * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read
145*1da177e4SLinus Torvalds  * @count: count of bytes for the read request
146*1da177e4SLinus Torvalds  * @rsize: local rsize setting
147*1da177e4SLinus Torvalds  *
148*1da177e4SLinus Torvalds  * Note we also set the number of requests we have in the dreq when we are
149*1da177e4SLinus Torvalds  * done.  This prevents races with I/O completion so we will always wait
150*1da177e4SLinus Torvalds  * until all requests have been dispatched and completed.
151*1da177e4SLinus Torvalds  */
152*1da177e4SLinus Torvalds static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize)
153*1da177e4SLinus Torvalds {
154*1da177e4SLinus Torvalds 	struct list_head *list;
155*1da177e4SLinus Torvalds 	struct nfs_direct_req *dreq;
156*1da177e4SLinus Torvalds 	unsigned int reads = 0;
157*1da177e4SLinus Torvalds 
158*1da177e4SLinus Torvalds 	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
159*1da177e4SLinus Torvalds 	if (!dreq)
160*1da177e4SLinus Torvalds 		return NULL;
161*1da177e4SLinus Torvalds 
162*1da177e4SLinus Torvalds 	kref_init(&dreq->kref);
163*1da177e4SLinus Torvalds 	init_waitqueue_head(&dreq->wait);
164*1da177e4SLinus Torvalds 	INIT_LIST_HEAD(&dreq->list);
165*1da177e4SLinus Torvalds 	atomic_set(&dreq->count, 0);
166*1da177e4SLinus Torvalds 	atomic_set(&dreq->error, 0);
167*1da177e4SLinus Torvalds 
168*1da177e4SLinus Torvalds 	list = &dreq->list;
169*1da177e4SLinus Torvalds 	for(;;) {
170*1da177e4SLinus Torvalds 		struct nfs_read_data *data = nfs_readdata_alloc();
171*1da177e4SLinus Torvalds 
172*1da177e4SLinus Torvalds 		if (unlikely(!data)) {
173*1da177e4SLinus Torvalds 			while (!list_empty(list)) {
174*1da177e4SLinus Torvalds 				data = list_entry(list->next,
175*1da177e4SLinus Torvalds 						  struct nfs_read_data, pages);
176*1da177e4SLinus Torvalds 				list_del(&data->pages);
177*1da177e4SLinus Torvalds 				nfs_readdata_free(data);
178*1da177e4SLinus Torvalds 			}
179*1da177e4SLinus Torvalds 			kref_put(&dreq->kref, nfs_direct_req_release);
180*1da177e4SLinus Torvalds 			return NULL;
181*1da177e4SLinus Torvalds 		}
182*1da177e4SLinus Torvalds 
183*1da177e4SLinus Torvalds 		INIT_LIST_HEAD(&data->pages);
184*1da177e4SLinus Torvalds 		list_add(&data->pages, list);
185*1da177e4SLinus Torvalds 
186*1da177e4SLinus Torvalds 		data->req = (struct nfs_page *) dreq;
187*1da177e4SLinus Torvalds 		reads++;
188*1da177e4SLinus Torvalds 		if (nbytes <= rsize)
189*1da177e4SLinus Torvalds 			break;
190*1da177e4SLinus Torvalds 		nbytes -= rsize;
191*1da177e4SLinus Torvalds 	}
192*1da177e4SLinus Torvalds 	kref_get(&dreq->kref);
193*1da177e4SLinus Torvalds 	atomic_set(&dreq->complete, reads);
194*1da177e4SLinus Torvalds 	return dreq;
195*1da177e4SLinus Torvalds }
196*1da177e4SLinus Torvalds 
197*1da177e4SLinus Torvalds /**
198*1da177e4SLinus Torvalds  * nfs_direct_read_result - handle a read reply for a direct read request
199*1da177e4SLinus Torvalds  * @data: address of NFS READ operation control block
200*1da177e4SLinus Torvalds  * @status: status of this NFS READ operation
201*1da177e4SLinus Torvalds  *
202*1da177e4SLinus Torvalds  * We must hold a reference to all the pages in this direct read request
203*1da177e4SLinus Torvalds  * until the RPCs complete.  This could be long *after* we are woken up in
204*1da177e4SLinus Torvalds  * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server).
205*1da177e4SLinus Torvalds  */
206*1da177e4SLinus Torvalds static void nfs_direct_read_result(struct nfs_read_data *data, int status)
207*1da177e4SLinus Torvalds {
208*1da177e4SLinus Torvalds 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
209*1da177e4SLinus Torvalds 
210*1da177e4SLinus Torvalds 	if (likely(status >= 0))
211*1da177e4SLinus Torvalds 		atomic_add(data->res.count, &dreq->count);
212*1da177e4SLinus Torvalds 	else
213*1da177e4SLinus Torvalds 		atomic_set(&dreq->error, status);
214*1da177e4SLinus Torvalds 
215*1da177e4SLinus Torvalds 	if (unlikely(atomic_dec_and_test(&dreq->complete))) {
216*1da177e4SLinus Torvalds 		nfs_free_user_pages(dreq->pages, dreq->npages, 1);
217*1da177e4SLinus Torvalds 		wake_up(&dreq->wait);
218*1da177e4SLinus Torvalds 		kref_put(&dreq->kref, nfs_direct_req_release);
219*1da177e4SLinus Torvalds 	}
220*1da177e4SLinus Torvalds }
221*1da177e4SLinus Torvalds 
222*1da177e4SLinus Torvalds /**
223*1da177e4SLinus Torvalds  * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read
224*1da177e4SLinus Torvalds  * @dreq: address of nfs_direct_req struct for this request
225*1da177e4SLinus Torvalds  * @inode: target inode
226*1da177e4SLinus Torvalds  * @ctx: target file open context
227*1da177e4SLinus Torvalds  * @user_addr: starting address of this segment of user's buffer
228*1da177e4SLinus Torvalds  * @count: size of this segment
229*1da177e4SLinus Torvalds  * @file_offset: offset in file to begin the operation
230*1da177e4SLinus Torvalds  *
231*1da177e4SLinus Torvalds  * For each nfs_read_data struct that was allocated on the list, dispatch
232*1da177e4SLinus Torvalds  * an NFS READ operation
233*1da177e4SLinus Torvalds  */
234*1da177e4SLinus Torvalds static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
235*1da177e4SLinus Torvalds 		struct inode *inode, struct nfs_open_context *ctx,
236*1da177e4SLinus Torvalds 		unsigned long user_addr, size_t count, loff_t file_offset)
237*1da177e4SLinus Torvalds {
238*1da177e4SLinus Torvalds 	struct list_head *list = &dreq->list;
239*1da177e4SLinus Torvalds 	struct page **pages = dreq->pages;
240*1da177e4SLinus Torvalds 	unsigned int curpage, pgbase;
241*1da177e4SLinus Torvalds 	unsigned int rsize = NFS_SERVER(inode)->rsize;
242*1da177e4SLinus Torvalds 
243*1da177e4SLinus Torvalds 	curpage = 0;
244*1da177e4SLinus Torvalds 	pgbase = user_addr & ~PAGE_MASK;
245*1da177e4SLinus Torvalds 	do {
246*1da177e4SLinus Torvalds 		struct nfs_read_data *data;
247*1da177e4SLinus Torvalds 		unsigned int bytes;
248*1da177e4SLinus Torvalds 
249*1da177e4SLinus Torvalds 		bytes = rsize;
250*1da177e4SLinus Torvalds 		if (count < rsize)
251*1da177e4SLinus Torvalds 			bytes = count;
252*1da177e4SLinus Torvalds 
253*1da177e4SLinus Torvalds 		data = list_entry(list->next, struct nfs_read_data, pages);
254*1da177e4SLinus Torvalds 		list_del_init(&data->pages);
255*1da177e4SLinus Torvalds 
256*1da177e4SLinus Torvalds 		data->inode = inode;
257*1da177e4SLinus Torvalds 		data->cred = ctx->cred;
258*1da177e4SLinus Torvalds 		data->args.fh = NFS_FH(inode);
259*1da177e4SLinus Torvalds 		data->args.context = ctx;
260*1da177e4SLinus Torvalds 		data->args.offset = file_offset;
261*1da177e4SLinus Torvalds 		data->args.pgbase = pgbase;
262*1da177e4SLinus Torvalds 		data->args.pages = &pages[curpage];
263*1da177e4SLinus Torvalds 		data->args.count = bytes;
264*1da177e4SLinus Torvalds 		data->res.fattr = &data->fattr;
265*1da177e4SLinus Torvalds 		data->res.eof = 0;
266*1da177e4SLinus Torvalds 		data->res.count = bytes;
267*1da177e4SLinus Torvalds 
268*1da177e4SLinus Torvalds 		NFS_PROTO(inode)->read_setup(data);
269*1da177e4SLinus Torvalds 
270*1da177e4SLinus Torvalds 		data->task.tk_cookie = (unsigned long) inode;
271*1da177e4SLinus Torvalds 		data->task.tk_calldata = data;
272*1da177e4SLinus Torvalds 		data->task.tk_release = nfs_readdata_release;
273*1da177e4SLinus Torvalds 		data->complete = nfs_direct_read_result;
274*1da177e4SLinus Torvalds 
275*1da177e4SLinus Torvalds 		lock_kernel();
276*1da177e4SLinus Torvalds 		rpc_execute(&data->task);
277*1da177e4SLinus Torvalds 		unlock_kernel();
278*1da177e4SLinus Torvalds 
279*1da177e4SLinus Torvalds 		dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
280*1da177e4SLinus Torvalds 				data->task.tk_pid,
281*1da177e4SLinus Torvalds 				inode->i_sb->s_id,
282*1da177e4SLinus Torvalds 				(long long)NFS_FILEID(inode),
283*1da177e4SLinus Torvalds 				bytes,
284*1da177e4SLinus Torvalds 				(unsigned long long)data->args.offset);
285*1da177e4SLinus Torvalds 
286*1da177e4SLinus Torvalds 		file_offset += bytes;
287*1da177e4SLinus Torvalds 		pgbase += bytes;
288*1da177e4SLinus Torvalds 		curpage += pgbase >> PAGE_SHIFT;
289*1da177e4SLinus Torvalds 		pgbase &= ~PAGE_MASK;
290*1da177e4SLinus Torvalds 
291*1da177e4SLinus Torvalds 		count -= bytes;
292*1da177e4SLinus Torvalds 	} while (count != 0);
293*1da177e4SLinus Torvalds }
294*1da177e4SLinus Torvalds 
295*1da177e4SLinus Torvalds /**
296*1da177e4SLinus Torvalds  * nfs_direct_read_wait - wait for I/O completion for direct reads
297*1da177e4SLinus Torvalds  * @dreq: request on which we are to wait
298*1da177e4SLinus Torvalds  * @intr: whether or not this wait can be interrupted
299*1da177e4SLinus Torvalds  *
300*1da177e4SLinus Torvalds  * Collects and returns the final error value/byte-count.
301*1da177e4SLinus Torvalds  */
302*1da177e4SLinus Torvalds static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr)
303*1da177e4SLinus Torvalds {
304*1da177e4SLinus Torvalds 	int result = 0;
305*1da177e4SLinus Torvalds 
306*1da177e4SLinus Torvalds 	if (intr) {
307*1da177e4SLinus Torvalds 		result = wait_event_interruptible(dreq->wait,
308*1da177e4SLinus Torvalds 					(atomic_read(&dreq->complete) == 0));
309*1da177e4SLinus Torvalds 	} else {
310*1da177e4SLinus Torvalds 		wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0));
311*1da177e4SLinus Torvalds 	}
312*1da177e4SLinus Torvalds 
313*1da177e4SLinus Torvalds 	if (!result)
314*1da177e4SLinus Torvalds 		result = atomic_read(&dreq->error);
315*1da177e4SLinus Torvalds 	if (!result)
316*1da177e4SLinus Torvalds 		result = atomic_read(&dreq->count);
317*1da177e4SLinus Torvalds 
318*1da177e4SLinus Torvalds 	kref_put(&dreq->kref, nfs_direct_req_release);
319*1da177e4SLinus Torvalds 	return (ssize_t) result;
320*1da177e4SLinus Torvalds }
321*1da177e4SLinus Torvalds 
322*1da177e4SLinus Torvalds /**
323*1da177e4SLinus Torvalds  * nfs_direct_read_seg - Read in one iov segment.  Generate separate
324*1da177e4SLinus Torvalds  *                        read RPCs for each "rsize" bytes.
325*1da177e4SLinus Torvalds  * @inode: target inode
326*1da177e4SLinus Torvalds  * @ctx: target file open context
327*1da177e4SLinus Torvalds  * @user_addr: starting address of this segment of user's buffer
328*1da177e4SLinus Torvalds  * @count: size of this segment
329*1da177e4SLinus Torvalds  * @file_offset: offset in file to begin the operation
330*1da177e4SLinus Torvalds  * @pages: array of addresses of page structs defining user's buffer
331*1da177e4SLinus Torvalds  * @nr_pages: number of pages in the array
332*1da177e4SLinus Torvalds  *
333*1da177e4SLinus Torvalds  */
334*1da177e4SLinus Torvalds static ssize_t nfs_direct_read_seg(struct inode *inode,
335*1da177e4SLinus Torvalds 		struct nfs_open_context *ctx, unsigned long user_addr,
336*1da177e4SLinus Torvalds 		size_t count, loff_t file_offset, struct page **pages,
337*1da177e4SLinus Torvalds 		unsigned int nr_pages)
338*1da177e4SLinus Torvalds {
339*1da177e4SLinus Torvalds 	ssize_t result;
340*1da177e4SLinus Torvalds 	sigset_t oldset;
341*1da177e4SLinus Torvalds 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
342*1da177e4SLinus Torvalds 	struct nfs_direct_req *dreq;
343*1da177e4SLinus Torvalds 
344*1da177e4SLinus Torvalds 	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
345*1da177e4SLinus Torvalds 	if (!dreq)
346*1da177e4SLinus Torvalds 		return -ENOMEM;
347*1da177e4SLinus Torvalds 
348*1da177e4SLinus Torvalds 	dreq->pages = pages;
349*1da177e4SLinus Torvalds 	dreq->npages = nr_pages;
350*1da177e4SLinus Torvalds 
351*1da177e4SLinus Torvalds 	rpc_clnt_sigmask(clnt, &oldset);
352*1da177e4SLinus Torvalds 	nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count,
353*1da177e4SLinus Torvalds 				 file_offset);
354*1da177e4SLinus Torvalds 	result = nfs_direct_read_wait(dreq, clnt->cl_intr);
355*1da177e4SLinus Torvalds 	rpc_clnt_sigunmask(clnt, &oldset);
356*1da177e4SLinus Torvalds 
357*1da177e4SLinus Torvalds 	return result;
358*1da177e4SLinus Torvalds }
359*1da177e4SLinus Torvalds 
360*1da177e4SLinus Torvalds /**
361*1da177e4SLinus Torvalds  * nfs_direct_read - For each iov segment, map the user's buffer
362*1da177e4SLinus Torvalds  *                   then generate read RPCs.
363*1da177e4SLinus Torvalds  * @inode: target inode
364*1da177e4SLinus Torvalds  * @ctx: target file open context
365*1da177e4SLinus Torvalds  * @iov: array of vectors that define I/O buffer
366*1da177e4SLinus Torvalds  * file_offset: offset in file to begin the operation
367*1da177e4SLinus Torvalds  * nr_segs: size of iovec array
368*1da177e4SLinus Torvalds  *
369*1da177e4SLinus Torvalds  * We've already pushed out any non-direct writes so that this read
370*1da177e4SLinus Torvalds  * will see them when we read from the server.
371*1da177e4SLinus Torvalds  */
372*1da177e4SLinus Torvalds static ssize_t
373*1da177e4SLinus Torvalds nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
374*1da177e4SLinus Torvalds 		const struct iovec *iov, loff_t file_offset,
375*1da177e4SLinus Torvalds 		unsigned long nr_segs)
376*1da177e4SLinus Torvalds {
377*1da177e4SLinus Torvalds 	ssize_t tot_bytes = 0;
378*1da177e4SLinus Torvalds 	unsigned long seg = 0;
379*1da177e4SLinus Torvalds 
380*1da177e4SLinus Torvalds 	while ((seg < nr_segs) && (tot_bytes >= 0)) {
381*1da177e4SLinus Torvalds 		ssize_t result;
382*1da177e4SLinus Torvalds 		int page_count;
383*1da177e4SLinus Torvalds 		struct page **pages;
384*1da177e4SLinus Torvalds 		const struct iovec *vec = &iov[seg++];
385*1da177e4SLinus Torvalds 		unsigned long user_addr = (unsigned long) vec->iov_base;
386*1da177e4SLinus Torvalds 		size_t size = vec->iov_len;
387*1da177e4SLinus Torvalds 
388*1da177e4SLinus Torvalds                 page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
389*1da177e4SLinus Torvalds                 if (page_count < 0) {
390*1da177e4SLinus Torvalds                         nfs_free_user_pages(pages, 0, 0);
391*1da177e4SLinus Torvalds 			if (tot_bytes > 0)
392*1da177e4SLinus Torvalds 				break;
393*1da177e4SLinus Torvalds                         return page_count;
394*1da177e4SLinus Torvalds                 }
395*1da177e4SLinus Torvalds 
396*1da177e4SLinus Torvalds 		result = nfs_direct_read_seg(inode, ctx, user_addr, size,
397*1da177e4SLinus Torvalds 				file_offset, pages, page_count);
398*1da177e4SLinus Torvalds 
399*1da177e4SLinus Torvalds 		if (result <= 0) {
400*1da177e4SLinus Torvalds 			if (tot_bytes > 0)
401*1da177e4SLinus Torvalds 				break;
402*1da177e4SLinus Torvalds 			return result;
403*1da177e4SLinus Torvalds 		}
404*1da177e4SLinus Torvalds 		tot_bytes += result;
405*1da177e4SLinus Torvalds 		file_offset += result;
406*1da177e4SLinus Torvalds 		if (result < size)
407*1da177e4SLinus Torvalds 			break;
408*1da177e4SLinus Torvalds 	}
409*1da177e4SLinus Torvalds 
410*1da177e4SLinus Torvalds 	return tot_bytes;
411*1da177e4SLinus Torvalds }
412*1da177e4SLinus Torvalds 
413*1da177e4SLinus Torvalds /**
414*1da177e4SLinus Torvalds  * nfs_direct_write_seg - Write out one iov segment.  Generate separate
415*1da177e4SLinus Torvalds  *                        write RPCs for each "wsize" bytes, then commit.
416*1da177e4SLinus Torvalds  * @inode: target inode
417*1da177e4SLinus Torvalds  * @ctx: target file open context
418*1da177e4SLinus Torvalds  * user_addr: starting address of this segment of user's buffer
419*1da177e4SLinus Torvalds  * count: size of this segment
420*1da177e4SLinus Torvalds  * file_offset: offset in file to begin the operation
421*1da177e4SLinus Torvalds  * @pages: array of addresses of page structs defining user's buffer
422*1da177e4SLinus Torvalds  * nr_pages: size of pages array
423*1da177e4SLinus Torvalds  */
424*1da177e4SLinus Torvalds static ssize_t nfs_direct_write_seg(struct inode *inode,
425*1da177e4SLinus Torvalds 		struct nfs_open_context *ctx, unsigned long user_addr,
426*1da177e4SLinus Torvalds 		size_t count, loff_t file_offset, struct page **pages,
427*1da177e4SLinus Torvalds 		int nr_pages)
428*1da177e4SLinus Torvalds {
429*1da177e4SLinus Torvalds 	const unsigned int wsize = NFS_SERVER(inode)->wsize;
430*1da177e4SLinus Torvalds 	size_t request;
431*1da177e4SLinus Torvalds 	int curpage, need_commit;
432*1da177e4SLinus Torvalds 	ssize_t result, tot_bytes;
433*1da177e4SLinus Torvalds 	struct nfs_writeverf first_verf;
434*1da177e4SLinus Torvalds 	struct nfs_write_data *wdata;
435*1da177e4SLinus Torvalds 
436*1da177e4SLinus Torvalds 	wdata = nfs_writedata_alloc();
437*1da177e4SLinus Torvalds 	if (!wdata)
438*1da177e4SLinus Torvalds 		return -ENOMEM;
439*1da177e4SLinus Torvalds 
440*1da177e4SLinus Torvalds 	wdata->inode = inode;
441*1da177e4SLinus Torvalds 	wdata->cred = ctx->cred;
442*1da177e4SLinus Torvalds 	wdata->args.fh = NFS_FH(inode);
443*1da177e4SLinus Torvalds 	wdata->args.context = ctx;
444*1da177e4SLinus Torvalds 	wdata->args.stable = NFS_UNSTABLE;
445*1da177e4SLinus Torvalds 	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
446*1da177e4SLinus Torvalds 		wdata->args.stable = NFS_FILE_SYNC;
447*1da177e4SLinus Torvalds 	wdata->res.fattr = &wdata->fattr;
448*1da177e4SLinus Torvalds 	wdata->res.verf = &wdata->verf;
449*1da177e4SLinus Torvalds 
450*1da177e4SLinus Torvalds 	nfs_begin_data_update(inode);
451*1da177e4SLinus Torvalds retry:
452*1da177e4SLinus Torvalds 	need_commit = 0;
453*1da177e4SLinus Torvalds 	tot_bytes = 0;
454*1da177e4SLinus Torvalds 	curpage = 0;
455*1da177e4SLinus Torvalds 	request = count;
456*1da177e4SLinus Torvalds 	wdata->args.pgbase = user_addr & ~PAGE_MASK;
457*1da177e4SLinus Torvalds 	wdata->args.offset = file_offset;
458*1da177e4SLinus Torvalds 	do {
459*1da177e4SLinus Torvalds 		wdata->args.count = request;
460*1da177e4SLinus Torvalds 		if (wdata->args.count > wsize)
461*1da177e4SLinus Torvalds 			wdata->args.count = wsize;
462*1da177e4SLinus Torvalds 		wdata->args.pages = &pages[curpage];
463*1da177e4SLinus Torvalds 
464*1da177e4SLinus Torvalds 		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
465*1da177e4SLinus Torvalds 			wdata->args.count, (long long) wdata->args.offset,
466*1da177e4SLinus Torvalds 			user_addr + tot_bytes, wdata->args.pgbase, curpage);
467*1da177e4SLinus Torvalds 
468*1da177e4SLinus Torvalds 		lock_kernel();
469*1da177e4SLinus Torvalds 		result = NFS_PROTO(inode)->write(wdata);
470*1da177e4SLinus Torvalds 		unlock_kernel();
471*1da177e4SLinus Torvalds 
472*1da177e4SLinus Torvalds 		if (result <= 0) {
473*1da177e4SLinus Torvalds 			if (tot_bytes > 0)
474*1da177e4SLinus Torvalds 				break;
475*1da177e4SLinus Torvalds 			goto out;
476*1da177e4SLinus Torvalds 		}
477*1da177e4SLinus Torvalds 
478*1da177e4SLinus Torvalds 		if (tot_bytes == 0)
479*1da177e4SLinus Torvalds 			memcpy(&first_verf.verifier, &wdata->verf.verifier,
480*1da177e4SLinus Torvalds 						sizeof(first_verf.verifier));
481*1da177e4SLinus Torvalds 		if (wdata->verf.committed != NFS_FILE_SYNC) {
482*1da177e4SLinus Torvalds 			need_commit = 1;
483*1da177e4SLinus Torvalds 			if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
484*1da177e4SLinus Torvalds 					sizeof(first_verf.verifier)));
485*1da177e4SLinus Torvalds 				goto sync_retry;
486*1da177e4SLinus Torvalds 		}
487*1da177e4SLinus Torvalds 
488*1da177e4SLinus Torvalds 		tot_bytes += result;
489*1da177e4SLinus Torvalds 
490*1da177e4SLinus Torvalds 		/* in case of a short write: stop now, let the app recover */
491*1da177e4SLinus Torvalds 		if (result < wdata->args.count)
492*1da177e4SLinus Torvalds 			break;
493*1da177e4SLinus Torvalds 
494*1da177e4SLinus Torvalds 		wdata->args.offset += result;
495*1da177e4SLinus Torvalds 		wdata->args.pgbase += result;
496*1da177e4SLinus Torvalds 		curpage += wdata->args.pgbase >> PAGE_SHIFT;
497*1da177e4SLinus Torvalds 		wdata->args.pgbase &= ~PAGE_MASK;
498*1da177e4SLinus Torvalds 		request -= result;
499*1da177e4SLinus Torvalds 	} while (request != 0);
500*1da177e4SLinus Torvalds 
501*1da177e4SLinus Torvalds 	/*
502*1da177e4SLinus Torvalds 	 * Commit data written so far, even in the event of an error
503*1da177e4SLinus Torvalds 	 */
504*1da177e4SLinus Torvalds 	if (need_commit) {
505*1da177e4SLinus Torvalds 		wdata->args.count = tot_bytes;
506*1da177e4SLinus Torvalds 		wdata->args.offset = file_offset;
507*1da177e4SLinus Torvalds 
508*1da177e4SLinus Torvalds 		lock_kernel();
509*1da177e4SLinus Torvalds 		result = NFS_PROTO(inode)->commit(wdata);
510*1da177e4SLinus Torvalds 		unlock_kernel();
511*1da177e4SLinus Torvalds 
512*1da177e4SLinus Torvalds 		if (result < 0 || memcmp(&first_verf.verifier,
513*1da177e4SLinus Torvalds 					 &wdata->verf.verifier,
514*1da177e4SLinus Torvalds 					 sizeof(first_verf.verifier)) != 0)
515*1da177e4SLinus Torvalds 			goto sync_retry;
516*1da177e4SLinus Torvalds 	}
517*1da177e4SLinus Torvalds 	result = tot_bytes;
518*1da177e4SLinus Torvalds 
519*1da177e4SLinus Torvalds out:
520*1da177e4SLinus Torvalds 	nfs_end_data_update_defer(inode);
521*1da177e4SLinus Torvalds 	nfs_writedata_free(wdata);
522*1da177e4SLinus Torvalds 	return result;
523*1da177e4SLinus Torvalds 
524*1da177e4SLinus Torvalds sync_retry:
525*1da177e4SLinus Torvalds 	wdata->args.stable = NFS_FILE_SYNC;
526*1da177e4SLinus Torvalds 	goto retry;
527*1da177e4SLinus Torvalds }
528*1da177e4SLinus Torvalds 
529*1da177e4SLinus Torvalds /**
530*1da177e4SLinus Torvalds  * nfs_direct_write - For each iov segment, map the user's buffer
531*1da177e4SLinus Torvalds  *                    then generate write and commit RPCs.
532*1da177e4SLinus Torvalds  * @inode: target inode
533*1da177e4SLinus Torvalds  * @ctx: target file open context
534*1da177e4SLinus Torvalds  * @iov: array of vectors that define I/O buffer
535*1da177e4SLinus Torvalds  * file_offset: offset in file to begin the operation
536*1da177e4SLinus Torvalds  * nr_segs: size of iovec array
537*1da177e4SLinus Torvalds  *
538*1da177e4SLinus Torvalds  * Upon return, generic_file_direct_IO invalidates any cached pages
539*1da177e4SLinus Torvalds  * that non-direct readers might access, so they will pick up these
540*1da177e4SLinus Torvalds  * writes immediately.
541*1da177e4SLinus Torvalds  */
542*1da177e4SLinus Torvalds static ssize_t nfs_direct_write(struct inode *inode,
543*1da177e4SLinus Torvalds 		struct nfs_open_context *ctx, const struct iovec *iov,
544*1da177e4SLinus Torvalds 		loff_t file_offset, unsigned long nr_segs)
545*1da177e4SLinus Torvalds {
546*1da177e4SLinus Torvalds 	ssize_t tot_bytes = 0;
547*1da177e4SLinus Torvalds 	unsigned long seg = 0;
548*1da177e4SLinus Torvalds 
549*1da177e4SLinus Torvalds 	while ((seg < nr_segs) && (tot_bytes >= 0)) {
550*1da177e4SLinus Torvalds 		ssize_t result;
551*1da177e4SLinus Torvalds 		int page_count;
552*1da177e4SLinus Torvalds 		struct page **pages;
553*1da177e4SLinus Torvalds 		const struct iovec *vec = &iov[seg++];
554*1da177e4SLinus Torvalds 		unsigned long user_addr = (unsigned long) vec->iov_base;
555*1da177e4SLinus Torvalds 		size_t size = vec->iov_len;
556*1da177e4SLinus Torvalds 
557*1da177e4SLinus Torvalds                 page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
558*1da177e4SLinus Torvalds                 if (page_count < 0) {
559*1da177e4SLinus Torvalds                         nfs_free_user_pages(pages, 0, 0);
560*1da177e4SLinus Torvalds 			if (tot_bytes > 0)
561*1da177e4SLinus Torvalds 				break;
562*1da177e4SLinus Torvalds                         return page_count;
563*1da177e4SLinus Torvalds                 }
564*1da177e4SLinus Torvalds 
565*1da177e4SLinus Torvalds 		result = nfs_direct_write_seg(inode, ctx, user_addr, size,
566*1da177e4SLinus Torvalds 				file_offset, pages, page_count);
567*1da177e4SLinus Torvalds 		nfs_free_user_pages(pages, page_count, 0);
568*1da177e4SLinus Torvalds 
569*1da177e4SLinus Torvalds 		if (result <= 0) {
570*1da177e4SLinus Torvalds 			if (tot_bytes > 0)
571*1da177e4SLinus Torvalds 				break;
572*1da177e4SLinus Torvalds 			return result;
573*1da177e4SLinus Torvalds 		}
574*1da177e4SLinus Torvalds 		tot_bytes += result;
575*1da177e4SLinus Torvalds 		file_offset += result;
576*1da177e4SLinus Torvalds 		if (result < size)
577*1da177e4SLinus Torvalds 			break;
578*1da177e4SLinus Torvalds 	}
579*1da177e4SLinus Torvalds 	return tot_bytes;
580*1da177e4SLinus Torvalds }
581*1da177e4SLinus Torvalds 
582*1da177e4SLinus Torvalds /**
583*1da177e4SLinus Torvalds  * nfs_direct_IO - NFS address space operation for direct I/O
584*1da177e4SLinus Torvalds  * rw: direction (read or write)
585*1da177e4SLinus Torvalds  * @iocb: target I/O control block
586*1da177e4SLinus Torvalds  * @iov: array of vectors that define I/O buffer
587*1da177e4SLinus Torvalds  * file_offset: offset in file to begin the operation
588*1da177e4SLinus Torvalds  * nr_segs: size of iovec array
589*1da177e4SLinus Torvalds  *
590*1da177e4SLinus Torvalds  */
591*1da177e4SLinus Torvalds ssize_t
592*1da177e4SLinus Torvalds nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
593*1da177e4SLinus Torvalds 		loff_t file_offset, unsigned long nr_segs)
594*1da177e4SLinus Torvalds {
595*1da177e4SLinus Torvalds 	ssize_t result = -EINVAL;
596*1da177e4SLinus Torvalds 	struct file *file = iocb->ki_filp;
597*1da177e4SLinus Torvalds 	struct nfs_open_context *ctx;
598*1da177e4SLinus Torvalds 	struct dentry *dentry = file->f_dentry;
599*1da177e4SLinus Torvalds 	struct inode *inode = dentry->d_inode;
600*1da177e4SLinus Torvalds 
601*1da177e4SLinus Torvalds 	/*
602*1da177e4SLinus Torvalds 	 * No support for async yet
603*1da177e4SLinus Torvalds 	 */
604*1da177e4SLinus Torvalds 	if (!is_sync_kiocb(iocb))
605*1da177e4SLinus Torvalds 		return result;
606*1da177e4SLinus Torvalds 
607*1da177e4SLinus Torvalds 	ctx = (struct nfs_open_context *)file->private_data;
608*1da177e4SLinus Torvalds 	switch (rw) {
609*1da177e4SLinus Torvalds 	case READ:
610*1da177e4SLinus Torvalds 		dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
611*1da177e4SLinus Torvalds 				dentry->d_name.name, file_offset, nr_segs);
612*1da177e4SLinus Torvalds 
613*1da177e4SLinus Torvalds 		result = nfs_direct_read(inode, ctx, iov,
614*1da177e4SLinus Torvalds 						file_offset, nr_segs);
615*1da177e4SLinus Torvalds 		break;
616*1da177e4SLinus Torvalds 	case WRITE:
617*1da177e4SLinus Torvalds 		dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
618*1da177e4SLinus Torvalds 				dentry->d_name.name, file_offset, nr_segs);
619*1da177e4SLinus Torvalds 
620*1da177e4SLinus Torvalds 		result = nfs_direct_write(inode, ctx, iov,
621*1da177e4SLinus Torvalds 						file_offset, nr_segs);
622*1da177e4SLinus Torvalds 		break;
623*1da177e4SLinus Torvalds 	default:
624*1da177e4SLinus Torvalds 		break;
625*1da177e4SLinus Torvalds 	}
626*1da177e4SLinus Torvalds 	return result;
627*1da177e4SLinus Torvalds }
628*1da177e4SLinus Torvalds 
629*1da177e4SLinus Torvalds /**
630*1da177e4SLinus Torvalds  * nfs_file_direct_read - file direct read operation for NFS files
631*1da177e4SLinus Torvalds  * @iocb: target I/O control block
632*1da177e4SLinus Torvalds  * @buf: user's buffer into which to read data
633*1da177e4SLinus Torvalds  * count: number of bytes to read
634*1da177e4SLinus Torvalds  * pos: byte offset in file where reading starts
635*1da177e4SLinus Torvalds  *
636*1da177e4SLinus Torvalds  * We use this function for direct reads instead of calling
637*1da177e4SLinus Torvalds  * generic_file_aio_read() in order to avoid gfar's check to see if
638*1da177e4SLinus Torvalds  * the request starts before the end of the file.  For that check
639*1da177e4SLinus Torvalds  * to work, we must generate a GETATTR before each direct read, and
640*1da177e4SLinus Torvalds  * even then there is a window between the GETATTR and the subsequent
641*1da177e4SLinus Torvalds  * READ where the file size could change.  So our preference is simply
642*1da177e4SLinus Torvalds  * to do all reads the application wants, and the server will take
643*1da177e4SLinus Torvalds  * care of managing the end of file boundary.
644*1da177e4SLinus Torvalds  *
645*1da177e4SLinus Torvalds  * This function also eliminates unnecessarily updating the file's
646*1da177e4SLinus Torvalds  * atime locally, as the NFS server sets the file's atime, and this
647*1da177e4SLinus Torvalds  * client must read the updated atime from the server back into its
648*1da177e4SLinus Torvalds  * cache.
649*1da177e4SLinus Torvalds  */
650*1da177e4SLinus Torvalds ssize_t
651*1da177e4SLinus Torvalds nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
652*1da177e4SLinus Torvalds {
653*1da177e4SLinus Torvalds 	ssize_t retval = -EINVAL;
654*1da177e4SLinus Torvalds 	loff_t *ppos = &iocb->ki_pos;
655*1da177e4SLinus Torvalds 	struct file *file = iocb->ki_filp;
656*1da177e4SLinus Torvalds 	struct nfs_open_context *ctx =
657*1da177e4SLinus Torvalds 			(struct nfs_open_context *) file->private_data;
658*1da177e4SLinus Torvalds 	struct dentry *dentry = file->f_dentry;
659*1da177e4SLinus Torvalds 	struct address_space *mapping = file->f_mapping;
660*1da177e4SLinus Torvalds 	struct inode *inode = mapping->host;
661*1da177e4SLinus Torvalds 	struct iovec iov = {
662*1da177e4SLinus Torvalds 		.iov_base = buf,
663*1da177e4SLinus Torvalds 		.iov_len = count,
664*1da177e4SLinus Torvalds 	};
665*1da177e4SLinus Torvalds 
666*1da177e4SLinus Torvalds 	dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
667*1da177e4SLinus Torvalds 		dentry->d_parent->d_name.name, dentry->d_name.name,
668*1da177e4SLinus Torvalds 		(unsigned long) count, (unsigned long) pos);
669*1da177e4SLinus Torvalds 
670*1da177e4SLinus Torvalds 	if (!is_sync_kiocb(iocb))
671*1da177e4SLinus Torvalds 		goto out;
672*1da177e4SLinus Torvalds 	if (count < 0)
673*1da177e4SLinus Torvalds 		goto out;
674*1da177e4SLinus Torvalds 	retval = -EFAULT;
675*1da177e4SLinus Torvalds 	if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len))
676*1da177e4SLinus Torvalds 		goto out;
677*1da177e4SLinus Torvalds 	retval = 0;
678*1da177e4SLinus Torvalds 	if (!count)
679*1da177e4SLinus Torvalds 		goto out;
680*1da177e4SLinus Torvalds 
681*1da177e4SLinus Torvalds 	if (mapping->nrpages) {
682*1da177e4SLinus Torvalds 		retval = filemap_fdatawrite(mapping);
683*1da177e4SLinus Torvalds 		if (retval == 0)
684*1da177e4SLinus Torvalds 			retval = nfs_wb_all(inode);
685*1da177e4SLinus Torvalds 		if (retval == 0)
686*1da177e4SLinus Torvalds 			retval = filemap_fdatawait(mapping);
687*1da177e4SLinus Torvalds 		if (retval)
688*1da177e4SLinus Torvalds 			goto out;
689*1da177e4SLinus Torvalds 	}
690*1da177e4SLinus Torvalds 
691*1da177e4SLinus Torvalds 	retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
692*1da177e4SLinus Torvalds 	if (retval > 0)
693*1da177e4SLinus Torvalds 		*ppos = pos + retval;
694*1da177e4SLinus Torvalds 
695*1da177e4SLinus Torvalds out:
696*1da177e4SLinus Torvalds 	return retval;
697*1da177e4SLinus Torvalds }
698*1da177e4SLinus Torvalds 
699*1da177e4SLinus Torvalds /**
700*1da177e4SLinus Torvalds  * nfs_file_direct_write - file direct write operation for NFS files
701*1da177e4SLinus Torvalds  * @iocb: target I/O control block
702*1da177e4SLinus Torvalds  * @buf: user's buffer from which to write data
703*1da177e4SLinus Torvalds  * count: number of bytes to write
704*1da177e4SLinus Torvalds  * pos: byte offset in file where writing starts
705*1da177e4SLinus Torvalds  *
706*1da177e4SLinus Torvalds  * We use this function for direct writes instead of calling
707*1da177e4SLinus Torvalds  * generic_file_aio_write() in order to avoid taking the inode
708*1da177e4SLinus Torvalds  * semaphore and updating the i_size.  The NFS server will set
709*1da177e4SLinus Torvalds  * the new i_size and this client must read the updated size
710*1da177e4SLinus Torvalds  * back into its cache.  We let the server do generic write
711*1da177e4SLinus Torvalds  * parameter checking and report problems.
712*1da177e4SLinus Torvalds  *
713*1da177e4SLinus Torvalds  * We also avoid an unnecessary invocation of generic_osync_inode(),
714*1da177e4SLinus Torvalds  * as it is fairly meaningless to sync the metadata of an NFS file.
715*1da177e4SLinus Torvalds  *
716*1da177e4SLinus Torvalds  * We eliminate local atime updates, see direct read above.
717*1da177e4SLinus Torvalds  *
718*1da177e4SLinus Torvalds  * We avoid unnecessary page cache invalidations for normal cached
719*1da177e4SLinus Torvalds  * readers of this file.
720*1da177e4SLinus Torvalds  *
721*1da177e4SLinus Torvalds  * Note that O_APPEND is not supported for NFS direct writes, as there
722*1da177e4SLinus Torvalds  * is no atomic O_APPEND write facility in the NFS protocol.
723*1da177e4SLinus Torvalds  */
724*1da177e4SLinus Torvalds ssize_t
725*1da177e4SLinus Torvalds nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
726*1da177e4SLinus Torvalds {
727*1da177e4SLinus Torvalds 	ssize_t retval = -EINVAL;
728*1da177e4SLinus Torvalds 	loff_t *ppos = &iocb->ki_pos;
729*1da177e4SLinus Torvalds 	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
730*1da177e4SLinus Torvalds 	struct file *file = iocb->ki_filp;
731*1da177e4SLinus Torvalds 	struct nfs_open_context *ctx =
732*1da177e4SLinus Torvalds 			(struct nfs_open_context *) file->private_data;
733*1da177e4SLinus Torvalds 	struct dentry *dentry = file->f_dentry;
734*1da177e4SLinus Torvalds 	struct address_space *mapping = file->f_mapping;
735*1da177e4SLinus Torvalds 	struct inode *inode = mapping->host;
736*1da177e4SLinus Torvalds 	struct iovec iov = {
737*1da177e4SLinus Torvalds 		.iov_base = (char __user *)buf,
738*1da177e4SLinus Torvalds 		.iov_len = count,
739*1da177e4SLinus Torvalds 	};
740*1da177e4SLinus Torvalds 
741*1da177e4SLinus Torvalds 	dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
742*1da177e4SLinus Torvalds 		dentry->d_parent->d_name.name, dentry->d_name.name,
743*1da177e4SLinus Torvalds 		inode->i_ino, (unsigned long) count, (unsigned long) pos);
744*1da177e4SLinus Torvalds 
745*1da177e4SLinus Torvalds 	if (!is_sync_kiocb(iocb))
746*1da177e4SLinus Torvalds 		goto out;
747*1da177e4SLinus Torvalds 	if (count < 0)
748*1da177e4SLinus Torvalds 		goto out;
749*1da177e4SLinus Torvalds         if (pos < 0)
750*1da177e4SLinus Torvalds 		goto out;
751*1da177e4SLinus Torvalds 	retval = -EFAULT;
752*1da177e4SLinus Torvalds 	if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
753*1da177e4SLinus Torvalds 		goto out;
754*1da177e4SLinus Torvalds         if (file->f_error) {
755*1da177e4SLinus Torvalds                 retval = file->f_error;
756*1da177e4SLinus Torvalds                 file->f_error = 0;
757*1da177e4SLinus Torvalds                 goto out;
758*1da177e4SLinus Torvalds         }
759*1da177e4SLinus Torvalds 	retval = -EFBIG;
760*1da177e4SLinus Torvalds 	if (limit != RLIM_INFINITY) {
761*1da177e4SLinus Torvalds 		if (pos >= limit) {
762*1da177e4SLinus Torvalds 			send_sig(SIGXFSZ, current, 0);
763*1da177e4SLinus Torvalds 			goto out;
764*1da177e4SLinus Torvalds 		}
765*1da177e4SLinus Torvalds 		if (count > limit - (unsigned long) pos)
766*1da177e4SLinus Torvalds 			count = limit - (unsigned long) pos;
767*1da177e4SLinus Torvalds 	}
768*1da177e4SLinus Torvalds 	retval = 0;
769*1da177e4SLinus Torvalds 	if (!count)
770*1da177e4SLinus Torvalds 		goto out;
771*1da177e4SLinus Torvalds 
772*1da177e4SLinus Torvalds 	if (mapping->nrpages) {
773*1da177e4SLinus Torvalds 		retval = filemap_fdatawrite(mapping);
774*1da177e4SLinus Torvalds 		if (retval == 0)
775*1da177e4SLinus Torvalds 			retval = nfs_wb_all(inode);
776*1da177e4SLinus Torvalds 		if (retval == 0)
777*1da177e4SLinus Torvalds 			retval = filemap_fdatawait(mapping);
778*1da177e4SLinus Torvalds 		if (retval)
779*1da177e4SLinus Torvalds 			goto out;
780*1da177e4SLinus Torvalds 	}
781*1da177e4SLinus Torvalds 
782*1da177e4SLinus Torvalds 	retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
783*1da177e4SLinus Torvalds 	if (mapping->nrpages)
784*1da177e4SLinus Torvalds 		invalidate_inode_pages2(mapping);
785*1da177e4SLinus Torvalds 	if (retval > 0)
786*1da177e4SLinus Torvalds 		*ppos = pos + retval;
787*1da177e4SLinus Torvalds 
788*1da177e4SLinus Torvalds out:
789*1da177e4SLinus Torvalds 	return retval;
790*1da177e4SLinus Torvalds }
791*1da177e4SLinus Torvalds 
792*1da177e4SLinus Torvalds int nfs_init_directcache(void)
793*1da177e4SLinus Torvalds {
794*1da177e4SLinus Torvalds 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
795*1da177e4SLinus Torvalds 						sizeof(struct nfs_direct_req),
796*1da177e4SLinus Torvalds 						0, SLAB_RECLAIM_ACCOUNT,
797*1da177e4SLinus Torvalds 						NULL, NULL);
798*1da177e4SLinus Torvalds 	if (nfs_direct_cachep == NULL)
799*1da177e4SLinus Torvalds 		return -ENOMEM;
800*1da177e4SLinus Torvalds 
801*1da177e4SLinus Torvalds 	return 0;
802*1da177e4SLinus Torvalds }
803*1da177e4SLinus Torvalds 
804*1da177e4SLinus Torvalds void nfs_destroy_directcache(void)
805*1da177e4SLinus Torvalds {
806*1da177e4SLinus Torvalds 	if (kmem_cache_destroy(nfs_direct_cachep))
807*1da177e4SLinus Torvalds 		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
808*1da177e4SLinus Torvalds }
809