1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * linux/fs/nfs/direct.c
4  *
5  * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
6  *
7  * High-performance uncached I/O for the Linux NFS client
8  *
9  * There are important applications whose performance or correctness
10  * depends on uncached access to file data.  Database clusters
11  * (multiple copies of the same instance running on separate hosts)
12  * implement their own cache coherency protocol that subsumes file
13  * system cache protocols.  Applications that process datasets
14  * considerably larger than the client's memory do not always benefit
15  * from a local cache.  A streaming video server, for instance, has no
16  * need to cache the contents of a file.
17  *
18  * When an application requests uncached I/O, all read and write requests
19  * are made directly to the server; data stored or fetched via these
20  * requests is not cached in the Linux page cache.  The client does not
21  * correct unaligned requests from applications.  All requested bytes are
22  * held on permanent storage before a direct write system call returns to
23  * an application.
24  *
25  * Solaris implements an uncached I/O facility called directio() that
26  * is used for backups and sequential I/O to very large files.  Solaris
27  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
28  * an undocumented mount option.
29  *
30  * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
31  * help from Andrew Morton.
32  *
33  * 18 Dec 2001	Initial implementation for 2.4  --cel
34  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
35  * 08 Jun 2003	Port to 2.5 APIs  --cel
36  * 31 Mar 2004	Handle direct I/O without VFS support  --cel
37  * 15 Sep 2004	Parallel async reads  --cel
38  * 04 May 2005	support O_DIRECT with aio  --cel
39  *
40  */
41 
42 #include <linux/errno.h>
43 #include <linux/sched.h>
44 #include <linux/kernel.h>
45 #include <linux/file.h>
46 #include <linux/pagemap.h>
47 #include <linux/kref.h>
48 #include <linux/slab.h>
49 #include <linux/task_io_accounting_ops.h>
50 #include <linux/module.h>
51 
52 #include <linux/nfs_fs.h>
53 #include <linux/nfs_page.h>
54 #include <linux/sunrpc/clnt.h>
55 
56 #include <linux/uaccess.h>
57 #include <linux/atomic.h>
58 
59 #include "internal.h"
60 #include "iostat.h"
61 #include "pnfs.h"
62 #include "fscache.h"
63 #include "nfstrace.h"
64 
65 #define NFSDBG_FACILITY		NFSDBG_VFS
66 
67 static struct kmem_cache *nfs_direct_cachep;
68 
69 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
70 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
71 static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
72 static void nfs_direct_write_schedule_work(struct work_struct *work);
73 
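/*
 * Each outstanding RPC holds a reference on the dreq's io_count;
 * put_dreq() returns true once the last outstanding I/O has completed.
 */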
74 static inline void get_dreq(struct nfs_direct_req *dreq)
75 {
76 	atomic_inc(&dreq->io_count);
77 }
78 
79 static inline int put_dreq(struct nfs_direct_req *dreq)
80 {
81 	return atomic_dec_and_test(&dreq->io_count);
82 }
83 
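/*
 * Clamp the dreq byte counts when a header reports an error or a short
 * transfer (EOF), and record the first error seen for this request.
 */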
84 static void
85 nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
86 			    const struct nfs_pgio_header *hdr,
87 			    ssize_t dreq_len)
88 {
89 	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
90 	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
91 		return;
92 	if (dreq->max_count >= dreq_len) {
93 		dreq->max_count = dreq_len;
94 		if (dreq->count > dreq_len)
95 			dreq->count = dreq_len;
96 	}
97 
98 	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
99 		dreq->error = hdr->error;
100 }
101 
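/*
 * Update dreq->count with the number of bytes known to have completed
 * successfully, based on the good bytes reported in @hdr.
 */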
102 static void
103 nfs_direct_count_bytes(struct nfs_direct_req *dreq,
104 		       const struct nfs_pgio_header *hdr)
105 {
106 	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
107 	ssize_t dreq_len = 0;
108 
109 	if (hdr_end > dreq->io_start)
110 		dreq_len = hdr_end - dreq->io_start;
111 
112 	nfs_direct_handle_truncated(dreq, hdr, dreq_len);
113 
114 	if (dreq_len > dreq->max_count)
115 		dreq_len = dreq->max_count;
116 
117 	if (dreq->count < dreq_len)
118 		dreq->count = dreq_len;
119 }
120 
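/* Truncate the dreq byte counts so that they stop at the start of @req. */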
121 static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
122 					struct nfs_page *req)
123 {
124 	loff_t offs = req_offset(req);
125 	size_t req_start = (size_t)(offs - dreq->io_start);
126 
127 	if (req_start < dreq->max_count)
128 		dreq->max_count = req_start;
129 	if (req_start < dreq->count)
130 		dreq->count = req_start;
131 }
132 
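/*
 * Grow the cached file size after a direct write that extended the file.
 * Caller must hold inode->i_lock.
 */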
133 static void nfs_direct_file_adjust_size_locked(struct inode *inode,
134 					       loff_t offset, size_t count)
135 {
136 	loff_t newsize = offset + (loff_t)count;
137 	loff_t oldsize = i_size_read(inode);
138 
139 	if (newsize > oldsize) {
140 		i_size_write(inode, newsize);
141 		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
142 		trace_nfs_size_grow(inode, newsize);
143 		nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
144 	}
145 }
146 
147 /**
148  * nfs_swap_rw - NFS address space operation for swap I/O
149  * @iocb: target I/O control block
150  * @iter: I/O buffer
151  *
152  * Perform IO to the swap-file.  This is much like direct IO.
153  */
154 int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
155 {
156 	ssize_t ret;
157 
158 	if (iov_iter_rw(iter) == READ)
159 		ret = nfs_file_direct_read(iocb, iter, true);
160 	else
161 		ret = nfs_file_direct_write(iocb, iter, true);
162 	if (ret < 0)
163 		return ret;
164 	return 0;
165 }
166 
167 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
168 {
169 	unsigned int i;
170 	for (i = 0; i < npages; i++)
171 		put_page(pages[i]);
172 }
173 
174 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
175 			      struct nfs_direct_req *dreq)
176 {
177 	cinfo->inode = dreq->inode;
178 	cinfo->mds = &dreq->mds_cinfo;
179 	cinfo->ds = &dreq->ds_cinfo;
180 	cinfo->dreq = dreq;
181 	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
182 }
183 
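/*
 * Allocate a zeroed nfs_direct_req carrying two references: one dropped
 * by the issuing path, the other when the I/O completes.
 */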
184 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
185 {
186 	struct nfs_direct_req *dreq;
187 
188 	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
189 	if (!dreq)
190 		return NULL;
191 
192 	kref_init(&dreq->kref);
193 	kref_get(&dreq->kref);
194 	init_completion(&dreq->completion);
195 	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
196 	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
197 	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
198 	spin_lock_init(&dreq->lock);
199 
200 	return dreq;
201 }
202 
203 static void nfs_direct_req_free(struct kref *kref)
204 {
205 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
206 
207 	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
208 	if (dreq->l_ctx != NULL)
209 		nfs_put_lock_context(dreq->l_ctx);
210 	if (dreq->ctx != NULL)
211 		put_nfs_open_context(dreq->ctx);
212 	kmem_cache_free(nfs_direct_cachep, dreq);
213 }
214 
215 static void nfs_direct_req_release(struct nfs_direct_req *dreq)
216 {
217 	kref_put(&dreq->kref, nfs_direct_req_free);
218 }
219 
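/* Number of bytes remaining in this direct request from @offset onwards. */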
220 ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
221 {
222 	loff_t start = offset - dreq->io_start;
223 	return dreq->max_count - start;
224 }
225 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
226 
227 /*
228  * Collects and returns the final error value/byte-count.
229  */
230 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
231 {
232 	ssize_t result = -EIOCBQUEUED;
233 
234 	/* Async requests don't wait here */
235 	if (dreq->iocb)
236 		goto out;
237 
238 	result = wait_for_completion_killable(&dreq->completion);
239 
240 	if (!result) {
241 		result = dreq->count;
242 		WARN_ON_ONCE(dreq->count < 0);
243 	}
244 	if (!result)
245 		result = dreq->error;
246 
247 out:
248 	return (ssize_t) result;
249 }
250 
251 /*
252  * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
253  * the iocb is still valid here if this is a synchronous request.
254  */
255 static void nfs_direct_complete(struct nfs_direct_req *dreq)
256 {
257 	struct inode *inode = dreq->inode;
258 
259 	inode_dio_end(inode);
260 
261 	if (dreq->iocb) {
262 		long res = (long) dreq->error;
263 		if (dreq->count != 0) {
264 			res = (long) dreq->count;
265 			WARN_ON_ONCE(dreq->count < 0);
266 		}
267 		dreq->iocb->ki_complete(dreq->iocb, res);
268 	}
269 
270 	complete(&dreq->completion);
271 
272 	nfs_direct_req_release(dreq);
273 }
274 
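/*
 * Per-RPC read completion: mark user pages that received data dirty,
 * release the page requests, and finish the dreq after the last RPC.
 */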
275 static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
276 {
277 	unsigned long bytes = 0;
278 	struct nfs_direct_req *dreq = hdr->dreq;
279 
280 	spin_lock(&dreq->lock);
281 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
282 		spin_unlock(&dreq->lock);
283 		goto out_put;
284 	}
285 
286 	nfs_direct_count_bytes(dreq, hdr);
287 	spin_unlock(&dreq->lock);
288 
289 	while (!list_empty(&hdr->pages)) {
290 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
291 		struct page *page = req->wb_page;
292 
293 		if (!PageCompound(page) && bytes < hdr->good_bytes &&
294 		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
295 			set_page_dirty(page);
296 		bytes += req->wb_bytes;
297 		nfs_list_remove_request(req);
298 		nfs_release_request(req);
299 	}
300 out_put:
301 	if (put_dreq(dreq))
302 		nfs_direct_complete(dreq);
303 	hdr->release(hdr);
304 }
305 
306 static void nfs_read_sync_pgio_error(struct list_head *head, int error)
307 {
308 	struct nfs_page *req;
309 
310 	while (!list_empty(head)) {
311 		req = nfs_list_entry(head->next);
312 		nfs_list_remove_request(req);
313 		nfs_release_request(req);
314 	}
315 }
316 
317 static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
318 {
319 	get_dreq(hdr->dreq);
320 }
321 
322 static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
323 	.error_cleanup = nfs_read_sync_pgio_error,
324 	.init_hdr = nfs_direct_pgio_init,
325 	.completion = nfs_direct_read_completion,
326 };
327 
328 /*
329  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
330  * operation.  If iov_iter_get_pages_alloc2() or nfs_page_create_from_page()
331  * fails, bail and stop sending more reads.  Read length accounting is
332  * handled automatically by nfs_direct_read_completion().  Otherwise,
333  * if no requests have been sent, just return an error.
334  */
335 
336 static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
337 					      struct iov_iter *iter,
338 					      loff_t pos)
339 {
340 	struct nfs_pageio_descriptor desc;
341 	struct inode *inode = dreq->inode;
342 	ssize_t result = -EINVAL;
343 	size_t requested_bytes = 0;
344 	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
345 
346 	nfs_pageio_init_read(&desc, dreq->inode, false,
347 			     &nfs_direct_read_completion_ops);
348 	get_dreq(dreq);
349 	desc.pg_dreq = dreq;
350 	inode_dio_begin(inode);
351 
352 	while (iov_iter_count(iter)) {
353 		struct page **pagevec;
354 		size_t bytes;
355 		size_t pgbase;
356 		unsigned npages, i;
357 
358 		result = iov_iter_get_pages_alloc2(iter, &pagevec,
359 						  rsize, &pgbase);
360 		if (result < 0)
361 			break;
362 
363 		bytes = result;
364 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
365 		for (i = 0; i < npages; i++) {
366 			struct nfs_page *req;
367 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
368 			/* XXX do we need to do the eof zeroing found in async_filler? */
369 			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
370 							pgbase, pos, req_len);
371 			if (IS_ERR(req)) {
372 				result = PTR_ERR(req);
373 				break;
374 			}
375 			if (!nfs_pageio_add_request(&desc, req)) {
376 				result = desc.pg_error;
377 				nfs_release_request(req);
378 				break;
379 			}
380 			pgbase = 0;
381 			bytes -= req_len;
382 			requested_bytes += req_len;
383 			pos += req_len;
384 			dreq->bytes_left -= req_len;
385 		}
386 		nfs_direct_release_pages(pagevec, npages);
387 		kvfree(pagevec);
388 		if (result < 0)
389 			break;
390 	}
391 
392 	nfs_pageio_complete(&desc);
393 
394 	/*
395 	 * If no bytes were started, return the error, and let the
396 	 * generic layer handle the completion.
397 	 */
398 	if (requested_bytes == 0) {
399 		inode_dio_end(inode);
400 		nfs_direct_req_release(dreq);
401 		return result < 0 ? result : -EIO;
402 	}
403 
404 	if (put_dreq(dreq))
405 		nfs_direct_complete(dreq);
406 	return requested_bytes;
407 }
408 
409 /**
410  * nfs_file_direct_read - file direct read operation for NFS files
411  * @iocb: target I/O control block
412  * @iter: vector of user buffers into which to read data
413  * @swap: flag indicating this is swap IO, not O_DIRECT IO
414  *
415  * We use this function for direct reads instead of calling
416  * generic_file_aio_read() in order to avoid gfar's check to see if
417  * the request starts before the end of the file.  For that check
418  * to work, we must generate a GETATTR before each direct read, and
419  * even then there is a window between the GETATTR and the subsequent
420  * READ where the file size could change.  Our preference is simply
421  * to do all reads the application wants, and the server will take
422  * care of managing the end of file boundary.
423  *
424  * This function also eliminates unnecessarily updating the file's
425  * atime locally, as the NFS server sets the file's atime, and this
426  * client must read the updated atime from the server back into its
427  * cache.
428  */
429 ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
430 			     bool swap)
431 {
432 	struct file *file = iocb->ki_filp;
433 	struct address_space *mapping = file->f_mapping;
434 	struct inode *inode = mapping->host;
435 	struct nfs_direct_req *dreq;
436 	struct nfs_lock_context *l_ctx;
437 	ssize_t result, requested;
438 	size_t count = iov_iter_count(iter);
439 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
440 
441 	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
442 		file, count, (long long) iocb->ki_pos);
443 
444 	result = 0;
445 	if (!count)
446 		goto out;
447 
448 	task_io_account_read(count);
449 
450 	result = -ENOMEM;
451 	dreq = nfs_direct_req_alloc();
452 	if (dreq == NULL)
453 		goto out;
454 
455 	dreq->inode = inode;
456 	dreq->bytes_left = dreq->max_count = count;
457 	dreq->io_start = iocb->ki_pos;
458 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
459 	l_ctx = nfs_get_lock_context(dreq->ctx);
460 	if (IS_ERR(l_ctx)) {
461 		result = PTR_ERR(l_ctx);
462 		nfs_direct_req_release(dreq);
463 		goto out_release;
464 	}
465 	dreq->l_ctx = l_ctx;
466 	if (!is_sync_kiocb(iocb))
467 		dreq->iocb = iocb;
468 
469 	if (user_backed_iter(iter))
470 		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
471 
472 	if (!swap)
473 		nfs_start_io_direct(inode);
474 
475 	NFS_I(inode)->read_io += count;
476 	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
477 
478 	if (!swap)
479 		nfs_end_io_direct(inode);
480 
481 	if (requested > 0) {
482 		result = nfs_direct_wait(dreq);
483 		if (result > 0) {
484 			requested -= result;
485 			iocb->ki_pos += result;
486 		}
487 		iov_iter_revert(iter, requested);
488 	} else {
489 		result = requested;
490 	}
491 
492 out_release:
493 	nfs_direct_req_release(dreq);
494 out:
495 	return result;
496 }
497 
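/*
 * Lock the head of a page group and add it to @list (with extra
 * references) so that nfs_direct_join_group() can rejoin its subrequests.
 */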
498 static void nfs_direct_add_page_head(struct list_head *list,
499 				     struct nfs_page *req)
500 {
501 	struct nfs_page *head = req->wb_head;
502 
503 	if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
504 		return;
505 	if (!list_empty(&head->wb_list)) {
506 		nfs_unlock_request(head);
507 		return;
508 	}
509 	list_add(&head->wb_list, list);
510 	kref_get(&head->wb_kref);
511 	kref_get(&head->wb_kref);
512 }
513 
514 static void nfs_direct_join_group(struct list_head *list,
515 				  struct nfs_commit_info *cinfo,
516 				  struct inode *inode)
517 {
518 	struct nfs_page *req, *subreq;
519 
520 	list_for_each_entry(req, list, wb_list) {
521 		if (req->wb_head != req) {
522 			nfs_direct_add_page_head(&req->wb_list, req);
523 			continue;
524 		}
525 		subreq = req->wb_this_page;
526 		if (subreq == req)
527 			continue;
528 		do {
529 			/*
530 			 * Remove subrequests from this list before freeing
531 			 * them in the call to nfs_join_page_group().
532 			 */
533 			if (!list_empty(&subreq->wb_list)) {
534 				nfs_list_remove_request(subreq);
535 				nfs_release_request(subreq);
536 			}
537 		} while ((subreq = subreq->wb_this_page) != req);
538 		nfs_join_page_group(req, cinfo, inode);
539 	}
540 }
541 
542 static void
543 nfs_direct_write_scan_commit_list(struct inode *inode,
544 				  struct list_head *list,
545 				  struct nfs_commit_info *cinfo)
546 {
547 	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
548 	pnfs_recover_commit_reqs(list, cinfo);
549 	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
550 	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
551 }
552 
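/*
 * Resend every write that is still waiting for a commit, e.g. after the
 * server rebooted and returned a new write verifier.
 */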
553 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
554 {
555 	struct nfs_pageio_descriptor desc;
556 	struct nfs_page *req;
557 	LIST_HEAD(reqs);
558 	struct nfs_commit_info cinfo;
559 
560 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
561 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
562 
563 	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
564 
565 	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
566 	get_dreq(dreq);
567 
568 	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
569 			      &nfs_direct_write_completion_ops);
570 	desc.pg_dreq = dreq;
571 
572 	while (!list_empty(&reqs)) {
573 		req = nfs_list_entry(reqs.next);
574 		/* Bump the transmission count */
575 		req->wb_nio++;
576 		if (!nfs_pageio_add_request(&desc, req)) {
577 			spin_lock(&dreq->lock);
578 			if (dreq->error < 0) {
579 				desc.pg_error = dreq->error;
580 			} else if (desc.pg_error != -EAGAIN) {
581 				dreq->flags = 0;
582 				if (!desc.pg_error)
583 					desc.pg_error = -EIO;
584 				dreq->error = desc.pg_error;
585 			} else
586 				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
587 			spin_unlock(&dreq->lock);
588 			break;
589 		}
590 		nfs_release_request(req);
591 	}
592 	nfs_pageio_complete(&desc);
593 
594 	while (!list_empty(&reqs)) {
595 		req = nfs_list_entry(reqs.next);
596 		nfs_list_remove_request(req);
597 		nfs_unlock_and_release_request(req);
598 		if (desc.pg_error == -EAGAIN) {
599 			nfs_mark_request_commit(req, NULL, &cinfo, 0);
600 		} else {
601 			spin_lock(&dreq->lock);
602 			nfs_direct_truncate_request(dreq, req);
603 			spin_unlock(&dreq->lock);
604 			nfs_release_request(req);
605 		}
606 	}
607 
608 	if (put_dreq(dreq))
609 		nfs_direct_write_complete(dreq);
610 }
611 
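/*
 * COMMIT completion: release requests that are safe on stable storage,
 * resend those whose verifier no longer matches, and fail the dreq on a
 * fatal commit error.
 */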
612 static void nfs_direct_commit_complete(struct nfs_commit_data *data)
613 {
614 	const struct nfs_writeverf *verf = data->res.verf;
615 	struct nfs_direct_req *dreq = data->dreq;
616 	struct nfs_commit_info cinfo;
617 	struct nfs_page *req;
618 	int status = data->task.tk_status;
619 
620 	trace_nfs_direct_commit_complete(dreq);
621 
622 	if (status < 0) {
623 		/* Errors in commit are fatal */
624 		dreq->error = status;
625 		dreq->flags = NFS_ODIRECT_DONE;
626 	} else {
627 		status = dreq->error;
628 	}
629 
630 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
631 
632 	while (!list_empty(&data->pages)) {
633 		req = nfs_list_entry(data->pages.next);
634 		nfs_list_remove_request(req);
635 		if (status < 0) {
636 			spin_lock(&dreq->lock);
637 			nfs_direct_truncate_request(dreq, req);
638 			spin_unlock(&dreq->lock);
639 			nfs_release_request(req);
640 		} else if (!nfs_write_match_verf(verf, req)) {
641 			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
642 			/*
643 			 * Despite the reboot, the write was successful,
644 			 * so reset wb_nio.
645 			 */
646 			req->wb_nio = 0;
647 			nfs_mark_request_commit(req, NULL, &cinfo, 0);
648 		} else
649 			nfs_release_request(req);
650 		nfs_unlock_and_release_request(req);
651 	}
652 
653 	if (nfs_commit_end(cinfo.mds))
654 		nfs_direct_write_complete(dreq);
655 }
656 
657 static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
658 		struct nfs_page *req)
659 {
660 	struct nfs_direct_req *dreq = cinfo->dreq;
661 
662 	trace_nfs_direct_resched_write(dreq);
663 
664 	spin_lock(&dreq->lock);
665 	if (dreq->flags != NFS_ODIRECT_DONE)
666 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
667 	spin_unlock(&dreq->lock);
668 	nfs_mark_request_commit(req, NULL, cinfo, 0);
669 }
670 
671 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
672 	.completion = nfs_direct_commit_complete,
673 	.resched_write = nfs_direct_resched_write,
674 };
675 
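/* Send a COMMIT for all unstable writes belonging to this direct request. */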
676 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
677 {
678 	int res;
679 	struct nfs_commit_info cinfo;
680 	LIST_HEAD(mds_list);
681 
682 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
683 	nfs_commit_begin(cinfo.mds);
684 	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
685 	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
686 	if (res < 0) { /* res == -ENOMEM */
687 		spin_lock(&dreq->lock);
688 		if (dreq->flags == 0)
689 			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
690 		spin_unlock(&dreq->lock);
691 	}
692 	if (nfs_commit_end(cinfo.mds))
693 		nfs_direct_write_complete(dreq);
694 }
695 
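/*
 * Abandon any requests still awaiting a commit and truncate the dreq
 * byte counts accordingly.
 */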
696 static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
697 {
698 	struct nfs_commit_info cinfo;
699 	struct nfs_page *req;
700 	LIST_HEAD(reqs);
701 
702 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
703 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
704 
705 	while (!list_empty(&reqs)) {
706 		req = nfs_list_entry(reqs.next);
707 		nfs_list_remove_request(req);
708 		nfs_direct_truncate_request(dreq, req);
709 		nfs_release_request(req);
710 		nfs_unlock_and_release_request(req);
711 	}
712 }
713 
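/*
 * Deferred completion for direct writes: depending on dreq->flags, send
 * a COMMIT, resend unstable writes, or tear down and complete the dreq.
 */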
714 static void nfs_direct_write_schedule_work(struct work_struct *work)
715 {
716 	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
717 	int flags = dreq->flags;
718 
719 	dreq->flags = 0;
720 	switch (flags) {
721 		case NFS_ODIRECT_DO_COMMIT:
722 			nfs_direct_commit_schedule(dreq);
723 			break;
724 		case NFS_ODIRECT_RESCHED_WRITES:
725 			nfs_direct_write_reschedule(dreq);
726 			break;
727 		default:
728 			nfs_direct_write_clear_reqs(dreq);
729 			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
730 			nfs_direct_complete(dreq);
731 	}
732 }
733 
734 static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
735 {
736 	trace_nfs_direct_write_complete(dreq);
737 	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
738 }
739 
740 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
741 {
742 	struct nfs_direct_req *dreq = hdr->dreq;
743 	struct nfs_commit_info cinfo;
744 	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
745 	struct inode *inode = dreq->inode;
746 	int flags = NFS_ODIRECT_DONE;
747 
748 	trace_nfs_direct_write_completion(dreq);
749 
750 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
751 
752 	spin_lock(&dreq->lock);
753 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
754 		spin_unlock(&dreq->lock);
755 		goto out_put;
756 	}
757 
758 	nfs_direct_count_bytes(dreq, hdr);
759 	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
760 	    !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
761 		if (!dreq->flags)
762 			dreq->flags = NFS_ODIRECT_DO_COMMIT;
763 		flags = dreq->flags;
764 	}
765 	spin_unlock(&dreq->lock);
766 
767 	spin_lock(&inode->i_lock);
768 	nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count);
769 	spin_unlock(&inode->i_lock);
770 
771 	while (!list_empty(&hdr->pages)) {
772 
773 		req = nfs_list_entry(hdr->pages.next);
774 		nfs_list_remove_request(req);
775 		if (flags == NFS_ODIRECT_DO_COMMIT) {
776 			kref_get(&req->wb_kref);
777 			memcpy(&req->wb_verf, &hdr->verf.verifier,
778 			       sizeof(req->wb_verf));
779 			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
780 				hdr->ds_commit_idx);
781 		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
782 			kref_get(&req->wb_kref);
783 			nfs_mark_request_commit(req, NULL, &cinfo, 0);
784 		}
785 		nfs_unlock_and_release_request(req);
786 	}
787 
788 out_put:
789 	if (put_dreq(dreq))
790 		nfs_direct_write_complete(dreq);
791 	hdr->release(hdr);
792 }
793 
794 static void nfs_write_sync_pgio_error(struct list_head *head, int error)
795 {
796 	struct nfs_page *req;
797 
798 	while (!list_empty(head)) {
799 		req = nfs_list_entry(head->next);
800 		nfs_list_remove_request(req);
801 		nfs_unlock_and_release_request(req);
802 	}
803 }
804 
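/*
 * A write RPC is being redone: requeue its requests on the commit list
 * so that nfs_direct_write_reschedule() resends them.
 */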
805 static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
806 {
807 	struct nfs_direct_req *dreq = hdr->dreq;
808 	struct nfs_page *req;
809 	struct nfs_commit_info cinfo;
810 
811 	trace_nfs_direct_write_reschedule_io(dreq);
812 
813 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
814 	spin_lock(&dreq->lock);
815 	if (dreq->error == 0)
816 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
817 	set_bit(NFS_IOHDR_REDO, &hdr->flags);
818 	spin_unlock(&dreq->lock);
819 	while (!list_empty(&hdr->pages)) {
820 		req = nfs_list_entry(hdr->pages.next);
821 		nfs_list_remove_request(req);
822 		nfs_unlock_request(req);
823 		nfs_mark_request_commit(req, NULL, &cinfo, 0);
824 	}
825 }
826 
827 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
828 	.error_cleanup = nfs_write_sync_pgio_error,
829 	.init_hdr = nfs_direct_pgio_init,
830 	.completion = nfs_direct_write_completion,
831 	.reschedule_io = nfs_direct_write_reschedule_io,
832 };
833 
834 
835 /*
836  * NB: Return the value of the first error return code.  Subsequent
837  *     errors after the first one are ignored.
838  */
839 /*
840  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
841  * operation.  If iov_iter_get_pages_alloc2() or nfs_page_create_from_page()
842  * fails, bail and stop sending more writes.  Write length accounting is
843  * handled automatically by nfs_direct_write_completion().  Otherwise,
844  * if no requests have been sent, just return an error.
845  */
846 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
847 					       struct iov_iter *iter,
848 					       loff_t pos, int ioflags)
849 {
850 	struct nfs_pageio_descriptor desc;
851 	struct inode *inode = dreq->inode;
852 	struct nfs_commit_info cinfo;
853 	ssize_t result = 0;
854 	size_t requested_bytes = 0;
855 	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
856 	bool defer = false;
857 
858 	trace_nfs_direct_write_schedule_iovec(dreq);
859 
860 	nfs_pageio_init_write(&desc, inode, ioflags, false,
861 			      &nfs_direct_write_completion_ops);
862 	desc.pg_dreq = dreq;
863 	get_dreq(dreq);
864 	inode_dio_begin(inode);
865 
866 	NFS_I(inode)->write_io += iov_iter_count(iter);
867 	while (iov_iter_count(iter)) {
868 		struct page **pagevec;
869 		size_t bytes;
870 		size_t pgbase;
871 		unsigned npages, i;
872 
873 		result = iov_iter_get_pages_alloc2(iter, &pagevec,
874 						  wsize, &pgbase);
875 		if (result < 0)
876 			break;
877 
878 		bytes = result;
879 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
880 		for (i = 0; i < npages; i++) {
881 			struct nfs_page *req;
882 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
883 
884 			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
885 							pgbase, pos, req_len);
886 			if (IS_ERR(req)) {
887 				result = PTR_ERR(req);
888 				break;
889 			}
890 
891 			if (desc.pg_error < 0) {
892 				nfs_free_request(req);
893 				result = desc.pg_error;
894 				break;
895 			}
896 
897 			pgbase = 0;
898 			bytes -= req_len;
899 			requested_bytes += req_len;
900 			pos += req_len;
901 			dreq->bytes_left -= req_len;
902 
903 			if (defer) {
904 				nfs_mark_request_commit(req, NULL, &cinfo, 0);
905 				continue;
906 			}
907 
908 			nfs_lock_request(req);
909 			if (nfs_pageio_add_request(&desc, req))
910 				continue;
911 
912 			/* Exit on hard errors */
913 			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
914 				result = desc.pg_error;
915 				nfs_unlock_and_release_request(req);
916 				break;
917 			}
918 
919 			/* If the error is soft, defer remaining requests */
920 			nfs_init_cinfo_from_dreq(&cinfo, dreq);
921 			spin_lock(&dreq->lock);
922 			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
923 			spin_unlock(&dreq->lock);
924 			nfs_unlock_request(req);
925 			nfs_mark_request_commit(req, NULL, &cinfo, 0);
926 			desc.pg_error = 0;
927 			defer = true;
928 		}
929 		nfs_direct_release_pages(pagevec, npages);
930 		kvfree(pagevec);
931 		if (result < 0)
932 			break;
933 	}
934 	nfs_pageio_complete(&desc);
935 
936 	/*
937 	 * If no bytes were started, return the error, and let the
938 	 * generic layer handle the completion.
939 	 */
940 	if (requested_bytes == 0) {
941 		inode_dio_end(inode);
942 		nfs_direct_req_release(dreq);
943 		return result < 0 ? result : -EIO;
944 	}
945 
946 	if (put_dreq(dreq))
947 		nfs_direct_write_complete(dreq);
948 	return requested_bytes;
949 }
950 
951 /**
952  * nfs_file_direct_write - file direct write operation for NFS files
953  * @iocb: target I/O control block
954  * @iter: vector of user buffers from which to write data
955  * @swap: flag indicating this is swap IO, not O_DIRECT IO
956  *
957  * We use this function for direct writes instead of calling
958  * generic_file_aio_write() in order to avoid taking the inode
959  * semaphore and updating the i_size.  The NFS server will set
960  * the new i_size and this client must read the updated size
961  * back into its cache.  We let the server do generic write
962  * parameter checking and report problems.
963  *
964  * We eliminate local atime updates, see direct read above.
965  *
966  * We avoid unnecessary page cache invalidations for normal cached
967  * readers of this file.
968  *
969  * Note that O_APPEND is not supported for NFS direct writes, as there
970  * is no atomic O_APPEND write facility in the NFS protocol.
971  */
972 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
973 			      bool swap)
974 {
975 	ssize_t result, requested;
976 	size_t count;
977 	struct file *file = iocb->ki_filp;
978 	struct address_space *mapping = file->f_mapping;
979 	struct inode *inode = mapping->host;
980 	struct nfs_direct_req *dreq;
981 	struct nfs_lock_context *l_ctx;
982 	loff_t pos, end;
983 
984 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
985 		file, iov_iter_count(iter), (long long) iocb->ki_pos);
986 
987 	if (swap)
988 		/* bypass generic checks */
989 		result =  iov_iter_count(iter);
990 	else
991 		result = generic_write_checks(iocb, iter);
992 	if (result <= 0)
993 		return result;
994 	count = result;
995 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
996 
997 	pos = iocb->ki_pos;
998 	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
999 
1000 	task_io_account_write(count);
1001 
1002 	result = -ENOMEM;
1003 	dreq = nfs_direct_req_alloc();
1004 	if (!dreq)
1005 		goto out;
1006 
1007 	dreq->inode = inode;
1008 	dreq->bytes_left = dreq->max_count = count;
1009 	dreq->io_start = pos;
1010 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
1011 	l_ctx = nfs_get_lock_context(dreq->ctx);
1012 	if (IS_ERR(l_ctx)) {
1013 		result = PTR_ERR(l_ctx);
1014 		nfs_direct_req_release(dreq);
1015 		goto out_release;
1016 	}
1017 	dreq->l_ctx = l_ctx;
1018 	if (!is_sync_kiocb(iocb))
1019 		dreq->iocb = iocb;
1020 	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
1021 
1022 	if (swap) {
1023 		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
1024 							    FLUSH_STABLE);
1025 	} else {
1026 		nfs_start_io_direct(inode);
1027 
1028 		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
1029 							    FLUSH_COND_STABLE);
1030 
1031 		if (mapping->nrpages) {
1032 			invalidate_inode_pages2_range(mapping,
1033 						      pos >> PAGE_SHIFT, end);
1034 		}
1035 
1036 		nfs_end_io_direct(inode);
1037 	}
1038 
1039 	if (requested > 0) {
1040 		result = nfs_direct_wait(dreq);
1041 		if (result > 0) {
1042 			requested -= result;
1043 			iocb->ki_pos = pos + result;
1044 			/* XXX: should check the generic_write_sync retval */
1045 			generic_write_sync(iocb, result);
1046 		}
1047 		iov_iter_revert(iter, requested);
1048 	} else {
1049 		result = requested;
1050 	}
1051 	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
1052 out_release:
1053 	nfs_direct_req_release(dreq);
1054 out:
1055 	return result;
1056 }
1057 
1058 /**
1059  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1060  *
1061  */
1062 int __init nfs_init_directcache(void)
1063 {
1064 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1065 						sizeof(struct nfs_direct_req),
1066 						0, (SLAB_RECLAIM_ACCOUNT|
1067 							SLAB_MEM_SPREAD),
1068 						NULL);
1069 	if (nfs_direct_cachep == NULL)
1070 		return -ENOMEM;
1071 
1072 	return 0;
1073 }
1074 
1075 /**
1076  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1077  *
1078  */
1079 void nfs_destroy_directcache(void)
1080 {
1081 	kmem_cache_destroy(nfs_direct_cachep);
1082 }
1083