// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data. Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols. Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache. A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache. The client does not
 * correct unaligned requests from applications. All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files. Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4 --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs --cel
 * 31 Mar 2004	Handle direct I/O without VFS support --cel
 * 15 Sep 2004	Parallel async reads --cel
 * 04 May 2005	support O_DIRECT with aio --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
#include "nfstrace.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

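/*
 * Each nfs_direct_req carries an io_count of outstanding page I/O
 * operations; get_dreq()/put_dreq() take and drop those references, and
 * the caller that sees put_dreq() return true completes the request.
 */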
static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

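/*
 * nfs_direct_handle_truncated() clamps dreq->max_count and dreq->count
 * when a completed pgio header reports an error or EOF, and records the
 * first error seen. nfs_direct_count_bytes() then credits the bytes that
 * actually completed, never counting beyond max_count.
 */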
static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
		return;
	if (dreq->max_count >= dreq_len) {
		dreq->max_count = dreq_len;
		if (dreq->count > dreq_len)
			dreq->count = dreq_len;
	}

	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
		dreq->error = hdr->error;
}

static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t dreq_len = 0;

	if (hdr_end > dreq->io_start)
		dreq_len = hdr_end - dreq->io_start;

	nfs_direct_handle_truncated(dreq, hdr, dreq_len);

	if (dreq_len > dreq->max_count)
		dreq_len = dreq->max_count;

	if (dreq->count < dreq_len)
		dreq->count = dreq_len;
}

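/*
 * Trim dreq->count and dreq->max_count so they no longer cover @req or
 * anything after it; used when a queued write request is dropped on error.
 */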
static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
					struct nfs_page *req)
{
	loff_t offs = req_offset(req);
	size_t req_start = (size_t)(offs - dreq->io_start);

	if (req_start < dreq->max_count)
		dreq->max_count = req_start;
	if (req_start < dreq->count)
		dreq->count = req_start;
}

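/*
 * A successful direct write may extend the file; update the cached inode
 * size to match. The caller must hold inode->i_lock.
 */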
static void nfs_direct_file_adjust_size_locked(struct inode *inode,
					       loff_t offset, size_t count)
{
	loff_t newsize = offset + (loff_t)count;
	loff_t oldsize = i_size_read(inode);

	if (newsize > oldsize) {
		i_size_write(inode, newsize);
		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
		trace_nfs_size_grow(inode, newsize);
		nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
	}
}

/**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * Perform IO to the swap-file. This is much like direct IO.
 */
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t ret;

	if (iov_iter_rw(iter) == READ)
		ret = nfs_file_direct_read(iocb, iter, true);
	else
		ret = nfs_file_direct_write(iocb, iter, true);
	if (ret < 0)
		return ret;
	return 0;
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->inode = dreq->inode;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

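/*
 * Allocate an nfs_direct_req and take two references on it: kref_init()
 * provides the one dropped by the issuing path via nfs_direct_req_release(),
 * and the extra kref_get() is dropped when I/O completes in
 * nfs_direct_complete().
 */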
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
	loff_t start = offset - dreq->io_start;
	return dreq->max_count - start;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result) {
		result = dreq->count;
		WARN_ON_ONCE(dreq->count < 0);
	}
	if (!result)
		result = dreq->error;

out:
	return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	struct inode *inode = dreq->inode;

	inode_dio_end(inode);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (dreq->count != 0) {
			res = (long) dreq->count;
			WARN_ON_ONCE(dreq->count < 0);
		}
		dreq->iocb->ki_complete(dreq->iocb, res);
	}

	complete(&dreq->completion);

	nfs_direct_req_release(dreq);
}

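/*
 * Per-header completion for direct reads: account the bytes that arrived,
 * mark user pages dirty when the buffer is user backed, and release the
 * page requests. The final put_dreq() completes the whole direct request.
 */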
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes &&
		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation. If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads. Read length accounting is
 * handled automatically by nfs_direct_read_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	inode_dio_begin(inode);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						   rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file. For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change. Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result, requested;
	size_t count = iov_iter_count(iter);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		 file, count, (long long) iocb->ki_pos);

	result = 0;
	if (!count)
		goto out;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = iocb->ki_pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	if (user_backed_iter(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;

	if (!swap)
		nfs_start_io_direct(inode);

	NFS_I(inode)->read_io += count;
	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);

	if (!swap)
		nfs_end_io_direct(inode);

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos += result;
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}

out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

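/*
 * Ensure the group head of @req is on @list so that the whole page group
 * is rescheduled together. The head is locked and extra references are
 * taken to keep it alive while it sits on the list.
 */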
static void nfs_direct_add_page_head(struct list_head *list,
				     struct nfs_page *req)
{
	struct nfs_page *head = req->wb_head;

	if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
		return;
	if (!list_empty(&head->wb_list)) {
		nfs_unlock_request(head);
		return;
	}
	list_add(&head->wb_list, list);
	kref_get(&head->wb_kref);
	kref_get(&head->wb_kref);
}

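/*
 * Walk @list and collapse each page group back into a single head request:
 * subrequests are dropped from the list and then merged into the head via
 * nfs_join_page_group(), so rescheduling operates on whole groups.
 */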
static void nfs_direct_join_group(struct list_head *list,
				  struct nfs_commit_info *cinfo,
				  struct inode *inode)
{
	struct nfs_page *req, *subreq;

	list_for_each_entry(req, list, wb_list) {
		if (req->wb_head != req) {
			nfs_direct_add_page_head(&req->wb_list, req);
			continue;
		}
		subreq = req->wb_this_page;
		if (subreq == req)
			continue;
		do {
			/*
			 * Remove subrequests from this list before freeing
			 * them in the call to nfs_join_page_group().
			 */
			if (!list_empty(&subreq->wb_list)) {
				nfs_list_remove_request(subreq);
				nfs_release_request(subreq);
			}
		} while ((subreq = subreq->wb_this_page) != req);
		nfs_join_page_group(req, cinfo, inode);
	}
}

static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
	pnfs_recover_commit_reqs(list, cinfo);
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}

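/*
 * Resend every write request still sitting on the commit lists, bumping
 * each request's retransmit count. Requests that cannot be requeued are
 * either put back on the commit list (-EAGAIN) or truncated out of the
 * final byte count.
 */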
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);

	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		/* Bump the transmission count */
		req->wb_nio++;
		if (!nfs_pageio_add_request(&desc, req)) {
			spin_lock(&dreq->lock);
			if (dreq->error < 0) {
				desc.pg_error = dreq->error;
			} else if (desc.pg_error != -EAGAIN) {
				dreq->flags = 0;
				if (!desc.pg_error)
					desc.pg_error = -EIO;
				dreq->error = desc.pg_error;
			} else
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			break;
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
		if (desc.pg_error == -EAGAIN) {
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		}
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
}

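/*
 * COMMIT completion. A failed commit is fatal for the direct request; a
 * verifier mismatch means the server rebooted after the unstable write,
 * so the affected requests are queued to be written again.
 */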
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	const struct nfs_writeverf *verf = data->res.verf;
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	trace_nfs_direct_commit_complete(dreq);

	if (status < 0) {
		/* Errors in commit are fatal */
		dreq->error = status;
		dreq->flags = NFS_ODIRECT_DONE;
	} else {
		status = dreq->error;
	}

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (status < 0) {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		} else if (!nfs_write_match_verf(verf, req)) {
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			/*
			 * Despite the reboot, the write was successful,
			 * so reset wb_nio.
			 */
			req->wb_nio = 0;
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
				     struct nfs_page *req)
{
	struct nfs_direct_req *dreq = cinfo->dreq;

	trace_nfs_direct_resched_write(dreq);

	spin_lock(&dreq->lock);
	if (dreq->flags != NFS_ODIRECT_DONE)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	spin_unlock(&dreq->lock);
	nfs_mark_request_commit(req, NULL, cinfo, 0);
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.resched_write = nfs_direct_resched_write,
};

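/*
 * Gather all requests awaiting commit and send the COMMIT calls. If the
 * commit list could not be processed (-ENOMEM), fall back to rescheduling
 * the writes.
 */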
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_commit_begin(cinfo.mds);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) { /* res == -ENOMEM */
		spin_lock(&dreq->lock);
		if (dreq->flags == 0)
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
		spin_unlock(&dreq->lock);
	}
	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	LIST_HEAD(reqs);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_direct_truncate_request(dreq, req);
		nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}
}

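/*
 * Work item run from nfsiod once all page I/O has completed: either
 * schedule the COMMIT phase, resend the writes, or tear down any remaining
 * requests and finish the direct request.
 */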
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
	case NFS_ODIRECT_DO_COMMIT:
		nfs_direct_commit_schedule(dreq);
		break;
	case NFS_ODIRECT_RESCHED_WRITES:
		nfs_direct_write_reschedule(dreq);
		break;
	default:
		nfs_direct_write_clear_reqs(dreq);
		nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
		nfs_direct_complete(dreq);
	}
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	trace_nfs_direct_write_complete(dreq);
	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}

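/*
 * Per-header completion for direct writes: account the bytes written, grow
 * the cached file size if needed, and either queue the requests for COMMIT
 * (unstable writes), requeue them for a resend, or release them outright.
 */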
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
	struct inode *inode = dreq->inode;
	int flags = NFS_ODIRECT_DONE;

	trace_nfs_direct_write_completion(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
	    !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		if (!dreq->flags)
			dreq->flags = NFS_ODIRECT_DO_COMMIT;
		flags = dreq->flags;
	}
	spin_unlock(&dreq->lock);

	spin_lock(&inode->i_lock);
	nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count);
	spin_unlock(&inode->i_lock);

	while (!list_empty(&hdr->pages)) {

		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (flags == NFS_ODIRECT_DO_COMMIT) {
			kref_get(&req->wb_kref);
			memcpy(&req->wb_verf, &hdr->verf.verifier,
			       sizeof(req->wb_verf));
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
						hdr->ds_commit_idx);
		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

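/*
 * The transport asked for this header to be resent: flag the direct
 * request for rescheduling and move the affected requests onto the commit
 * list so nfs_direct_write_reschedule() can pick them up.
 */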
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_page *req;
	struct nfs_commit_info cinfo;

	trace_nfs_direct_write_reschedule_io(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	spin_lock(&dreq->lock);
	if (dreq->error == 0)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	set_bit(NFS_IOHDR_REDO, &hdr->flags);
	spin_unlock(&dreq->lock);
	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		nfs_unlock_request(req);
		nfs_mark_request_commit(req, NULL, &cinfo, 0);
	}
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
	.reschedule_io = nfs_direct_write_reschedule_io,
};


/*
 * NB: Return the value of the first error return code. Subsequent
 * errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes. Write length accounting is
 * handled automatically by nfs_direct_write_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	struct nfs_commit_info cinfo;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
	bool defer = false;

	trace_nfs_direct_write_schedule_iovec(dreq);

	nfs_pageio_init_write(&desc, inode, ioflags, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	inode_dio_begin(inode);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						   wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			if (desc.pg_error < 0) {
				nfs_free_request(req);
				result = desc.pg_error;
				break;
			}

			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;

			if (defer) {
				nfs_mark_request_commit(req, NULL, &cinfo, 0);
				continue;
			}

			nfs_lock_request(req);
			if (nfs_pageio_add_request(&desc, req))
				continue;

			/* Exit on hard errors */
			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}

			/* If the error is soft, defer remaining requests */
			nfs_init_cinfo_from_dreq(&cinfo, dreq);
			spin_lock(&dreq->lock);
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			nfs_unlock_request(req);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
			desc.pg_error = 0;
			defer = true;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size. The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache. We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	ssize_t result, requested;
	size_t count;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	loff_t pos, end;

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		 file, iov_iter_count(iter), (long long) iocb->ki_pos);

	if (swap)
		/* bypass generic checks */
		result = iov_iter_count(iter);
	else
		result = generic_write_checks(iocb, iter);
	if (result <= 0)
		return result;
	count = result;
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	pos = iocb->ki_pos;
	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;

	task_io_account_write(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;
	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);

	if (swap) {
		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_STABLE);
	} else {
		nfs_start_io_direct(inode);

		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_COND_STABLE);

		if (mapping->nrpages) {
			invalidate_inode_pages2_range(mapping,
						      pos >> PAGE_SHIFT, end);
		}

		nfs_end_io_direct(inode);
	}

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos = pos + result;
			/* XXX: should check the generic_write_sync retval */
			generic_write_sync(iocb, result);
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}
	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
						NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}