xref: /openbmc/linux/fs/cachefiles/io.c (revision 0a94608f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* kiocb-using read/write
3  *
4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 
8 #include <linux/mount.h>
9 #include <linux/slab.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/falloc.h>
13 #include <linux/sched/mm.h>
14 #include <trace/events/fscache.h>
15 #include "internal.h"
16 
17 struct cachefiles_kiocb {
18 	struct kiocb		iocb;
19 	refcount_t		ki_refcnt;
20 	loff_t			start;
21 	union {
22 		size_t		skipped;
23 		size_t		len;
24 	};
25 	struct cachefiles_object *object;
26 	netfs_io_terminated_t	term_func;
27 	void			*term_func_priv;
28 	bool			was_async;
29 	unsigned int		inval_counter;	/* Copy of cookie->inval_counter */
30 	u64			b_writing;
31 };
32 
33 static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
34 {
35 	if (refcount_dec_and_test(&ki->ki_refcnt)) {
36 		cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq);
37 		fput(ki->iocb.ki_filp);
38 		kfree(ki);
39 	}
40 }
41 
42 /*
43  * Handle completion of a read from the cache.
44  */
45 static void cachefiles_read_complete(struct kiocb *iocb, long ret)
46 {
47 	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
48 	struct inode *inode = file_inode(ki->iocb.ki_filp);
49 
50 	_enter("%ld", ret);
51 
52 	if (ret < 0)
53 		trace_cachefiles_io_error(ki->object, inode, ret,
54 					  cachefiles_trace_read_error);
55 
56 	if (ki->term_func) {
57 		if (ret >= 0) {
58 			if (ki->object->cookie->inval_counter == ki->inval_counter)
59 				ki->skipped += ret;
60 			else
61 				ret = -ESTALE;
62 		}
63 
64 		ki->term_func(ki->term_func_priv, ret, ki->was_async);
65 	}
66 
67 	cachefiles_put_kiocb(ki);
68 }
69 
70 /*
71  * Initiate a read from the cache.
72  */
73 static int cachefiles_read(struct netfs_cache_resources *cres,
74 			   loff_t start_pos,
75 			   struct iov_iter *iter,
76 			   enum netfs_read_from_hole read_hole,
77 			   netfs_io_terminated_t term_func,
78 			   void *term_func_priv)
79 {
80 	struct cachefiles_object *object;
81 	struct cachefiles_kiocb *ki;
82 	struct file *file;
83 	unsigned int old_nofs;
84 	ssize_t ret = -ENOBUFS;
85 	size_t len = iov_iter_count(iter), skipped = 0;
86 
87 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
88 		goto presubmission_error;
89 
90 	fscache_count_read();
91 	object = cachefiles_cres_object(cres);
92 	file = cachefiles_cres_file(cres);
93 
94 	_enter("%pD,%li,%llx,%zx/%llx",
95 	       file, file_inode(file)->i_ino, start_pos, len,
96 	       i_size_read(file_inode(file)));
97 
98 	/* If the caller asked us to seek for data before doing the read, then
99 	 * we should do that now.  If we find a gap, we fill it with zeros.
100 	 */
101 	if (read_hole != NETFS_READ_HOLE_IGNORE) {
102 		loff_t off = start_pos, off2;
103 
104 		off2 = cachefiles_inject_read_error();
105 		if (off2 == 0)
106 			off2 = vfs_llseek(file, off, SEEK_DATA);
107 		if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
108 			skipped = 0;
109 			ret = off2;
110 			goto presubmission_error;
111 		}
112 
113 		if (off2 == -ENXIO || off2 >= start_pos + len) {
114 			/* The region is beyond the EOF or there's no more data
115 			 * in the region, so clear the rest of the buffer and
116 			 * return success.
117 			 */
118 			ret = -ENODATA;
119 			if (read_hole == NETFS_READ_HOLE_FAIL)
120 				goto presubmission_error;
121 
122 			iov_iter_zero(len, iter);
123 			skipped = len;
124 			ret = 0;
125 			goto presubmission_error;
126 		}
127 
128 		skipped = off2 - off;
129 		iov_iter_zero(skipped, iter);
130 	}
131 
132 	ret = -ENOMEM;
133 	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
134 	if (!ki)
135 		goto presubmission_error;
136 
137 	refcount_set(&ki->ki_refcnt, 2);
138 	ki->iocb.ki_filp	= file;
139 	ki->iocb.ki_pos		= start_pos + skipped;
140 	ki->iocb.ki_flags	= IOCB_DIRECT;
141 	ki->iocb.ki_ioprio	= get_current_ioprio();
142 	ki->skipped		= skipped;
143 	ki->object		= object;
144 	ki->inval_counter	= cres->inval_counter;
145 	ki->term_func		= term_func;
146 	ki->term_func_priv	= term_func_priv;
147 	ki->was_async		= true;
148 
149 	if (ki->term_func)
150 		ki->iocb.ki_complete = cachefiles_read_complete;
151 
152 	get_file(ki->iocb.ki_filp);
153 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
154 
155 	trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped);
156 	old_nofs = memalloc_nofs_save();
157 	ret = cachefiles_inject_read_error();
158 	if (ret == 0)
159 		ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
160 	memalloc_nofs_restore(old_nofs);
161 	switch (ret) {
162 	case -EIOCBQUEUED:
163 		goto in_progress;
164 
165 	case -ERESTARTSYS:
166 	case -ERESTARTNOINTR:
167 	case -ERESTARTNOHAND:
168 	case -ERESTART_RESTARTBLOCK:
169 		/* There's no easy way to restart the syscall since other AIO's
170 		 * may be already running. Just fail this IO with EINTR.
171 		 */
172 		ret = -EINTR;
173 		fallthrough;
174 	default:
175 		ki->was_async = false;
176 		cachefiles_read_complete(&ki->iocb, ret);
177 		if (ret > 0)
178 			ret = 0;
179 		break;
180 	}
181 
182 in_progress:
183 	cachefiles_put_kiocb(ki);
184 	_leave(" = %zd", ret);
185 	return ret;
186 
187 presubmission_error:
188 	if (term_func)
189 		term_func(term_func_priv, ret < 0 ? ret : skipped, false);
190 	return ret;
191 }
192 
193 /*
194  * Query the occupancy of the cache in a region, returning where the next chunk
195  * of data starts and how long it is.
196  */
197 static int cachefiles_query_occupancy(struct netfs_cache_resources *cres,
198 				      loff_t start, size_t len, size_t granularity,
199 				      loff_t *_data_start, size_t *_data_len)
200 {
201 	struct cachefiles_object *object;
202 	struct file *file;
203 	loff_t off, off2;
204 
205 	*_data_start = -1;
206 	*_data_len = 0;
207 
208 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
209 		return -ENOBUFS;
210 
211 	object = cachefiles_cres_object(cres);
212 	file = cachefiles_cres_file(cres);
213 	granularity = max_t(size_t, object->volume->cache->bsize, granularity);
214 
215 	_enter("%pD,%li,%llx,%zx/%llx",
216 	       file, file_inode(file)->i_ino, start, len,
217 	       i_size_read(file_inode(file)));
218 
219 	off = cachefiles_inject_read_error();
220 	if (off == 0)
221 		off = vfs_llseek(file, start, SEEK_DATA);
222 	if (off == -ENXIO)
223 		return -ENODATA; /* Beyond EOF */
224 	if (off < 0 && off >= (loff_t)-MAX_ERRNO)
225 		return -ENOBUFS; /* Error. */
226 	if (round_up(off, granularity) >= start + len)
227 		return -ENODATA; /* No data in range */
228 
229 	off2 = cachefiles_inject_read_error();
230 	if (off2 == 0)
231 		off2 = vfs_llseek(file, off, SEEK_HOLE);
232 	if (off2 == -ENXIO)
233 		return -ENODATA; /* Beyond EOF */
234 	if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO)
235 		return -ENOBUFS; /* Error. */
236 
237 	/* Round away partial blocks */
238 	off = round_up(off, granularity);
239 	off2 = round_down(off2, granularity);
240 	if (off2 <= off)
241 		return -ENODATA;
242 
243 	*_data_start = off;
244 	if (off2 > start + len)
245 		*_data_len = len;
246 	else
247 		*_data_len = off2 - off;
248 	return 0;
249 }
250 
251 /*
252  * Handle completion of a write to the cache.
253  */
254 static void cachefiles_write_complete(struct kiocb *iocb, long ret)
255 {
256 	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
257 	struct cachefiles_object *object = ki->object;
258 	struct inode *inode = file_inode(ki->iocb.ki_filp);
259 
260 	_enter("%ld", ret);
261 
262 	/* Tell lockdep we inherited freeze protection from submission thread */
263 	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
264 	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
265 
266 	if (ret < 0)
267 		trace_cachefiles_io_error(object, inode, ret,
268 					  cachefiles_trace_write_error);
269 
270 	atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
271 	set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
272 	if (ki->term_func)
273 		ki->term_func(ki->term_func_priv, ret, ki->was_async);
274 	cachefiles_put_kiocb(ki);
275 }
276 
277 /*
278  * Initiate a write to the cache.
279  */
280 static int cachefiles_write(struct netfs_cache_resources *cres,
281 			    loff_t start_pos,
282 			    struct iov_iter *iter,
283 			    netfs_io_terminated_t term_func,
284 			    void *term_func_priv)
285 {
286 	struct cachefiles_object *object;
287 	struct cachefiles_cache *cache;
288 	struct cachefiles_kiocb *ki;
289 	struct inode *inode;
290 	struct file *file;
291 	unsigned int old_nofs;
292 	ssize_t ret = -ENOBUFS;
293 	size_t len = iov_iter_count(iter);
294 
295 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
296 		goto presubmission_error;
297 	fscache_count_write();
298 	object = cachefiles_cres_object(cres);
299 	cache = object->volume->cache;
300 	file = cachefiles_cres_file(cres);
301 
302 	_enter("%pD,%li,%llx,%zx/%llx",
303 	       file, file_inode(file)->i_ino, start_pos, len,
304 	       i_size_read(file_inode(file)));
305 
306 	ret = -ENOMEM;
307 	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
308 	if (!ki)
309 		goto presubmission_error;
310 
311 	refcount_set(&ki->ki_refcnt, 2);
312 	ki->iocb.ki_filp	= file;
313 	ki->iocb.ki_pos		= start_pos;
314 	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE;
315 	ki->iocb.ki_ioprio	= get_current_ioprio();
316 	ki->object		= object;
317 	ki->inval_counter	= cres->inval_counter;
318 	ki->start		= start_pos;
319 	ki->len			= len;
320 	ki->term_func		= term_func;
321 	ki->term_func_priv	= term_func_priv;
322 	ki->was_async		= true;
323 	ki->b_writing		= (len + (1 << cache->bshift) - 1) >> cache->bshift;
324 
325 	if (ki->term_func)
326 		ki->iocb.ki_complete = cachefiles_write_complete;
327 	atomic_long_add(ki->b_writing, &cache->b_writing);
328 
329 	/* Open-code file_start_write here to grab freeze protection, which
330 	 * will be released by another thread in aio_complete_rw().  Fool
331 	 * lockdep by telling it the lock got released so that it doesn't
332 	 * complain about the held lock when we return to userspace.
333 	 */
334 	inode = file_inode(file);
335 	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
336 	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
337 
338 	get_file(ki->iocb.ki_filp);
339 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
340 
341 	trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
342 	old_nofs = memalloc_nofs_save();
343 	ret = cachefiles_inject_write_error();
344 	if (ret == 0)
345 		ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
346 	memalloc_nofs_restore(old_nofs);
347 	switch (ret) {
348 	case -EIOCBQUEUED:
349 		goto in_progress;
350 
351 	case -ERESTARTSYS:
352 	case -ERESTARTNOINTR:
353 	case -ERESTARTNOHAND:
354 	case -ERESTART_RESTARTBLOCK:
355 		/* There's no easy way to restart the syscall since other AIO's
356 		 * may be already running. Just fail this IO with EINTR.
357 		 */
358 		ret = -EINTR;
359 		fallthrough;
360 	default:
361 		ki->was_async = false;
362 		cachefiles_write_complete(&ki->iocb, ret);
363 		if (ret > 0)
364 			ret = 0;
365 		break;
366 	}
367 
368 in_progress:
369 	cachefiles_put_kiocb(ki);
370 	_leave(" = %zd", ret);
371 	return ret;
372 
373 presubmission_error:
374 	if (term_func)
375 		term_func(term_func_priv, ret, false);
376 	return ret;
377 }
378 
379 /*
380  * Prepare a read operation, shortening it to a cached/uncached
381  * boundary as appropriate.
382  */
383 static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
384 						      loff_t i_size)
385 {
386 	enum cachefiles_prepare_read_trace why;
387 	struct netfs_io_request *rreq = subreq->rreq;
388 	struct netfs_cache_resources *cres = &rreq->cache_resources;
389 	struct cachefiles_object *object;
390 	struct cachefiles_cache *cache;
391 	struct fscache_cookie *cookie = fscache_cres_cookie(cres);
392 	const struct cred *saved_cred;
393 	struct file *file = cachefiles_cres_file(cres);
394 	enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
395 	loff_t off, to;
396 	ino_t ino = file ? file_inode(file)->i_ino : 0;
397 
398 	_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
399 
400 	if (subreq->start >= i_size) {
401 		ret = NETFS_FILL_WITH_ZEROES;
402 		why = cachefiles_trace_read_after_eof;
403 		goto out_no_object;
404 	}
405 
406 	if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
407 		__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
408 		why = cachefiles_trace_read_no_data;
409 		goto out_no_object;
410 	}
411 
412 	/* The object and the file may be being created in the background. */
413 	if (!file) {
414 		why = cachefiles_trace_read_no_file;
415 		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
416 			goto out_no_object;
417 		file = cachefiles_cres_file(cres);
418 		if (!file)
419 			goto out_no_object;
420 		ino = file_inode(file)->i_ino;
421 	}
422 
423 	object = cachefiles_cres_object(cres);
424 	cache = object->volume->cache;
425 	cachefiles_begin_secure(cache, &saved_cred);
426 
427 	off = cachefiles_inject_read_error();
428 	if (off == 0)
429 		off = vfs_llseek(file, subreq->start, SEEK_DATA);
430 	if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
431 		if (off == (loff_t)-ENXIO) {
432 			why = cachefiles_trace_read_seek_nxio;
433 			goto download_and_store;
434 		}
435 		trace_cachefiles_io_error(object, file_inode(file), off,
436 					  cachefiles_trace_seek_error);
437 		why = cachefiles_trace_read_seek_error;
438 		goto out;
439 	}
440 
441 	if (off >= subreq->start + subreq->len) {
442 		why = cachefiles_trace_read_found_hole;
443 		goto download_and_store;
444 	}
445 
446 	if (off > subreq->start) {
447 		off = round_up(off, cache->bsize);
448 		subreq->len = off - subreq->start;
449 		why = cachefiles_trace_read_found_part;
450 		goto download_and_store;
451 	}
452 
453 	to = cachefiles_inject_read_error();
454 	if (to == 0)
455 		to = vfs_llseek(file, subreq->start, SEEK_HOLE);
456 	if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
457 		trace_cachefiles_io_error(object, file_inode(file), to,
458 					  cachefiles_trace_seek_error);
459 		why = cachefiles_trace_read_seek_error;
460 		goto out;
461 	}
462 
463 	if (to < subreq->start + subreq->len) {
464 		if (subreq->start + subreq->len >= i_size)
465 			to = round_up(to, cache->bsize);
466 		else
467 			to = round_down(to, cache->bsize);
468 		subreq->len = to - subreq->start;
469 	}
470 
471 	why = cachefiles_trace_read_have_data;
472 	ret = NETFS_READ_FROM_CACHE;
473 	goto out;
474 
475 download_and_store:
476 	__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
477 out:
478 	cachefiles_end_secure(cache, saved_cred);
479 out_no_object:
480 	trace_cachefiles_prep_read(subreq, ret, why, ino);
481 	return ret;
482 }
483 
484 /*
485  * Prepare for a write to occur.
486  */
487 static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
488 				      loff_t *_start, size_t *_len, loff_t i_size,
489 				      bool no_space_allocated_yet)
490 {
491 	struct cachefiles_object *object = cachefiles_cres_object(cres);
492 	struct cachefiles_cache *cache = object->volume->cache;
493 	struct file *file = cachefiles_cres_file(cres);
494 	loff_t start = *_start, pos;
495 	size_t len = *_len, down;
496 	int ret;
497 
498 	/* Round to DIO size */
499 	down = start - round_down(start, PAGE_SIZE);
500 	*_start = start - down;
501 	*_len = round_up(down + len, PAGE_SIZE);
502 
503 	/* We need to work out whether there's sufficient disk space to perform
504 	 * the write - but we can skip that check if we have space already
505 	 * allocated.
506 	 */
507 	if (no_space_allocated_yet)
508 		goto check_space;
509 
510 	pos = cachefiles_inject_read_error();
511 	if (pos == 0)
512 		pos = vfs_llseek(file, *_start, SEEK_DATA);
513 	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
514 		if (pos == -ENXIO)
515 			goto check_space; /* Unallocated tail */
516 		trace_cachefiles_io_error(object, file_inode(file), pos,
517 					  cachefiles_trace_seek_error);
518 		return pos;
519 	}
520 	if ((u64)pos >= (u64)*_start + *_len)
521 		goto check_space; /* Unallocated region */
522 
523 	/* We have a block that's at least partially filled - if we're low on
524 	 * space, we need to see if it's fully allocated.  If it's not, we may
525 	 * want to cull it.
526 	 */
527 	if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
528 				 cachefiles_has_space_check) == 0)
529 		return 0; /* Enough space to simply overwrite the whole block */
530 
531 	pos = cachefiles_inject_read_error();
532 	if (pos == 0)
533 		pos = vfs_llseek(file, *_start, SEEK_HOLE);
534 	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
535 		trace_cachefiles_io_error(object, file_inode(file), pos,
536 					  cachefiles_trace_seek_error);
537 		return pos;
538 	}
539 	if ((u64)pos >= (u64)*_start + *_len)
540 		return 0; /* Fully allocated */
541 
542 	/* Partially allocated, but insufficient space: cull. */
543 	fscache_count_no_write_space();
544 	ret = cachefiles_inject_remove_error();
545 	if (ret == 0)
546 		ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
547 				    *_start, *_len);
548 	if (ret < 0) {
549 		trace_cachefiles_io_error(object, file_inode(file), ret,
550 					  cachefiles_trace_fallocate_error);
551 		cachefiles_io_error_obj(object,
552 					"CacheFiles: fallocate failed (%d)\n", ret);
553 		ret = -EIO;
554 	}
555 
556 	return ret;
557 
558 check_space:
559 	return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
560 				    cachefiles_has_space_for_write);
561 }
562 
563 static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
564 				    loff_t *_start, size_t *_len, loff_t i_size,
565 				    bool no_space_allocated_yet)
566 {
567 	struct cachefiles_object *object = cachefiles_cres_object(cres);
568 	struct cachefiles_cache *cache = object->volume->cache;
569 	const struct cred *saved_cred;
570 	int ret;
571 
572 	if (!cachefiles_cres_file(cres)) {
573 		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
574 			return -ENOBUFS;
575 		if (!cachefiles_cres_file(cres))
576 			return -ENOBUFS;
577 	}
578 
579 	cachefiles_begin_secure(cache, &saved_cred);
580 	ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
581 					 no_space_allocated_yet);
582 	cachefiles_end_secure(cache, saved_cred);
583 	return ret;
584 }
585 
586 /*
587  * Clean up an operation.
588  */
589 static void cachefiles_end_operation(struct netfs_cache_resources *cres)
590 {
591 	struct file *file = cachefiles_cres_file(cres);
592 
593 	if (file)
594 		fput(file);
595 	fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end);
596 }
597 
598 static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
599 	.end_operation		= cachefiles_end_operation,
600 	.read			= cachefiles_read,
601 	.write			= cachefiles_write,
602 	.prepare_read		= cachefiles_prepare_read,
603 	.prepare_write		= cachefiles_prepare_write,
604 	.query_occupancy	= cachefiles_query_occupancy,
605 };
606 
607 /*
608  * Open the cache file when beginning a cache operation.
609  */
610 bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
611 				enum fscache_want_state want_state)
612 {
613 	struct cachefiles_object *object = cachefiles_cres_object(cres);
614 
615 	if (!cachefiles_cres_file(cres)) {
616 		cres->ops = &cachefiles_netfs_cache_ops;
617 		if (object->file) {
618 			spin_lock(&object->lock);
619 			if (!cres->cache_priv2 && object->file)
620 				cres->cache_priv2 = get_file(object->file);
621 			spin_unlock(&object->lock);
622 		}
623 	}
624 
625 	if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) {
626 		pr_err("failed to get cres->file\n");
627 		return false;
628 	}
629 
630 	return true;
631 }
632