xref: /openbmc/linux/fs/cachefiles/io.c (revision d9565bf4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* kiocb-using read/write
3  *
4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 
8 #include <linux/mount.h>
9 #include <linux/slab.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/falloc.h>
13 #include <linux/sched/mm.h>
14 #include <trace/events/fscache.h>
15 #include "internal.h"
16 
17 struct cachefiles_kiocb {
18 	struct kiocb		iocb;
19 	refcount_t		ki_refcnt;
20 	loff_t			start;
21 	union {
22 		size_t		skipped;
23 		size_t		len;
24 	};
25 	struct cachefiles_object *object;
26 	netfs_io_terminated_t	term_func;
27 	void			*term_func_priv;
28 	bool			was_async;
29 	unsigned int		inval_counter;	/* Copy of cookie->inval_counter */
30 	u64			b_writing;
31 };
32 
33 static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
34 {
35 	if (refcount_dec_and_test(&ki->ki_refcnt)) {
36 		cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq);
37 		fput(ki->iocb.ki_filp);
38 		kfree(ki);
39 	}
40 }
41 
42 /*
43  * Handle completion of a read from the cache.
44  */
45 static void cachefiles_read_complete(struct kiocb *iocb, long ret)
46 {
47 	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
48 	struct inode *inode = file_inode(ki->iocb.ki_filp);
49 
50 	_enter("%ld", ret);
51 
52 	if (ret < 0)
53 		trace_cachefiles_io_error(ki->object, inode, ret,
54 					  cachefiles_trace_read_error);
55 
56 	if (ki->term_func) {
57 		if (ret >= 0) {
58 			if (ki->object->cookie->inval_counter == ki->inval_counter)
59 				ki->skipped += ret;
60 			else
61 				ret = -ESTALE;
62 		}
63 
64 		ki->term_func(ki->term_func_priv, ret, ki->was_async);
65 	}
66 
67 	cachefiles_put_kiocb(ki);
68 }
69 
70 /*
71  * Initiate a read from the cache.
72  */
73 static int cachefiles_read(struct netfs_cache_resources *cres,
74 			   loff_t start_pos,
75 			   struct iov_iter *iter,
76 			   enum netfs_read_from_hole read_hole,
77 			   netfs_io_terminated_t term_func,
78 			   void *term_func_priv)
79 {
80 	struct cachefiles_object *object;
81 	struct cachefiles_kiocb *ki;
82 	struct file *file;
83 	unsigned int old_nofs;
84 	ssize_t ret = -ENOBUFS;
85 	size_t len = iov_iter_count(iter), skipped = 0;
86 
87 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
88 		goto presubmission_error;
89 
90 	fscache_count_read();
91 	object = cachefiles_cres_object(cres);
92 	file = cachefiles_cres_file(cres);
93 
94 	_enter("%pD,%li,%llx,%zx/%llx",
95 	       file, file_inode(file)->i_ino, start_pos, len,
96 	       i_size_read(file_inode(file)));
97 
98 	/* If the caller asked us to seek for data before doing the read, then
99 	 * we should do that now.  If we find a gap, we fill it with zeros.
100 	 */
101 	if (read_hole != NETFS_READ_HOLE_IGNORE) {
102 		loff_t off = start_pos, off2;
103 
104 		off2 = cachefiles_inject_read_error();
105 		if (off2 == 0)
106 			off2 = vfs_llseek(file, off, SEEK_DATA);
107 		if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
108 			skipped = 0;
109 			ret = off2;
110 			goto presubmission_error;
111 		}
112 
113 		if (off2 == -ENXIO || off2 >= start_pos + len) {
114 			/* The region is beyond the EOF or there's no more data
115 			 * in the region, so clear the rest of the buffer and
116 			 * return success.
117 			 */
118 			ret = -ENODATA;
119 			if (read_hole == NETFS_READ_HOLE_FAIL)
120 				goto presubmission_error;
121 
122 			iov_iter_zero(len, iter);
123 			skipped = len;
124 			ret = 0;
125 			goto presubmission_error;
126 		}
127 
128 		skipped = off2 - off;
129 		iov_iter_zero(skipped, iter);
130 	}
131 
132 	ret = -ENOMEM;
133 	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
134 	if (!ki)
135 		goto presubmission_error;
136 
137 	refcount_set(&ki->ki_refcnt, 2);
138 	ki->iocb.ki_filp	= file;
139 	ki->iocb.ki_pos		= start_pos + skipped;
140 	ki->iocb.ki_flags	= IOCB_DIRECT;
141 	ki->iocb.ki_hint	= ki_hint_validate(file_write_hint(file));
142 	ki->iocb.ki_ioprio	= get_current_ioprio();
143 	ki->skipped		= skipped;
144 	ki->object		= object;
145 	ki->inval_counter	= cres->inval_counter;
146 	ki->term_func		= term_func;
147 	ki->term_func_priv	= term_func_priv;
148 	ki->was_async		= true;
149 
150 	if (ki->term_func)
151 		ki->iocb.ki_complete = cachefiles_read_complete;
152 
153 	get_file(ki->iocb.ki_filp);
154 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
155 
156 	trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped);
157 	old_nofs = memalloc_nofs_save();
158 	ret = cachefiles_inject_read_error();
159 	if (ret == 0)
160 		ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
161 	memalloc_nofs_restore(old_nofs);
162 	switch (ret) {
163 	case -EIOCBQUEUED:
164 		goto in_progress;
165 
166 	case -ERESTARTSYS:
167 	case -ERESTARTNOINTR:
168 	case -ERESTARTNOHAND:
169 	case -ERESTART_RESTARTBLOCK:
170 		/* There's no easy way to restart the syscall since other AIO's
171 		 * may be already running. Just fail this IO with EINTR.
172 		 */
173 		ret = -EINTR;
174 		fallthrough;
175 	default:
176 		ki->was_async = false;
177 		cachefiles_read_complete(&ki->iocb, ret);
178 		if (ret > 0)
179 			ret = 0;
180 		break;
181 	}
182 
183 in_progress:
184 	cachefiles_put_kiocb(ki);
185 	_leave(" = %zd", ret);
186 	return ret;
187 
188 presubmission_error:
189 	if (term_func)
190 		term_func(term_func_priv, ret < 0 ? ret : skipped, false);
191 	return ret;
192 }
193 
194 /*
195  * Query the occupancy of the cache in a region, returning where the next chunk
196  * of data starts and how long it is.
197  */
198 static int cachefiles_query_occupancy(struct netfs_cache_resources *cres,
199 				      loff_t start, size_t len, size_t granularity,
200 				      loff_t *_data_start, size_t *_data_len)
201 {
202 	struct cachefiles_object *object;
203 	struct file *file;
204 	loff_t off, off2;
205 
206 	*_data_start = -1;
207 	*_data_len = 0;
208 
209 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
210 		return -ENOBUFS;
211 
212 	object = cachefiles_cres_object(cres);
213 	file = cachefiles_cres_file(cres);
214 	granularity = max_t(size_t, object->volume->cache->bsize, granularity);
215 
216 	_enter("%pD,%li,%llx,%zx/%llx",
217 	       file, file_inode(file)->i_ino, start, len,
218 	       i_size_read(file_inode(file)));
219 
220 	off = cachefiles_inject_read_error();
221 	if (off == 0)
222 		off = vfs_llseek(file, start, SEEK_DATA);
223 	if (off == -ENXIO)
224 		return -ENODATA; /* Beyond EOF */
225 	if (off < 0 && off >= (loff_t)-MAX_ERRNO)
226 		return -ENOBUFS; /* Error. */
227 	if (round_up(off, granularity) >= start + len)
228 		return -ENODATA; /* No data in range */
229 
230 	off2 = cachefiles_inject_read_error();
231 	if (off2 == 0)
232 		off2 = vfs_llseek(file, off, SEEK_HOLE);
233 	if (off2 == -ENXIO)
234 		return -ENODATA; /* Beyond EOF */
235 	if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO)
236 		return -ENOBUFS; /* Error. */
237 
238 	/* Round away partial blocks */
239 	off = round_up(off, granularity);
240 	off2 = round_down(off2, granularity);
241 	if (off2 <= off)
242 		return -ENODATA;
243 
244 	*_data_start = off;
245 	if (off2 > start + len)
246 		*_data_len = len;
247 	else
248 		*_data_len = off2 - off;
249 	return 0;
250 }
251 
252 /*
253  * Handle completion of a write to the cache.
254  */
255 static void cachefiles_write_complete(struct kiocb *iocb, long ret)
256 {
257 	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
258 	struct cachefiles_object *object = ki->object;
259 	struct inode *inode = file_inode(ki->iocb.ki_filp);
260 
261 	_enter("%ld", ret);
262 
263 	/* Tell lockdep we inherited freeze protection from submission thread */
264 	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
265 	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
266 
267 	if (ret < 0)
268 		trace_cachefiles_io_error(object, inode, ret,
269 					  cachefiles_trace_write_error);
270 
271 	atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
272 	set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
273 	if (ki->term_func)
274 		ki->term_func(ki->term_func_priv, ret, ki->was_async);
275 	cachefiles_put_kiocb(ki);
276 }
277 
278 /*
279  * Initiate a write to the cache.
280  */
281 static int cachefiles_write(struct netfs_cache_resources *cres,
282 			    loff_t start_pos,
283 			    struct iov_iter *iter,
284 			    netfs_io_terminated_t term_func,
285 			    void *term_func_priv)
286 {
287 	struct cachefiles_object *object;
288 	struct cachefiles_cache *cache;
289 	struct cachefiles_kiocb *ki;
290 	struct inode *inode;
291 	struct file *file;
292 	unsigned int old_nofs;
293 	ssize_t ret = -ENOBUFS;
294 	size_t len = iov_iter_count(iter);
295 
296 	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
297 		goto presubmission_error;
298 	fscache_count_write();
299 	object = cachefiles_cres_object(cres);
300 	cache = object->volume->cache;
301 	file = cachefiles_cres_file(cres);
302 
303 	_enter("%pD,%li,%llx,%zx/%llx",
304 	       file, file_inode(file)->i_ino, start_pos, len,
305 	       i_size_read(file_inode(file)));
306 
307 	ret = -ENOMEM;
308 	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
309 	if (!ki)
310 		goto presubmission_error;
311 
312 	refcount_set(&ki->ki_refcnt, 2);
313 	ki->iocb.ki_filp	= file;
314 	ki->iocb.ki_pos		= start_pos;
315 	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE;
316 	ki->iocb.ki_hint	= ki_hint_validate(file_write_hint(file));
317 	ki->iocb.ki_ioprio	= get_current_ioprio();
318 	ki->object		= object;
319 	ki->inval_counter	= cres->inval_counter;
320 	ki->start		= start_pos;
321 	ki->len			= len;
322 	ki->term_func		= term_func;
323 	ki->term_func_priv	= term_func_priv;
324 	ki->was_async		= true;
325 	ki->b_writing		= (len + (1 << cache->bshift) - 1) >> cache->bshift;
326 
327 	if (ki->term_func)
328 		ki->iocb.ki_complete = cachefiles_write_complete;
329 	atomic_long_add(ki->b_writing, &cache->b_writing);
330 
331 	/* Open-code file_start_write here to grab freeze protection, which
332 	 * will be released by another thread in aio_complete_rw().  Fool
333 	 * lockdep by telling it the lock got released so that it doesn't
334 	 * complain about the held lock when we return to userspace.
335 	 */
336 	inode = file_inode(file);
337 	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
338 	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
339 
340 	get_file(ki->iocb.ki_filp);
341 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
342 
343 	trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
344 	old_nofs = memalloc_nofs_save();
345 	ret = cachefiles_inject_write_error();
346 	if (ret == 0)
347 		ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
348 	memalloc_nofs_restore(old_nofs);
349 	switch (ret) {
350 	case -EIOCBQUEUED:
351 		goto in_progress;
352 
353 	case -ERESTARTSYS:
354 	case -ERESTARTNOINTR:
355 	case -ERESTARTNOHAND:
356 	case -ERESTART_RESTARTBLOCK:
357 		/* There's no easy way to restart the syscall since other AIO's
358 		 * may be already running. Just fail this IO with EINTR.
359 		 */
360 		ret = -EINTR;
361 		fallthrough;
362 	default:
363 		ki->was_async = false;
364 		cachefiles_write_complete(&ki->iocb, ret);
365 		if (ret > 0)
366 			ret = 0;
367 		break;
368 	}
369 
370 in_progress:
371 	cachefiles_put_kiocb(ki);
372 	_leave(" = %zd", ret);
373 	return ret;
374 
375 presubmission_error:
376 	if (term_func)
377 		term_func(term_func_priv, ret, false);
378 	return ret;
379 }
380 
381 /*
382  * Prepare a read operation, shortening it to a cached/uncached
383  * boundary as appropriate.
384  */
385 static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq,
386 						      loff_t i_size)
387 {
388 	enum cachefiles_prepare_read_trace why;
389 	struct netfs_read_request *rreq = subreq->rreq;
390 	struct netfs_cache_resources *cres = &rreq->cache_resources;
391 	struct cachefiles_object *object;
392 	struct cachefiles_cache *cache;
393 	struct fscache_cookie *cookie = fscache_cres_cookie(cres);
394 	const struct cred *saved_cred;
395 	struct file *file = cachefiles_cres_file(cres);
396 	enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER;
397 	loff_t off, to;
398 	ino_t ino = file ? file_inode(file)->i_ino : 0;
399 
400 	_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
401 
402 	if (subreq->start >= i_size) {
403 		ret = NETFS_FILL_WITH_ZEROES;
404 		why = cachefiles_trace_read_after_eof;
405 		goto out_no_object;
406 	}
407 
408 	if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
409 		__set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
410 		why = cachefiles_trace_read_no_data;
411 		goto out_no_object;
412 	}
413 
414 	/* The object and the file may be being created in the background. */
415 	if (!file) {
416 		why = cachefiles_trace_read_no_file;
417 		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
418 			goto out_no_object;
419 		file = cachefiles_cres_file(cres);
420 		if (!file)
421 			goto out_no_object;
422 		ino = file_inode(file)->i_ino;
423 	}
424 
425 	object = cachefiles_cres_object(cres);
426 	cache = object->volume->cache;
427 	cachefiles_begin_secure(cache, &saved_cred);
428 
429 	off = cachefiles_inject_read_error();
430 	if (off == 0)
431 		off = vfs_llseek(file, subreq->start, SEEK_DATA);
432 	if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
433 		if (off == (loff_t)-ENXIO) {
434 			why = cachefiles_trace_read_seek_nxio;
435 			goto download_and_store;
436 		}
437 		trace_cachefiles_io_error(object, file_inode(file), off,
438 					  cachefiles_trace_seek_error);
439 		why = cachefiles_trace_read_seek_error;
440 		goto out;
441 	}
442 
443 	if (off >= subreq->start + subreq->len) {
444 		why = cachefiles_trace_read_found_hole;
445 		goto download_and_store;
446 	}
447 
448 	if (off > subreq->start) {
449 		off = round_up(off, cache->bsize);
450 		subreq->len = off - subreq->start;
451 		why = cachefiles_trace_read_found_part;
452 		goto download_and_store;
453 	}
454 
455 	to = cachefiles_inject_read_error();
456 	if (to == 0)
457 		to = vfs_llseek(file, subreq->start, SEEK_HOLE);
458 	if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
459 		trace_cachefiles_io_error(object, file_inode(file), to,
460 					  cachefiles_trace_seek_error);
461 		why = cachefiles_trace_read_seek_error;
462 		goto out;
463 	}
464 
465 	if (to < subreq->start + subreq->len) {
466 		if (subreq->start + subreq->len >= i_size)
467 			to = round_up(to, cache->bsize);
468 		else
469 			to = round_down(to, cache->bsize);
470 		subreq->len = to - subreq->start;
471 	}
472 
473 	why = cachefiles_trace_read_have_data;
474 	ret = NETFS_READ_FROM_CACHE;
475 	goto out;
476 
477 download_and_store:
478 	__set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
479 out:
480 	cachefiles_end_secure(cache, saved_cred);
481 out_no_object:
482 	trace_cachefiles_prep_read(subreq, ret, why, ino);
483 	return ret;
484 }
485 
486 /*
487  * Prepare for a write to occur.
488  */
489 static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
490 				      loff_t *_start, size_t *_len, loff_t i_size,
491 				      bool no_space_allocated_yet)
492 {
493 	struct cachefiles_object *object = cachefiles_cres_object(cres);
494 	struct cachefiles_cache *cache = object->volume->cache;
495 	struct file *file = cachefiles_cres_file(cres);
496 	loff_t start = *_start, pos;
497 	size_t len = *_len, down;
498 	int ret;
499 
500 	/* Round to DIO size */
501 	down = start - round_down(start, PAGE_SIZE);
502 	*_start = start - down;
503 	*_len = round_up(down + len, PAGE_SIZE);
504 
505 	/* We need to work out whether there's sufficient disk space to perform
506 	 * the write - but we can skip that check if we have space already
507 	 * allocated.
508 	 */
509 	if (no_space_allocated_yet)
510 		goto check_space;
511 
512 	pos = cachefiles_inject_read_error();
513 	if (pos == 0)
514 		pos = vfs_llseek(file, *_start, SEEK_DATA);
515 	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
516 		if (pos == -ENXIO)
517 			goto check_space; /* Unallocated tail */
518 		trace_cachefiles_io_error(object, file_inode(file), pos,
519 					  cachefiles_trace_seek_error);
520 		return pos;
521 	}
522 	if ((u64)pos >= (u64)*_start + *_len)
523 		goto check_space; /* Unallocated region */
524 
525 	/* We have a block that's at least partially filled - if we're low on
526 	 * space, we need to see if it's fully allocated.  If it's not, we may
527 	 * want to cull it.
528 	 */
529 	if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
530 				 cachefiles_has_space_check) == 0)
531 		return 0; /* Enough space to simply overwrite the whole block */
532 
533 	pos = cachefiles_inject_read_error();
534 	if (pos == 0)
535 		pos = vfs_llseek(file, *_start, SEEK_HOLE);
536 	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
537 		trace_cachefiles_io_error(object, file_inode(file), pos,
538 					  cachefiles_trace_seek_error);
539 		return pos;
540 	}
541 	if ((u64)pos >= (u64)*_start + *_len)
542 		return 0; /* Fully allocated */
543 
544 	/* Partially allocated, but insufficient space: cull. */
545 	fscache_count_no_write_space();
546 	ret = cachefiles_inject_remove_error();
547 	if (ret == 0)
548 		ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
549 				    *_start, *_len);
550 	if (ret < 0) {
551 		trace_cachefiles_io_error(object, file_inode(file), ret,
552 					  cachefiles_trace_fallocate_error);
553 		cachefiles_io_error_obj(object,
554 					"CacheFiles: fallocate failed (%d)\n", ret);
555 		ret = -EIO;
556 	}
557 
558 	return ret;
559 
560 check_space:
561 	return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
562 				    cachefiles_has_space_for_write);
563 }
564 
565 static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
566 				    loff_t *_start, size_t *_len, loff_t i_size,
567 				    bool no_space_allocated_yet)
568 {
569 	struct cachefiles_object *object = cachefiles_cres_object(cres);
570 	struct cachefiles_cache *cache = object->volume->cache;
571 	const struct cred *saved_cred;
572 	int ret;
573 
574 	if (!cachefiles_cres_file(cres)) {
575 		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
576 			return -ENOBUFS;
577 		if (!cachefiles_cres_file(cres))
578 			return -ENOBUFS;
579 	}
580 
581 	cachefiles_begin_secure(cache, &saved_cred);
582 	ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
583 					 no_space_allocated_yet);
584 	cachefiles_end_secure(cache, saved_cred);
585 	return ret;
586 }
587 
588 /*
589  * Clean up an operation.
590  */
591 static void cachefiles_end_operation(struct netfs_cache_resources *cres)
592 {
593 	struct file *file = cachefiles_cres_file(cres);
594 
595 	if (file)
596 		fput(file);
597 	fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end);
598 }
599 
600 static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
601 	.end_operation		= cachefiles_end_operation,
602 	.read			= cachefiles_read,
603 	.write			= cachefiles_write,
604 	.prepare_read		= cachefiles_prepare_read,
605 	.prepare_write		= cachefiles_prepare_write,
606 	.query_occupancy	= cachefiles_query_occupancy,
607 };
608 
609 /*
610  * Open the cache file when beginning a cache operation.
611  */
612 bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
613 				enum fscache_want_state want_state)
614 {
615 	struct cachefiles_object *object = cachefiles_cres_object(cres);
616 
617 	if (!cachefiles_cres_file(cres)) {
618 		cres->ops = &cachefiles_netfs_cache_ops;
619 		if (object->file) {
620 			spin_lock(&object->lock);
621 			if (!cres->cache_priv2 && object->file)
622 				cres->cache_priv2 = get_file(object->file);
623 			spin_unlock(&object->lock);
624 		}
625 	}
626 
627 	if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) {
628 		pr_err("failed to get cres->file\n");
629 		return false;
630 	}
631 
632 	return true;
633 }
634