// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_format.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data.  This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */

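/*
 * Illustrative sketch (not part of the original file) of the lifecycle
 * described above: create an unlinked xfile, stage a value in it, read it
 * back through the page cache, and tear everything down.  The function name,
 * staged value, and offset are invented for the example; only the xfile_*
 * calls come from this file.
 */
static inline int
xfile_lifecycle_sketch(void)
{
	struct xfile		*xf;
	__u64			value = 42;
	__u64			readback = 0;
	int			error;

	error = xfile_create("lifecycle sketch", 0, &xf);
	if (error)
		return error;

	/* Stage eight bytes at offset zero of the unlinked shmem file. */
	if (xfile_pwrite(xf, &value, sizeof(value), 0) != sizeof(value)) {
		error = -EIO;
		goto out_destroy;
	}

	/* Read the staged bytes straight back out of the page cache. */
	if (xfile_pread(xf, &readback, sizeof(readback), 0) != sizeof(readback))
		error = -EIO;

out_destroy:
	xfile_destroy(xf);
	return error;
}
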
/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}

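/*
 * Illustrative sketch (not part of the original file): treat the xfile as a
 * flat array of fixed-size records, the way the xfarray layer does.  The
 * function name and record geometry are invented for the example; reads of
 * never-written regions come back zeroed, per the hole handling above.
 */
static inline ssize_t
xfile_load_record_sketch(
	struct xfile		*xf,
	void			*rec,
	size_t			recsize,
	uint64_t		idx)
{
	/* Byte position of the idx'th record in the flat record array. */
	loff_t			pos = (loff_t)idx * recsize;

	return xfile_pread(xf, rec, recsize, pos);
}
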
/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over it.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}

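/*
 * Illustrative counterpart to the load sketch above (not part of the original
 * file): store the idx'th fixed-size record, turning a short write into -EIO
 * so callers see either success or a negative errno.  Names are invented for
 * the example.
 */
static inline int
xfile_store_record_sketch(
	struct xfile		*xf,
	const void		*rec,
	size_t			recsize,
	uint64_t		idx)
{
	loff_t			pos = (loff_t)idx * recsize;
	ssize_t			written;

	written = xfile_pwrite(xf, rec, recsize, pos);
	if (written < 0)
		return written;
	if (written != recsize)
		return -EIO;
	return 0;
}
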
/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}

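/*
 * Illustrative sketch (not part of the original file): visit each page-sized
 * chunk of written data in a sparse xfile.  The callback signature is
 * hypothetical; a SEEK_DATA result of -ENXIO simply means that there is no
 * written data left.
 */
static inline int
xfile_walk_data_sketch(
	struct xfile		*xf,
	int			(*fn)(struct xfile *xf, loff_t pos))
{
	loff_t			pos = xfile_seek_data(xf, 0);
	int			error;

	while (pos >= 0) {
		error = fn(xf, pos);
		if (error)
			return error;

		/* Skip to the next page and look for more written data. */
		pos = xfile_seek_data(xf, round_up(pos + 1, PAGE_SIZE));
	}

	return pos == -ENXIO ? 0 : pos;
}
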
/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}

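/*
 * Illustrative sketch (not part of the original file): use xfile_stat to
 * compare how much storage an xfile is actually consuming against its
 * nominal size.  The debug message is invented for the example.
 */
static inline void
xfile_report_usage_sketch(
	struct xfile		*xf)
{
	struct xfile_stat	xs;

	if (xfile_stat(xf, &xs))
		return;

	/* bytes counts allocated storage; size is the file's logical EOF. */
	pr_debug("xfile holds %llu bytes backing %llu bytes of file data\n",
			(unsigned long long)xs.bytes,
			(unsigned long long)xs.size);
}
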
/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if the
 * object would span a page boundary, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over it.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark the page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

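/*
 * Illustrative sketch (not part of the original file): the usual pairing of
 * xfile_get_page and xfile_put_page to modify an object in place through the
 * kernel mapping.  The function name is invented for the example; the object
 * must not cross a page boundary or xfile_get_page returns -ENOTBLK.
 */
static inline int
xfile_poke_sketch(
	struct xfile		*xf,
	loff_t			pos,
	const void		*src,
	unsigned int		len)
{
	struct xfile_page	xfpage = { };
	void			*kaddr;
	int			error;

	error = xfile_get_page(xf, pos, len, &xfpage);
	if (error)
		return error;

	/* The page comes back locked, uptodate, and dirty; copy into it. */
	kaddr = kmap_local_page(xfpage.page);
	memcpy(kaddr + offset_in_page(pos), src, len);
	kunmap_local(kaddr);

	return xfile_put_page(xf, &xfpage);
}
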
/*
 * Release the (locked) page for a memory object.  Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
421