// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory. This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times. In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data. This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken. Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file. If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file. These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
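
/*
 * A minimal usage sketch, assuming a caller that stages a single value and
 * reads it back; the description string is illustrative and error handling
 * is abbreviated:
 *
 *	struct xfile	*xf;
 *	__u64		v = 42;
 *	ssize_t		ret;
 *	int		error;
 *
 *	error = xfile_create("example staging", 0, &xf);
 *	if (error)
 *		return error;
 *	ret = xfile_pwrite(xf, &v, sizeof(v), 0);
 *	if (ret >= 0)
 *		ret = xfile_pread(xf, &v, sizeof(v), 0);
 *	xfile_destroy(xf);
 *	return ret < 0 ? ret : 0;
 */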

/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size. The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache. Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read. Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole. Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
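
/*
 * Callers that load fixed-size objects typically want a short read to be an
 * error rather than a success. A hedged sketch of such a wrapper (the
 * function name is hypothetical, not part of this file):
 *
 *	static inline int
 *	xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
 *	{
 *		ssize_t	ret = xfile_pread(xf, buf, count, pos);
 *
 *		if (ret < 0)
 *			return ret;
 *		if (ret != count)
 *			return -ENODATA;
 *		return 0;
 *	}
 */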

/*
 * Write a memory object directly to the xfile's page cache. Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write. Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal write
		 * path. shmem doesn't support fs freeze, but lockdep doesn't
		 * know that and would complain if we took those locks.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush. If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}
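
/*
 * A hedged sketch of a caller-side wrapper that turns short writes into
 * -ENOMEM, on the theory that a short write to an in-memory file means we
 * ran out of pages; the wrapper name is hypothetical:
 *
 *	static inline int
 *	xfile_obj_store(struct xfile *xf, const void *buf, size_t count,
 *			loff_t pos)
 *	{
 *		ssize_t	ret = xfile_pwrite(xf, buf, count, pos);
 *
 *		if (ret < 0)
 *			return ret;
 *		if (ret != count)
 *			return -ENOMEM;
 *		return 0;
 *	}
 */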

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
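
/*
 * A sketch of how a caller might walk only the written regions of a sparse
 * xfile, one page at a time. SEEK_DATA returns -ENXIO once there is no more
 * data, which this loop treats as a clean end:
 *
 *	loff_t pos = 0;
 *
 *	while ((pos = xfile_seek_data(xf, pos)) >= 0) {
 *		... process the written page at pos ...
 *		pos = round_up(pos + 1, PAGE_SIZE);
 *	}
 *	if (pos != -ENXIO)
 *		return pos;
 */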

/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}
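
/*
 * A hedged example of checking an xfile's memory consumption, e.g. when
 * deciding whether a staging structure has grown too large (the variable
 * names are illustrative):
 *
 *	struct xfile_stat	statbuf;
 *	int			error;
 *
 *	error = xfile_stat(xf, &statbuf);
 *	if (error)
 *		return error;
 *	(statbuf.size is the apparent file size; statbuf.bytes is the space
 *	 actually consumed, computed from the block count)
 */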

/*
 * Grab the (locked) page for a memory object. The object cannot span a page
 * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal write path.
	 * shmem doesn't support fs freeze, but lockdep doesn't know that and
	 * would complain if we took those locks.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

/*
 * Release the (locked) page for a memory object. Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
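
/*
 * Sketch of the direct-access pattern that pairs the two functions above.
 * The object type here is hypothetical, and the object must not cross a
 * page boundary, per xfile_get_page:
 *
 *	struct xfile_page	xfpage;
 *	struct foo		*p;
 *	void			*kaddr;
 *	int			error;
 *
 *	error = xfile_get_page(xf, pos, sizeof(struct foo), &xfpage);
 *	if (error)
 *		return error;
 *	kaddr = kmap_local_page(xfpage.page);
 *	p = kaddr + offset_in_page(pos);
 *	... read or update the object through p ...
 *	kunmap_local(kaddr);
 *	error = xfile_put_page(xf, &xfpage);
 */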