xref: /openbmc/linux/fs/overlayfs/file.c (revision 2fe60ec9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 Red Hat, Inc.
4  */
5 
6 #include <linux/cred.h>
7 #include <linux/file.h>
8 #include <linux/mount.h>
9 #include <linux/xattr.h>
10 #include <linux/uio.h>
11 #include <linux/uaccess.h>
12 #include <linux/splice.h>
13 #include <linux/security.h>
14 #include <linux/mm.h>
15 #include <linux/fs.h>
16 #include "overlayfs.h"
17 
/*
 * State for one asynchronous read/write that is forwarded to the real file.
 * Instances are allocated from ovl_aio_request_cachep.
 */
struct ovl_aio_req {
	/* Cloned kiocb that is submitted to the real file */
	struct kiocb iocb;
	/* Set to 2 at submission; the request is freed on the last ovl_aio_put() */
	refcount_t ref;
	/* The caller's kiocb on the overlay file; completed via ->ki_complete */
	struct kiocb *orig_iocb;
	/* Real file reference; fdput() on the last ovl_aio_put() */
	struct fd fd;
};
24 
/* Slab cache for struct ovl_aio_req, set up by ovl_aio_request_cache_init() */
static struct kmem_cache *ovl_aio_request_cachep;
26 
27 static char ovl_whatisit(struct inode *inode, struct inode *realinode)
28 {
29 	if (realinode != ovl_inode_upper(inode))
30 		return 'l';
31 	if (ovl_has_upperdata(inode))
32 		return 'u';
33 	else
34 		return 'm';
35 }
36 
/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
39 
40 static struct file *ovl_open_realfile(const struct file *file,
41 				      struct inode *realinode)
42 {
43 	struct inode *inode = file_inode(file);
44 	struct file *realfile;
45 	const struct cred *old_cred;
46 	int flags = file->f_flags | OVL_OPEN_FLAGS;
47 	int acc_mode = ACC_MODE(flags);
48 	int err;
49 
50 	if (flags & O_APPEND)
51 		acc_mode |= MAY_APPEND;
52 
53 	old_cred = ovl_override_creds(inode->i_sb);
54 	err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
55 	if (err) {
56 		realfile = ERR_PTR(err);
57 	} else {
58 		if (!inode_owner_or_capable(&init_user_ns, realinode))
59 			flags &= ~O_NOATIME;
60 
61 		realfile = open_with_fake_path(&file->f_path, flags, realinode,
62 					       current_cred());
63 	}
64 	revert_creds(old_cred);
65 
66 	pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
67 		 file, file, ovl_whatisit(inode, realinode), file->f_flags,
68 		 realfile, IS_ERR(realfile) ? 0 : realfile->f_flags);
69 
70 	return realfile;
71 }
72 
/* The only f_flags bits that ovl_change_flags() will update on a file */
#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
74 
75 static int ovl_change_flags(struct file *file, unsigned int flags)
76 {
77 	struct inode *inode = file_inode(file);
78 	int err;
79 
80 	flags &= OVL_SETFL_MASK;
81 
82 	if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
83 		return -EPERM;
84 
85 	if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
86 		return -EINVAL;
87 
88 	if (file->f_op->check_flags) {
89 		err = file->f_op->check_flags(flags);
90 		if (err)
91 			return err;
92 	}
93 
94 	spin_lock(&file->f_lock);
95 	file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
96 	spin_unlock(&file->f_lock);
97 
98 	return 0;
99 }
100 
101 static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
102 			       bool allow_meta)
103 {
104 	struct inode *inode = file_inode(file);
105 	struct inode *realinode;
106 
107 	real->flags = 0;
108 	real->file = file->private_data;
109 
110 	if (allow_meta)
111 		realinode = ovl_inode_real(inode);
112 	else
113 		realinode = ovl_inode_realdata(inode);
114 
115 	/* Has it been copied up since we'd opened it? */
116 	if (unlikely(file_inode(real->file) != realinode)) {
117 		real->flags = FDPUT_FPUT;
118 		real->file = ovl_open_realfile(file, realinode);
119 
120 		return PTR_ERR_OR_ZERO(real->file);
121 	}
122 
123 	/* Did the flags change since open? */
124 	if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
125 		return ovl_change_flags(real->file, file->f_flags);
126 
127 	return 0;
128 }
129 
130 static int ovl_real_fdget(const struct file *file, struct fd *real)
131 {
132 	if (d_is_dir(file_dentry(file))) {
133 		real->flags = 0;
134 		real->file = ovl_dir_real_file(file, false);
135 
136 		return PTR_ERR_OR_ZERO(real->file);
137 	}
138 
139 	return ovl_real_fdget_meta(file, real, false);
140 }
141 
142 static int ovl_open(struct inode *inode, struct file *file)
143 {
144 	struct file *realfile;
145 	int err;
146 
147 	err = ovl_maybe_copy_up(file_dentry(file), file->f_flags);
148 	if (err)
149 		return err;
150 
151 	/* No longer need these flags, so don't pass them on to underlying fs */
152 	file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
153 
154 	realfile = ovl_open_realfile(file, ovl_inode_realdata(inode));
155 	if (IS_ERR(realfile))
156 		return PTR_ERR(realfile);
157 
158 	file->private_data = realfile;
159 
160 	return 0;
161 }
162 
163 static int ovl_release(struct inode *inode, struct file *file)
164 {
165 	fput(file->private_data);
166 
167 	return 0;
168 }
169 
170 static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
171 {
172 	struct inode *inode = file_inode(file);
173 	struct fd real;
174 	const struct cred *old_cred;
175 	loff_t ret;
176 
177 	/*
178 	 * The two special cases below do not need to involve real fs,
179 	 * so we can optimizing concurrent callers.
180 	 */
181 	if (offset == 0) {
182 		if (whence == SEEK_CUR)
183 			return file->f_pos;
184 
185 		if (whence == SEEK_SET)
186 			return vfs_setpos(file, 0, 0);
187 	}
188 
189 	ret = ovl_real_fdget(file, &real);
190 	if (ret)
191 		return ret;
192 
193 	/*
194 	 * Overlay file f_pos is the master copy that is preserved
195 	 * through copy up and modified on read/write, but only real
196 	 * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose
197 	 * limitations that are more strict than ->s_maxbytes for specific
198 	 * files, so we use the real file to perform seeks.
199 	 */
200 	ovl_inode_lock(inode);
201 	real.file->f_pos = file->f_pos;
202 
203 	old_cred = ovl_override_creds(inode->i_sb);
204 	ret = vfs_llseek(real.file, offset, whence);
205 	revert_creds(old_cred);
206 
207 	file->f_pos = real.file->f_pos;
208 	ovl_inode_unlock(inode);
209 
210 	fdput(real);
211 
212 	return ret;
213 }
214 
215 static void ovl_file_accessed(struct file *file)
216 {
217 	struct inode *inode, *upperinode;
218 
219 	if (file->f_flags & O_NOATIME)
220 		return;
221 
222 	inode = file_inode(file);
223 	upperinode = ovl_inode_upper(inode);
224 
225 	if (!upperinode)
226 		return;
227 
228 	if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
229 	     !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) {
230 		inode->i_mtime = upperinode->i_mtime;
231 		inode->i_ctime = upperinode->i_ctime;
232 	}
233 
234 	touch_atime(&file->f_path);
235 }
236 
237 static rwf_t ovl_iocb_to_rwf(int ifl)
238 {
239 	rwf_t flags = 0;
240 
241 	if (ifl & IOCB_NOWAIT)
242 		flags |= RWF_NOWAIT;
243 	if (ifl & IOCB_HIPRI)
244 		flags |= RWF_HIPRI;
245 	if (ifl & IOCB_DSYNC)
246 		flags |= RWF_DSYNC;
247 	if (ifl & IOCB_SYNC)
248 		flags |= RWF_SYNC;
249 
250 	return flags;
251 }
252 
253 static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
254 {
255 	if (refcount_dec_and_test(&aio_req->ref)) {
256 		fdput(aio_req->fd);
257 		kmem_cache_free(ovl_aio_request_cachep, aio_req);
258 	}
259 }
260 
261 static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
262 {
263 	struct kiocb *iocb = &aio_req->iocb;
264 	struct kiocb *orig_iocb = aio_req->orig_iocb;
265 
266 	if (iocb->ki_flags & IOCB_WRITE) {
267 		struct inode *inode = file_inode(orig_iocb->ki_filp);
268 
269 		/* Actually acquired in ovl_write_iter() */
270 		__sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
271 				      SB_FREEZE_WRITE);
272 		file_end_write(iocb->ki_filp);
273 		ovl_copyattr(ovl_inode_real(inode), inode);
274 	}
275 
276 	orig_iocb->ki_pos = iocb->ki_pos;
277 	ovl_aio_put(aio_req);
278 }
279 
280 static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
281 {
282 	struct ovl_aio_req *aio_req = container_of(iocb,
283 						   struct ovl_aio_req, iocb);
284 	struct kiocb *orig_iocb = aio_req->orig_iocb;
285 
286 	ovl_aio_cleanup_handler(aio_req);
287 	orig_iocb->ki_complete(orig_iocb, res);
288 }
289 
/*
 * ->read_iter(): forward the read to the real file.
 *
 * Synchronous kiocbs are served with vfs_iter_read().  Asynchronous ones get
 * a struct ovl_aio_req whose cloned kiocb is submitted to the real file with
 * vfs_iocb_iter_read(); completion is reported through ovl_aio_rw_complete().
 *
 * Returns 0 for an empty @iter, -EINVAL for IOCB_DIRECT on a real file
 * without FMODE_CAN_ODIRECT, -ENOMEM if the aio request cannot be allocated,
 * otherwise the result of the read on the real file.
 */
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct fd real;
	const struct cred *old_cred;
	ssize_t ret;

	if (!iov_iter_count(iter))
		return 0;

	ret = ovl_real_fdget(file, &real);
	if (ret)
		return ret;

	/* Reject direct I/O that the real file cannot do */
	ret = -EINVAL;
	if (iocb->ki_flags & IOCB_DIRECT &&
	    !(real.file->f_mode & FMODE_CAN_ODIRECT))
		goto out_fdput;

	old_cred = ovl_override_creds(file_inode(file)->i_sb);
	if (is_sync_kiocb(iocb)) {
		ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
				    ovl_iocb_to_rwf(iocb->ki_flags));
	} else {
		struct ovl_aio_req *aio_req;

		ret = -ENOMEM;
		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
		if (!aio_req)
			goto out;

		/*
		 * The request takes over the real file reference; it is
		 * released by the final ovl_aio_put().  Clearing real.flags
		 * is presumably what keeps the fdput(real) at the end of this
		 * function from dropping it a second time.
		 */
		aio_req->fd = real;
		real.flags = 0;
		aio_req->orig_iocb = iocb;
		kiocb_clone(&aio_req->iocb, iocb, real.file);
		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
		/*
		 * Two references: one is dropped right after submission
		 * below, the other by ovl_aio_cleanup_handler().
		 */
		refcount_set(&aio_req->ref, 2);
		ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
		ovl_aio_put(aio_req);
		/* Not queued: clean up here instead of in ovl_aio_rw_complete() */
		if (ret != -EIOCBQUEUED)
			ovl_aio_cleanup_handler(aio_req);
	}
out:
	revert_creds(old_cred);
	ovl_file_accessed(file);
out_fdput:
	fdput(real);

	return ret;
}
340 
/*
 * ->write_iter(): forward the write to the real file.
 *
 * Runs under the overlay inode lock.  The overlay inode attributes are
 * refreshed and file_remove_privs() is called before the write is issued.
 * Synchronous kiocbs use vfs_iter_write() bracketed by
 * file_start_write()/file_end_write(); asynchronous ones submit a cloned
 * kiocb via vfs_iocb_iter_write() and finish in ovl_aio_cleanup_handler().
 *
 * Returns 0 for an empty @iter, -EINVAL for IOCB_DIRECT on a real file
 * without FMODE_CAN_ODIRECT, -ENOMEM if the aio request cannot be allocated,
 * an error from file_remove_privs()/ovl_real_fdget(), otherwise the result of
 * the write on the real file.
 */
static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct fd real;
	const struct cred *old_cred;
	ssize_t ret;
	int ifl = iocb->ki_flags;

	if (!iov_iter_count(iter))
		return 0;

	inode_lock(inode);
	/* Update mode */
	ovl_copyattr(ovl_inode_real(inode), inode);
	ret = file_remove_privs(file);
	if (ret)
		goto out_unlock;

	ret = ovl_real_fdget(file, &real);
	if (ret)
		goto out_unlock;

	/* Reject direct I/O that the real file cannot do */
	ret = -EINVAL;
	if (iocb->ki_flags & IOCB_DIRECT &&
	    !(real.file->f_mode & FMODE_CAN_ODIRECT))
		goto out_fdput;

	/* Strip the sync flags when ovl_should_sync() says not to sync */
	if (!ovl_should_sync(OVL_FS(inode->i_sb)))
		ifl &= ~(IOCB_DSYNC | IOCB_SYNC);

	old_cred = ovl_override_creds(file_inode(file)->i_sb);
	if (is_sync_kiocb(iocb)) {
		file_start_write(real.file);
		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
				     ovl_iocb_to_rwf(ifl));
		file_end_write(real.file);
		/* Update size */
		ovl_copyattr(ovl_inode_real(inode), inode);
	} else {
		struct ovl_aio_req *aio_req;

		ret = -ENOMEM;
		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
		if (!aio_req)
			goto out;

		/* Ended by file_end_write() in ovl_aio_cleanup_handler() */
		file_start_write(real.file);
		/* Pacify lockdep, same trick as done in aio_write() */
		__sb_writers_release(file_inode(real.file)->i_sb,
				     SB_FREEZE_WRITE);
		/*
		 * The request takes over the real file reference; it is
		 * released by the final ovl_aio_put().  Clearing real.flags
		 * is presumably what keeps the fdput(real) at the end of this
		 * function from dropping it a second time.
		 */
		aio_req->fd = real;
		real.flags = 0;
		aio_req->orig_iocb = iocb;
		kiocb_clone(&aio_req->iocb, iocb, real.file);
		/* Use the flags with the sync bits possibly stripped above */
		aio_req->iocb.ki_flags = ifl;
		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
		/*
		 * Two references: one is dropped right after submission
		 * below, the other by ovl_aio_cleanup_handler().
		 */
		refcount_set(&aio_req->ref, 2);
		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
		ovl_aio_put(aio_req);
		/* Not queued: clean up here instead of in ovl_aio_rw_complete() */
		if (ret != -EIOCBQUEUED)
			ovl_aio_cleanup_handler(aio_req);
	}
out:
	revert_creds(old_cred);
out_fdput:
	fdput(real);

out_unlock:
	inode_unlock(inode);

	return ret;
}
414 
415 /*
416  * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
417  * due to lock order inversion between pipe->mutex in iter_file_splice_write()
418  * and file_start_write(real.file) in ovl_write_iter().
419  *
420  * So do everything ovl_write_iter() does and call iter_file_splice_write() on
421  * the real file.
422  */
423 static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
424 				loff_t *ppos, size_t len, unsigned int flags)
425 {
426 	struct fd real;
427 	const struct cred *old_cred;
428 	struct inode *inode = file_inode(out);
429 	struct inode *realinode = ovl_inode_real(inode);
430 	ssize_t ret;
431 
432 	inode_lock(inode);
433 	/* Update mode */
434 	ovl_copyattr(realinode, inode);
435 	ret = file_remove_privs(out);
436 	if (ret)
437 		goto out_unlock;
438 
439 	ret = ovl_real_fdget(out, &real);
440 	if (ret)
441 		goto out_unlock;
442 
443 	old_cred = ovl_override_creds(inode->i_sb);
444 	file_start_write(real.file);
445 
446 	ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
447 
448 	file_end_write(real.file);
449 	/* Update size */
450 	ovl_copyattr(realinode, inode);
451 	revert_creds(old_cred);
452 	fdput(real);
453 
454 out_unlock:
455 	inode_unlock(inode);
456 
457 	return ret;
458 }
459 
460 static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
461 {
462 	struct fd real;
463 	const struct cred *old_cred;
464 	int ret;
465 
466 	ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
467 	if (ret <= 0)
468 		return ret;
469 
470 	ret = ovl_real_fdget_meta(file, &real, !datasync);
471 	if (ret)
472 		return ret;
473 
474 	/* Don't sync lower file for fear of receiving EROFS error */
475 	if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
476 		old_cred = ovl_override_creds(file_inode(file)->i_sb);
477 		ret = vfs_fsync_range(real.file, start, end, datasync);
478 		revert_creds(old_cred);
479 	}
480 
481 	fdput(real);
482 
483 	return ret;
484 }
485 
486 static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
487 {
488 	struct file *realfile = file->private_data;
489 	const struct cred *old_cred;
490 	int ret;
491 
492 	if (!realfile->f_op->mmap)
493 		return -ENODEV;
494 
495 	if (WARN_ON(file != vma->vm_file))
496 		return -EIO;
497 
498 	vma_set_file(vma, realfile);
499 
500 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
501 	ret = call_mmap(vma->vm_file, vma);
502 	revert_creds(old_cred);
503 	ovl_file_accessed(file);
504 
505 	return ret;
506 }
507 
508 static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
509 {
510 	struct inode *inode = file_inode(file);
511 	struct fd real;
512 	const struct cred *old_cred;
513 	int ret;
514 
515 	ret = ovl_real_fdget(file, &real);
516 	if (ret)
517 		return ret;
518 
519 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
520 	ret = vfs_fallocate(real.file, mode, offset, len);
521 	revert_creds(old_cred);
522 
523 	/* Update size */
524 	ovl_copyattr(ovl_inode_real(inode), inode);
525 
526 	fdput(real);
527 
528 	return ret;
529 }
530 
531 static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
532 {
533 	struct fd real;
534 	const struct cred *old_cred;
535 	int ret;
536 
537 	ret = ovl_real_fdget(file, &real);
538 	if (ret)
539 		return ret;
540 
541 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
542 	ret = vfs_fadvise(real.file, offset, len, advice);
543 	revert_creds(old_cred);
544 
545 	fdput(real);
546 
547 	return ret;
548 }
549 
/* Selects which VFS helper ovl_copyfile() runs on the real files */
enum ovl_copyop {
	OVL_COPY,	/* vfs_copy_file_range() */
	OVL_CLONE,	/* vfs_clone_file_range() */
	OVL_DEDUPE,	/* vfs_dedupe_file_range_one() */
};
555 
556 static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
557 			    struct file *file_out, loff_t pos_out,
558 			    loff_t len, unsigned int flags, enum ovl_copyop op)
559 {
560 	struct inode *inode_out = file_inode(file_out);
561 	struct fd real_in, real_out;
562 	const struct cred *old_cred;
563 	loff_t ret;
564 
565 	ret = ovl_real_fdget(file_out, &real_out);
566 	if (ret)
567 		return ret;
568 
569 	ret = ovl_real_fdget(file_in, &real_in);
570 	if (ret) {
571 		fdput(real_out);
572 		return ret;
573 	}
574 
575 	old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
576 	switch (op) {
577 	case OVL_COPY:
578 		ret = vfs_copy_file_range(real_in.file, pos_in,
579 					  real_out.file, pos_out, len, flags);
580 		break;
581 
582 	case OVL_CLONE:
583 		ret = vfs_clone_file_range(real_in.file, pos_in,
584 					   real_out.file, pos_out, len, flags);
585 		break;
586 
587 	case OVL_DEDUPE:
588 		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
589 						real_out.file, pos_out, len,
590 						flags);
591 		break;
592 	}
593 	revert_creds(old_cred);
594 
595 	/* Update size */
596 	ovl_copyattr(ovl_inode_real(inode_out), inode_out);
597 
598 	fdput(real_in);
599 	fdput(real_out);
600 
601 	return ret;
602 }
603 
604 static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
605 				   struct file *file_out, loff_t pos_out,
606 				   size_t len, unsigned int flags)
607 {
608 	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
609 			    OVL_COPY);
610 }
611 
612 static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
613 				   struct file *file_out, loff_t pos_out,
614 				   loff_t len, unsigned int remap_flags)
615 {
616 	enum ovl_copyop op;
617 
618 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
619 		return -EINVAL;
620 
621 	if (remap_flags & REMAP_FILE_DEDUP)
622 		op = OVL_DEDUPE;
623 	else
624 		op = OVL_CLONE;
625 
626 	/*
627 	 * Don't copy up because of a dedupe request, this wouldn't make sense
628 	 * most of the time (data would be duplicated instead of deduplicated).
629 	 */
630 	if (op == OVL_DEDUPE &&
631 	    (!ovl_inode_upper(file_inode(file_in)) ||
632 	     !ovl_inode_upper(file_inode(file_out))))
633 		return -EPERM;
634 
635 	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
636 			    remap_flags, op);
637 }
638 
639 static int ovl_flush(struct file *file, fl_owner_t id)
640 {
641 	struct fd real;
642 	const struct cred *old_cred;
643 	int err;
644 
645 	err = ovl_real_fdget(file, &real);
646 	if (err)
647 		return err;
648 
649 	if (real.file->f_op->flush) {
650 		old_cred = ovl_override_creds(file_inode(file)->i_sb);
651 		err = real.file->f_op->flush(real.file, id);
652 		revert_creds(old_cred);
653 	}
654 	fdput(real);
655 
656 	return err;
657 }
658 
/*
 * File operations table wiring up the ovl_* handlers defined in this file.
 * ->splice_read is the only entry that uses a generic helper directly.
 */
const struct file_operations ovl_file_operations = {
	.open		= ovl_open,
	.release	= ovl_release,
	.llseek		= ovl_llseek,
	.read_iter	= ovl_read_iter,
	.write_iter	= ovl_write_iter,
	.fsync		= ovl_fsync,
	.mmap		= ovl_mmap,
	.fallocate	= ovl_fallocate,
	.fadvise	= ovl_fadvise,
	.flush		= ovl_flush,
	.splice_read    = generic_file_splice_read,
	/* Not iter_file_splice_write() directly; see ovl_splice_write() */
	.splice_write   = ovl_splice_write,

	.copy_file_range	= ovl_copy_file_range,
	.remap_file_range	= ovl_remap_file_range,
};
676 
677 int __init ovl_aio_request_cache_init(void)
678 {
679 	ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
680 						   sizeof(struct ovl_aio_req),
681 						   0, SLAB_HWCACHE_ALIGN, NULL);
682 	if (!ovl_aio_request_cachep)
683 		return -ENOMEM;
684 
685 	return 0;
686 }
687 
688 void ovl_aio_request_cache_destroy(void)
689 {
690 	kmem_cache_destroy(ovl_aio_request_cachep);
691 }
692