1 /* 2 * File operations used by nfsd. Some of these have been ripped from 3 * other parts of the kernel because they weren't exported, others 4 * are partial duplicates with added or changed functionality. 5 * 6 * Note that several functions dget() the dentry upon which they want 7 * to act, most notably those that create directory entries. Response 8 * dentry's are dput()'d if necessary in the release callback. 9 * So if you notice code paths that apparently fail to dput() the 10 * dentry, don't worry--they have been taken care of. 11 * 12 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> 13 * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> 14 */ 15 16 #include <linux/fs.h> 17 #include <linux/file.h> 18 #include <linux/splice.h> 19 #include <linux/falloc.h> 20 #include <linux/fcntl.h> 21 #include <linux/namei.h> 22 #include <linux/delay.h> 23 #include <linux/fsnotify.h> 24 #include <linux/posix_acl_xattr.h> 25 #include <linux/xattr.h> 26 #include <linux/jhash.h> 27 #include <linux/ima.h> 28 #include <linux/slab.h> 29 #include <asm/uaccess.h> 30 #include <linux/exportfs.h> 31 #include <linux/writeback.h> 32 #include <linux/security.h> 33 34 #ifdef CONFIG_NFSD_V3 35 #include "xdr3.h" 36 #endif /* CONFIG_NFSD_V3 */ 37 38 #ifdef CONFIG_NFSD_V4 39 #include "acl.h" 40 #include "idmap.h" 41 #endif /* CONFIG_NFSD_V4 */ 42 43 #include "nfsd.h" 44 #include "vfs.h" 45 46 #define NFSDDBG_FACILITY NFSDDBG_FILEOP 47 48 49 /* 50 * This is a cache of readahead params that help us choose the proper 51 * readahead strategy. Initially, we set all readahead parameters to 0 52 * and let the VFS handle things. 53 * If you increase the number of cached files very much, you'll need to 54 * add a hash table here. 
55 */ 56 struct raparms { 57 struct raparms *p_next; 58 unsigned int p_count; 59 ino_t p_ino; 60 dev_t p_dev; 61 int p_set; 62 struct file_ra_state p_ra; 63 unsigned int p_hindex; 64 }; 65 66 struct raparm_hbucket { 67 struct raparms *pb_head; 68 spinlock_t pb_lock; 69 } ____cacheline_aligned_in_smp; 70 71 #define RAPARM_HASH_BITS 4 72 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 73 #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 74 static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 75 76 /* 77 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 78 * a mount point. 79 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, 80 * or nfs_ok having possibly changed *dpp and *expp 81 */ 82 int 83 nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 84 struct svc_export **expp) 85 { 86 struct svc_export *exp = *expp, *exp2 = NULL; 87 struct dentry *dentry = *dpp; 88 struct path path = {.mnt = mntget(exp->ex_path.mnt), 89 .dentry = dget(dentry)}; 90 int err = 0; 91 92 err = follow_down(&path); 93 if (err < 0) 94 goto out; 95 96 exp2 = rqst_exp_get_by_name(rqstp, &path); 97 if (IS_ERR(exp2)) { 98 err = PTR_ERR(exp2); 99 /* 100 * We normally allow NFS clients to continue 101 * "underneath" a mountpoint that is not exported. 102 * The exception is V4ROOT, where no traversal is ever 103 * allowed without an explicit export of the new 104 * directory. 105 */ 106 if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) 107 err = 0; 108 path_put(&path); 109 goto out; 110 } 111 if (nfsd_v4client(rqstp) || 112 (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 113 /* successfully crossed mount point */ 114 /* 115 * This is subtle: path.dentry is *not* on path.mnt 116 * at this point. 
The only reason we are safe is that 117 * original mnt is pinned down by exp, so we should 118 * put path *before* putting exp 119 */ 120 *dpp = path.dentry; 121 path.dentry = dentry; 122 *expp = exp2; 123 exp2 = exp; 124 } 125 path_put(&path); 126 exp_put(exp2); 127 out: 128 return err; 129 } 130 131 static void follow_to_parent(struct path *path) 132 { 133 struct dentry *dp; 134 135 while (path->dentry == path->mnt->mnt_root && follow_up(path)) 136 ; 137 dp = dget_parent(path->dentry); 138 dput(path->dentry); 139 path->dentry = dp; 140 } 141 142 static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) 143 { 144 struct svc_export *exp2; 145 struct path path = {.mnt = mntget((*exp)->ex_path.mnt), 146 .dentry = dget(dparent)}; 147 148 follow_to_parent(&path); 149 150 exp2 = rqst_exp_parent(rqstp, &path); 151 if (PTR_ERR(exp2) == -ENOENT) { 152 *dentryp = dget(dparent); 153 } else if (IS_ERR(exp2)) { 154 path_put(&path); 155 return PTR_ERR(exp2); 156 } else { 157 *dentryp = dget(path.dentry); 158 exp_put(*exp); 159 *exp = exp2; 160 } 161 path_put(&path); 162 return 0; 163 } 164 165 /* 166 * For nfsd purposes, we treat V4ROOT exports as though there was an 167 * export at *every* directory. 
168 */ 169 int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) 170 { 171 if (d_mountpoint(dentry)) 172 return 1; 173 if (nfsd4_is_junction(dentry)) 174 return 1; 175 if (!(exp->ex_flags & NFSEXP_V4ROOT)) 176 return 0; 177 return d_inode(dentry) != NULL; 178 } 179 180 __be32 181 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 182 const char *name, unsigned int len, 183 struct svc_export **exp_ret, struct dentry **dentry_ret) 184 { 185 struct svc_export *exp; 186 struct dentry *dparent; 187 struct dentry *dentry; 188 int host_err; 189 190 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 191 192 dparent = fhp->fh_dentry; 193 exp = exp_get(fhp->fh_export); 194 195 /* Lookup the name, but don't follow links */ 196 if (isdotent(name, len)) { 197 if (len==1) 198 dentry = dget(dparent); 199 else if (dparent != exp->ex_path.dentry) 200 dentry = dget_parent(dparent); 201 else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) 202 dentry = dget(dparent); /* .. == . just like at / */ 203 else { 204 /* checking mountpoint crossing is very different when stepping up */ 205 host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); 206 if (host_err) 207 goto out_nfserr; 208 } 209 } else { 210 /* 211 * In the nfsd4_open() case, this may be held across 212 * subsequent open and delegation acquisition which may 213 * need to take the child's i_mutex: 214 */ 215 fh_lock_nested(fhp, I_MUTEX_PARENT); 216 dentry = lookup_one_len(name, dparent, len); 217 host_err = PTR_ERR(dentry); 218 if (IS_ERR(dentry)) 219 goto out_nfserr; 220 /* 221 * check if we have crossed a mount point ... 222 */ 223 if (nfsd_mountpoint(dentry, exp)) { 224 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 225 dput(dentry); 226 goto out_nfserr; 227 } 228 } 229 } 230 *dentry_ret = dentry; 231 *exp_ret = exp; 232 return 0; 233 234 out_nfserr: 235 exp_put(exp); 236 return nfserrno(host_err); 237 } 238 239 /* 240 * Look up one component of a pathname. 241 * N.B. 
After this call _both_ fhp and resfh need an fh_put 242 * 243 * If the lookup would cross a mountpoint, and the mounted filesystem 244 * is exported to the client with NFSEXP_NOHIDE, then the lookup is 245 * accepted as it stands and the mounted directory is 246 * returned. Otherwise the covered directory is returned. 247 * NOTE: this mountpoint crossing is not supported properly by all 248 * clients and is explicitly disallowed for NFSv3 249 * NeilBrown <neilb@cse.unsw.edu.au> 250 */ 251 __be32 252 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 253 unsigned int len, struct svc_fh *resfh) 254 { 255 struct svc_export *exp; 256 struct dentry *dentry; 257 __be32 err; 258 259 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); 260 if (err) 261 return err; 262 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); 263 if (err) 264 return err; 265 err = check_nfsd_access(exp, rqstp); 266 if (err) 267 goto out; 268 /* 269 * Note: we compose the file handle now, but as the 270 * dentry may be negative, it may need to be updated. 271 */ 272 err = fh_compose(resfh, exp, dentry, fhp); 273 if (!err && d_really_is_negative(dentry)) 274 err = nfserr_noent; 275 out: 276 dput(dentry); 277 exp_put(exp); 278 return err; 279 } 280 281 /* 282 * Commit metadata changes to stable storage. 283 */ 284 static int 285 commit_metadata(struct svc_fh *fhp) 286 { 287 struct inode *inode = d_inode(fhp->fh_dentry); 288 const struct export_operations *export_ops = inode->i_sb->s_export_op; 289 290 if (!EX_ISSYNC(fhp->fh_export)) 291 return 0; 292 293 if (export_ops->commit_metadata) 294 return export_ops->commit_metadata(inode); 295 return sync_inode_metadata(inode, 1); 296 } 297 298 /* 299 * Go over the attributes and take care of the small differences between 300 * NFS semantics and what Linux expects. 
301 */ 302 static void 303 nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) 304 { 305 /* sanitize the mode change */ 306 if (iap->ia_valid & ATTR_MODE) { 307 iap->ia_mode &= S_IALLUGO; 308 iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); 309 } 310 311 /* Revoke setuid/setgid on chown */ 312 if (!S_ISDIR(inode->i_mode) && 313 ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { 314 iap->ia_valid |= ATTR_KILL_PRIV; 315 if (iap->ia_valid & ATTR_MODE) { 316 /* we're setting mode too, just clear the s*id bits */ 317 iap->ia_mode &= ~S_ISUID; 318 if (iap->ia_mode & S_IXGRP) 319 iap->ia_mode &= ~S_ISGID; 320 } else { 321 /* set ATTR_KILL_* bits and let VFS handle it */ 322 iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); 323 } 324 } 325 } 326 327 static __be32 328 nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, 329 struct iattr *iap) 330 { 331 struct inode *inode = d_inode(fhp->fh_dentry); 332 int host_err; 333 334 if (iap->ia_size < inode->i_size) { 335 __be32 err; 336 337 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, 338 NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); 339 if (err) 340 return err; 341 } 342 343 host_err = get_write_access(inode); 344 if (host_err) 345 goto out_nfserrno; 346 347 host_err = locks_verify_truncate(inode, NULL, iap->ia_size); 348 if (host_err) 349 goto out_put_write_access; 350 return 0; 351 352 out_put_write_access: 353 put_write_access(inode); 354 out_nfserrno: 355 return nfserrno(host_err); 356 } 357 358 /* 359 * Set various file attributes. After this call fhp needs an fh_put. 
360 */ 361 __be32 362 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, 363 int check_guard, time_t guardtime) 364 { 365 struct dentry *dentry; 366 struct inode *inode; 367 int accmode = NFSD_MAY_SATTR; 368 umode_t ftype = 0; 369 __be32 err; 370 int host_err; 371 bool get_write_count; 372 int size_change = 0; 373 374 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) 375 accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; 376 if (iap->ia_valid & ATTR_SIZE) 377 ftype = S_IFREG; 378 379 /* Callers that do fh_verify should do the fh_want_write: */ 380 get_write_count = !fhp->fh_dentry; 381 382 /* Get inode */ 383 err = fh_verify(rqstp, fhp, ftype, accmode); 384 if (err) 385 goto out; 386 if (get_write_count) { 387 host_err = fh_want_write(fhp); 388 if (host_err) 389 return nfserrno(host_err); 390 } 391 392 dentry = fhp->fh_dentry; 393 inode = d_inode(dentry); 394 395 /* Ignore any mode updates on symlinks */ 396 if (S_ISLNK(inode->i_mode)) 397 iap->ia_valid &= ~ATTR_MODE; 398 399 if (!iap->ia_valid) 400 goto out; 401 402 nfsd_sanitize_attrs(inode, iap); 403 404 /* 405 * The size case is special, it changes the file in addition to the 406 * attributes. 407 */ 408 if (iap->ia_valid & ATTR_SIZE) { 409 err = nfsd_get_write_access(rqstp, fhp, iap); 410 if (err) 411 goto out; 412 size_change = 1; 413 414 /* 415 * RFC5661, Section 18.30.4: 416 * Changing the size of a file with SETATTR indirectly 417 * changes the time_modify and change attributes. 
418 * 419 * (and similar for the older RFCs) 420 */ 421 if (iap->ia_size != i_size_read(inode)) 422 iap->ia_valid |= ATTR_MTIME; 423 } 424 425 iap->ia_valid |= ATTR_CTIME; 426 427 if (check_guard && guardtime != inode->i_ctime.tv_sec) { 428 err = nfserr_notsync; 429 goto out_put_write_access; 430 } 431 432 fh_lock(fhp); 433 host_err = notify_change(dentry, iap, NULL); 434 fh_unlock(fhp); 435 err = nfserrno(host_err); 436 437 out_put_write_access: 438 if (size_change) 439 put_write_access(inode); 440 if (!err) 441 err = nfserrno(commit_metadata(fhp)); 442 out: 443 return err; 444 } 445 446 #if defined(CONFIG_NFSD_V4) 447 /* 448 * NFS junction information is stored in an extended attribute. 449 */ 450 #define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" 451 452 /** 453 * nfsd4_is_junction - Test if an object could be an NFS junction 454 * 455 * @dentry: object to test 456 * 457 * Returns 1 if "dentry" appears to contain NFS junction information. 458 * Otherwise 0 is returned. 
459 */ 460 int nfsd4_is_junction(struct dentry *dentry) 461 { 462 struct inode *inode = d_inode(dentry); 463 464 if (inode == NULL) 465 return 0; 466 if (inode->i_mode & S_IXUGO) 467 return 0; 468 if (!(inode->i_mode & S_ISVTX)) 469 return 0; 470 if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) 471 return 0; 472 return 1; 473 } 474 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL 475 __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, 476 struct xdr_netobj *label) 477 { 478 __be32 error; 479 int host_error; 480 struct dentry *dentry; 481 482 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); 483 if (error) 484 return error; 485 486 dentry = fhp->fh_dentry; 487 488 mutex_lock(&d_inode(dentry)->i_mutex); 489 host_error = security_inode_setsecctx(dentry, label->data, label->len); 490 mutex_unlock(&d_inode(dentry)->i_mutex); 491 return nfserrno(host_error); 492 } 493 #else 494 __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, 495 struct xdr_netobj *label) 496 { 497 return nfserr_notsupp; 498 } 499 #endif 500 501 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, 502 struct file *file, loff_t offset, loff_t len, 503 int flags) 504 { 505 int error; 506 507 if (!S_ISREG(file_inode(file)->i_mode)) 508 return nfserr_inval; 509 510 error = vfs_fallocate(file, flags, offset, len); 511 if (!error) 512 error = commit_metadata(fhp); 513 514 return nfserrno(error); 515 } 516 #endif /* defined(CONFIG_NFSD_V4) */ 517 518 #ifdef CONFIG_NFSD_V3 519 /* 520 * Check server access rights to a file system object 521 */ 522 struct accessmap { 523 u32 access; 524 int how; 525 }; 526 static struct accessmap nfs3_regaccess[] = { 527 { NFS3_ACCESS_READ, NFSD_MAY_READ }, 528 { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, 529 { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, 530 { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, 531 532 { 0, 0 } 533 }; 534 535 static struct accessmap nfs3_diraccess[] = { 536 { NFS3_ACCESS_READ, 
NFSD_MAY_READ }, 537 { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, 538 { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, 539 { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, 540 { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, 541 542 { 0, 0 } 543 }; 544 545 static struct accessmap nfs3_anyaccess[] = { 546 /* Some clients - Solaris 2.6 at least, make an access call 547 * to the server to check for access for things like /dev/null 548 * (which really, the server doesn't care about). So 549 * We provide simple access checking for them, looking 550 * mainly at mode bits, and we make sure to ignore read-only 551 * filesystem checks 552 */ 553 { NFS3_ACCESS_READ, NFSD_MAY_READ }, 554 { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, 555 { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, 556 { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, 557 558 { 0, 0 } 559 }; 560 561 __be32 562 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) 563 { 564 struct accessmap *map; 565 struct svc_export *export; 566 struct dentry *dentry; 567 u32 query, result = 0, sresult = 0; 568 __be32 error; 569 570 error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); 571 if (error) 572 goto out; 573 574 export = fhp->fh_export; 575 dentry = fhp->fh_dentry; 576 577 if (d_is_reg(dentry)) 578 map = nfs3_regaccess; 579 else if (d_is_dir(dentry)) 580 map = nfs3_diraccess; 581 else 582 map = nfs3_anyaccess; 583 584 585 query = *access; 586 for (; map->access; map++) { 587 if (map->access & query) { 588 __be32 err2; 589 590 sresult |= map->access; 591 592 err2 = nfsd_permission(rqstp, export, dentry, map->how); 593 switch (err2) { 594 case nfs_ok: 595 result |= map->access; 596 break; 597 598 /* the following error codes just mean the access was not allowed, 599 * rather than an error occurred */ 600 case nfserr_rofs: 601 case nfserr_acces: 602 case nfserr_perm: 603 /* simply don't "or" in the access bit. 
*/ 604 break; 605 default: 606 error = err2; 607 goto out; 608 } 609 } 610 } 611 *access = result; 612 if (supported) 613 *supported = sresult; 614 615 out: 616 return error; 617 } 618 #endif /* CONFIG_NFSD_V3 */ 619 620 static int nfsd_open_break_lease(struct inode *inode, int access) 621 { 622 unsigned int mode; 623 624 if (access & NFSD_MAY_NOT_BREAK_LEASE) 625 return 0; 626 mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; 627 return break_lease(inode, mode | O_NONBLOCK); 628 } 629 630 /* 631 * Open an existing file or directory. 632 * The may_flags argument indicates the type of open (read/write/lock) 633 * and additional flags. 634 * N.B. After this call fhp needs an fh_put 635 */ 636 __be32 637 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, 638 int may_flags, struct file **filp) 639 { 640 struct path path; 641 struct inode *inode; 642 struct file *file; 643 int flags = O_RDONLY|O_LARGEFILE; 644 __be32 err; 645 int host_err = 0; 646 647 validate_process_creds(); 648 649 /* 650 * If we get here, then the client has already done an "open", 651 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 652 * in case a chmod has now revoked permission. 653 * 654 * Arguably we should also allow the owner override for 655 * directories, but we never have and it doesn't seem to have 656 * caused anyone a problem. If we were to change this, note 657 * also that our filldir callbacks would need a variant of 658 * lookup_one_len that doesn't check permissions. 
659 */ 660 if (type == S_IFREG) 661 may_flags |= NFSD_MAY_OWNER_OVERRIDE; 662 err = fh_verify(rqstp, fhp, type, may_flags); 663 if (err) 664 goto out; 665 666 path.mnt = fhp->fh_export->ex_path.mnt; 667 path.dentry = fhp->fh_dentry; 668 inode = d_inode(path.dentry); 669 670 /* Disallow write access to files with the append-only bit set 671 * or any access when mandatory locking enabled 672 */ 673 err = nfserr_perm; 674 if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) 675 goto out; 676 /* 677 * We must ignore files (but only files) which might have mandatory 678 * locks on them because there is no way to know if the accesser has 679 * the lock. 680 */ 681 if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) 682 goto out; 683 684 if (!inode->i_fop) 685 goto out; 686 687 host_err = nfsd_open_break_lease(inode, may_flags); 688 if (host_err) /* NOMEM or WOULDBLOCK */ 689 goto out_nfserr; 690 691 if (may_flags & NFSD_MAY_WRITE) { 692 if (may_flags & NFSD_MAY_READ) 693 flags = O_RDWR|O_LARGEFILE; 694 else 695 flags = O_WRONLY|O_LARGEFILE; 696 } 697 698 file = dentry_open(&path, flags, current_cred()); 699 if (IS_ERR(file)) { 700 host_err = PTR_ERR(file); 701 goto out_nfserr; 702 } 703 704 host_err = ima_file_check(file, may_flags, 0); 705 if (host_err) { 706 fput(file); 707 goto out_nfserr; 708 } 709 710 if (may_flags & NFSD_MAY_64BIT_COOKIE) 711 file->f_mode |= FMODE_64BITHASH; 712 else 713 file->f_mode |= FMODE_32BITHASH; 714 715 *filp = file; 716 out_nfserr: 717 err = nfserrno(host_err); 718 out: 719 validate_process_creds(); 720 return err; 721 } 722 723 struct raparms * 724 nfsd_init_raparms(struct file *file) 725 { 726 struct inode *inode = file_inode(file); 727 dev_t dev = inode->i_sb->s_dev; 728 ino_t ino = inode->i_ino; 729 struct raparms *ra, **rap, **frap = NULL; 730 int depth = 0; 731 unsigned int hash; 732 struct raparm_hbucket *rab; 733 734 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; 735 rab = &raparm_hash[hash]; 736 737 
spin_lock(&rab->pb_lock); 738 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { 739 if (ra->p_ino == ino && ra->p_dev == dev) 740 goto found; 741 depth++; 742 if (ra->p_count == 0) 743 frap = rap; 744 } 745 depth = nfsdstats.ra_size; 746 if (!frap) { 747 spin_unlock(&rab->pb_lock); 748 return NULL; 749 } 750 rap = frap; 751 ra = *frap; 752 ra->p_dev = dev; 753 ra->p_ino = ino; 754 ra->p_set = 0; 755 ra->p_hindex = hash; 756 found: 757 if (rap != &rab->pb_head) { 758 *rap = ra->p_next; 759 ra->p_next = rab->pb_head; 760 rab->pb_head = ra; 761 } 762 ra->p_count++; 763 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; 764 spin_unlock(&rab->pb_lock); 765 766 if (ra->p_set) 767 file->f_ra = ra->p_ra; 768 return ra; 769 } 770 771 void nfsd_put_raparams(struct file *file, struct raparms *ra) 772 { 773 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; 774 775 spin_lock(&rab->pb_lock); 776 ra->p_ra = file->f_ra; 777 ra->p_set = 1; 778 ra->p_count--; 779 spin_unlock(&rab->pb_lock); 780 } 781 782 /* 783 * Grab and keep cached pages associated with a file in the svc_rqst 784 * so that they can be passed to the network sendmsg/sendpage routines 785 * directly. They will be released after the sending has completed. 
786 */ 787 static int 788 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 789 struct splice_desc *sd) 790 { 791 struct svc_rqst *rqstp = sd->u.data; 792 struct page **pp = rqstp->rq_next_page; 793 struct page *page = buf->page; 794 size_t size; 795 796 size = sd->len; 797 798 if (rqstp->rq_res.page_len == 0) { 799 get_page(page); 800 put_page(*rqstp->rq_next_page); 801 *(rqstp->rq_next_page++) = page; 802 rqstp->rq_res.page_base = buf->offset; 803 rqstp->rq_res.page_len = size; 804 } else if (page != pp[-1]) { 805 get_page(page); 806 if (*rqstp->rq_next_page) 807 put_page(*rqstp->rq_next_page); 808 *(rqstp->rq_next_page++) = page; 809 rqstp->rq_res.page_len += size; 810 } else 811 rqstp->rq_res.page_len += size; 812 813 return size; 814 } 815 816 static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, 817 struct splice_desc *sd) 818 { 819 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 820 } 821 822 static __be32 823 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) 824 { 825 if (host_err >= 0) { 826 nfsdstats.io_read += host_err; 827 *count = host_err; 828 fsnotify_access(file); 829 return 0; 830 } else 831 return nfserrno(host_err); 832 } 833 834 __be32 nfsd_splice_read(struct svc_rqst *rqstp, 835 struct file *file, loff_t offset, unsigned long *count) 836 { 837 struct splice_desc sd = { 838 .len = 0, 839 .total_len = *count, 840 .pos = offset, 841 .u.data = rqstp, 842 }; 843 int host_err; 844 845 rqstp->rq_next_page = rqstp->rq_respages + 1; 846 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 847 return nfsd_finish_read(file, count, host_err); 848 } 849 850 __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, 851 unsigned long *count) 852 { 853 mm_segment_t oldfs; 854 int host_err; 855 856 oldfs = get_fs(); 857 set_fs(KERNEL_DS); 858 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 859 set_fs(oldfs); 860 return 
nfsd_finish_read(file, count, host_err); 861 } 862 863 static __be32 864 nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, 865 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 866 { 867 if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) 868 return nfsd_splice_read(rqstp, file, offset, count); 869 else 870 return nfsd_readv(file, offset, vec, vlen, count); 871 } 872 873 /* 874 * Gathered writes: If another process is currently writing to the file, 875 * there's a high chance this is another nfsd (triggered by a bulk write 876 * from a client's biod). Rather than syncing the file with each write 877 * request, we sleep for 10 msec. 878 * 879 * I don't know if this roughly approximates C. Juszak's idea of 880 * gathered writes, but it's a nice and simple solution (IMHO), and it 881 * seems to work:-) 882 * 883 * Note: we do this only in the NFSv2 case, since v3 and higher have a 884 * better tool (separate unstable writes and commits) for solving this 885 * problem. 
886 */ 887 static int wait_for_concurrent_writes(struct file *file) 888 { 889 struct inode *inode = file_inode(file); 890 static ino_t last_ino; 891 static dev_t last_dev; 892 int err = 0; 893 894 if (atomic_read(&inode->i_writecount) > 1 895 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { 896 dprintk("nfsd: write defer %d\n", task_pid_nr(current)); 897 msleep(10); 898 dprintk("nfsd: write resume %d\n", task_pid_nr(current)); 899 } 900 901 if (inode->i_state & I_DIRTY) { 902 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 903 err = vfs_fsync(file, 0); 904 } 905 last_ino = inode->i_ino; 906 last_dev = inode->i_sb->s_dev; 907 return err; 908 } 909 910 __be32 911 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 912 loff_t offset, struct kvec *vec, int vlen, 913 unsigned long *cnt, int *stablep) 914 { 915 struct svc_export *exp; 916 struct inode *inode; 917 mm_segment_t oldfs; 918 __be32 err = 0; 919 int host_err; 920 int stable = *stablep; 921 int use_wgather; 922 loff_t pos = offset; 923 loff_t end = LLONG_MAX; 924 unsigned int pflags = current->flags; 925 926 if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) 927 /* 928 * We want less throttling in balance_dirty_pages() 929 * and shrink_inactive_list() so that nfs to 930 * localhost doesn't cause nfsd to lock up due to all 931 * the client's dirty pages or its congested queue. 932 */ 933 current->flags |= PF_LESS_THROTTLE; 934 935 inode = file_inode(file); 936 exp = fhp->fh_export; 937 938 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); 939 940 if (!EX_ISSYNC(exp)) 941 stable = 0; 942 943 /* Write the data. 
*/ 944 oldfs = get_fs(); set_fs(KERNEL_DS); 945 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); 946 set_fs(oldfs); 947 if (host_err < 0) 948 goto out_nfserr; 949 *cnt = host_err; 950 nfsdstats.io_write += host_err; 951 fsnotify_modify(file); 952 953 if (stable) { 954 if (use_wgather) { 955 host_err = wait_for_concurrent_writes(file); 956 } else { 957 if (*cnt) 958 end = offset + *cnt - 1; 959 host_err = vfs_fsync_range(file, offset, end, 0); 960 } 961 } 962 963 out_nfserr: 964 dprintk("nfsd: write complete host_err=%d\n", host_err); 965 if (host_err >= 0) 966 err = 0; 967 else 968 err = nfserrno(host_err); 969 if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) 970 tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); 971 return err; 972 } 973 974 /* 975 * Read data from a file. count must contain the requested read count 976 * on entry. On return, *count contains the number of bytes actually read. 977 * N.B. After this call fhp needs an fh_put 978 */ 979 __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, 980 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 981 { 982 struct file *file; 983 struct raparms *ra; 984 __be32 err; 985 986 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); 987 if (err) 988 return err; 989 990 ra = nfsd_init_raparms(file); 991 err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); 992 if (ra) 993 nfsd_put_raparams(file, ra); 994 fput(file); 995 996 return err; 997 } 998 999 /* 1000 * Write data to a file. 1001 * The stable flag requests synchronous writes. 1002 * N.B. 
After this call fhp needs an fh_put 1003 */ 1004 __be32 1005 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1006 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, 1007 int *stablep) 1008 { 1009 __be32 err = 0; 1010 1011 if (file) { 1012 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, 1013 NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); 1014 if (err) 1015 goto out; 1016 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, 1017 stablep); 1018 } else { 1019 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1020 if (err) 1021 goto out; 1022 1023 if (cnt) 1024 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, 1025 cnt, stablep); 1026 fput(file); 1027 } 1028 out: 1029 return err; 1030 } 1031 1032 #ifdef CONFIG_NFSD_V3 1033 /* 1034 * Commit all pending writes to stable storage. 1035 * 1036 * Note: we only guarantee that data that lies within the range specified 1037 * by the 'offset' and 'count' parameters will be synced. 1038 * 1039 * Unfortunately we cannot lock the file to make sure we return full WCC 1040 * data to the client, as locking happens lower down in the filesystem. 
1041 */ 1042 __be32 1043 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, 1044 loff_t offset, unsigned long count) 1045 { 1046 struct file *file; 1047 loff_t end = LLONG_MAX; 1048 __be32 err = nfserr_inval; 1049 1050 if (offset < 0) 1051 goto out; 1052 if (count != 0) { 1053 end = offset + (loff_t)count - 1; 1054 if (end < offset) 1055 goto out; 1056 } 1057 1058 err = nfsd_open(rqstp, fhp, S_IFREG, 1059 NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); 1060 if (err) 1061 goto out; 1062 if (EX_ISSYNC(fhp->fh_export)) { 1063 int err2 = vfs_fsync_range(file, offset, end, 0); 1064 1065 if (err2 != -EINVAL) 1066 err = nfserrno(err2); 1067 else 1068 err = nfserr_notsupp; 1069 } 1070 1071 fput(file); 1072 out: 1073 return err; 1074 } 1075 #endif /* CONFIG_NFSD_V3 */ 1076 1077 static __be32 1078 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, 1079 struct iattr *iap) 1080 { 1081 /* 1082 * Mode has already been set earlier in create: 1083 */ 1084 iap->ia_valid &= ~ATTR_MODE; 1085 /* 1086 * Setting uid/gid works only for root. Irix appears to 1087 * send along the gid on create when it tries to implement 1088 * setgid directories via NFS: 1089 */ 1090 if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) 1091 iap->ia_valid &= ~(ATTR_UID|ATTR_GID); 1092 if (iap->ia_valid) 1093 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1094 /* Callers expect file metadata to be committed here */ 1095 return nfserrno(commit_metadata(resfhp)); 1096 } 1097 1098 /* HPUX client sometimes creates a file in mode 000, and sets size to 0. 1099 * setting size to 0 may fail for some specific file systems by the permission 1100 * checking which requires WRITE permission but the mode is 000. 1101 * we ignore the resizing(to 0) on the just new created file, since the size is 1102 * 0 after file created. 1103 * 1104 * call this only after vfs_create() is called. 
 */
static void
nfsd_check_ignore_resizing(struct iattr *iap)
{
	/* A request to set the size to 0 is dropped from the attribute set. */
	if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
		iap->ia_valid &= ~ATTR_SIZE;
}

/*
 * Create a file (regular, directory, device, fifo); UNIX sockets
 * not yet implemented.
 * If the response fh has been verified, the parent directory should
 * already be locked. Note that the parent directory is left locked.
 *
 * An empty name yields nfserr_perm, a name rejected by isdotent()
 * yields nfserr_exist, and a @type that is neither regular, directory
 * nor a special file yields nfserr_inval.  If the child dentry is
 * already positive the result is nfserr_exist.
 *
 * This function takes write access on @fhp (when it does the lookup
 * itself) and locks the parent, but releases neither; only the child
 * dentry reference is dropped on exit.
 *
 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
 */
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		int type, dev_t rdev, struct svc_fh *resfhp)
{
	struct dentry	*dentry, *dchild = NULL;
	struct inode	*dirp;
	__be32		err;
	__be32		err2;
	int		host_err;

	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;

	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = d_inode(dentry);

	err = nfserr_notdir;
	if (!dirp->i_op->lookup)
		goto out;
	/*
	 * Check whether the response file handle has been verified yet.
	 * If it has, the parent directory should already be locked.
	 */
	if (!resfhp->fh_dentry) {
		host_err = fh_want_write(fhp);
		if (host_err)
			goto out_nfserr;

		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
		fh_lock_nested(fhp, I_MUTEX_PARENT);
		dchild = lookup_one_len(fname, dentry, flen);
		/* host_err is only meaningful if dchild is an error pointer */
		host_err = PTR_ERR(dchild);
		if (IS_ERR(dchild))
			goto out_nfserr;
		err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
		if (err)
			goto out;
	} else {
		/* called from nfsd_proc_create */
		dchild = dget(resfhp->fh_dentry);
		if (!fhp->fh_locked) {
			/* not actually possible */
			printk(KERN_ERR
				"nfsd_create: parent %pd2 not locked!\n",
				dentry);
			err = nfserr_io;
			goto out;
		}
	}
	/*
	 * Make sure the child dentry is still negative ...
	 */
	err = nfserr_exist;
	if (d_really_is_positive(dchild)) {
		dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
			dentry, dchild);
		goto out;
	}

	/* Without ATTR_MODE start from mode 0; keep only permission bits. */
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;

	err = nfserr_inval;
	if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
		       type);
		goto out;
	}

	/*
	 * Dispatch to the VFS creation helper that matches the requested
	 * type.
	 */
	err = 0;
	host_err = 0;
	switch (type) {
	case S_IFREG:
		host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
		/* a "set size to 0" on the brand new file is redundant */
		if (!host_err)
			nfsd_check_ignore_resizing(iap);
		break;
	case S_IFDIR:
		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
		break;
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
		break;
	}
	if (host_err < 0)
		goto out_nfserr;

	err = nfsd_create_setattr(rqstp, resfhp, iap);

	/*
	 * nfsd_create_setattr already committed the child.  Transactional
	 * filesystems had a chance to commit changes for both parent and
	 * child simultaneously, making the following commit_metadata a
	 * noop.
	 */
	err2 = nfserrno(commit_metadata(fhp));
	/* a failure to commit the parent overrides the setattr status */
	if (err2)
		err = err2;
	/*
	 * Update the file handle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);
out:
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}

#ifdef CONFIG_NFSD_V3

/*
 * NFSv3 and NFSv4 version of nfsd_create
 *
 * Looks up @fname under @fhp with the parent locked and composes
 * @resfhp for it.  If the name does not exist yet, a regular file is
 * created with vfs_create().  If it already exists, @createmode decides:
 *
 *   NFS3_CREATE_UNCHECKED    - success; a non-regular file is left
 *                              untouched.  For a regular file either
 *                              *@truncp is set (when @truncp is
 *                              non-NULL) or only the size attribute
 *                              from @iap is applied.
 *   NFS3_CREATE_EXCLUSIVE    - success, without applying attributes, if
 *                              the stored mtime/atime seconds equal the
 *                              masked verifier and the size is 0;
 *                              otherwise nfserr_exist.
 *   NFS4_CREATE_EXCLUSIVE4_1 - same test, but on a match the attributes
 *                              in @iap are applied; otherwise
 *                              nfserr_exist.
 *   NFS3_CREATE_GUARDED      - nfserr_exist.
 *
 * *@created (when @created is non-NULL) is set if a file was created or
 * an exclusive verifier matched.  The parent is unlocked and write
 * access dropped before returning.
 */
__be32
do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		struct svc_fh *resfhp, int createmode, u32 *verifier,
	        bool *truncp, bool *created)
{
	struct dentry	*dentry, *dchild = NULL;
	struct inode	*dirp;
	__be32		err;
	int		host_err;
	__u32		v_mtime=0, v_atime=0;

	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = d_inode(dentry);

	/* Get all the sanity checks out of the way before
	 * we lock the parent. */
	err = nfserr_notdir;
	if (!dirp->i_op->lookup)
		goto out;

	host_err = fh_want_write(fhp);
	if (host_err)
		goto out_nfserr;

	fh_lock_nested(fhp, I_MUTEX_PARENT);

	/*
	 * Compose the response file handle.
	 */
	dchild = lookup_one_len(fname, dentry, flen);
	/* host_err is only meaningful if dchild is an error pointer */
	host_err = PTR_ERR(dchild);
	if (IS_ERR(dchild))
		goto out_nfserr;

	/* If file doesn't exist, check for permissions to create one */
	if (d_really_is_negative(dchild)) {
		err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
		if (err)
			goto out;
	}

	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
	if (err)
		goto out;

	if (nfsd_create_is_exclusive(createmode)) {
		/* solaris7 gets confused (bugid 4218508) if these have
		 * the high bit set, so just clear the high bits. If this is
		 * ever changed to use different attrs for storing the
		 * verifier, then do_open_lookup() will also need to be fixed
		 * accordingly.
		 */
		v_mtime = verifier[0]&0x7fffffff;
		v_atime = verifier[1]&0x7fffffff;
	}

	if (d_really_is_positive(dchild)) {
		/* The name already exists: nothing is created below. */
		err = 0;

		switch (createmode) {
		case NFS3_CREATE_UNCHECKED:
			if (! d_is_reg(dchild))
				goto out;
			else if (truncp) {
				/* in nfsv4, we need to treat this case a little
				 * differently.  we don't want to truncate the
				 * file now; this would be wrong if the OPEN
				 * fails for some other reason.  furthermore,
				 * if the size is nonzero, we should ignore it
				 * according to spec!
				 */
				*truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
			}
			else {
				/* keep only the size attribute, then apply it */
				iap->ia_valid &= ATTR_SIZE;
				goto set_attr;
			}
			break;
		case NFS3_CREATE_EXCLUSIVE:
			if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
			    && d_inode(dchild)->i_atime.tv_sec == v_atime
			    && d_inode(dchild)->i_size  == 0 ) {
				if (created)
					*created = 1;
				break;
			}
			/*
			 * fallthru: verifier mismatch.  The same comparison
			 * is repeated in the next case before control
			 * reaches nfserr_exist.
			 */
		case NFS4_CREATE_EXCLUSIVE4_1:
			if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
			    && d_inode(dchild)->i_atime.tv_sec == v_atime
			    && d_inode(dchild)->i_size  == 0 ) {
				if (created)
					*created = 1;
				goto set_attr;
			}
			 /* fallthru */
		case NFS3_CREATE_GUARDED:
			err = nfserr_exist;
		}
		fh_drop_write(fhp);
		goto out;
	}

	host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
	if (host_err < 0) {
		fh_drop_write(fhp);
		goto out_nfserr;
	}
	if (created)
		*created = 1;

	nfsd_check_ignore_resizing(iap);

	if (nfsd_create_is_exclusive(createmode)) {
		/* Cram the verifier into atime/mtime */
		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
			| ATTR_MTIME_SET|ATTR_ATIME_SET;
		/* XXX someone who knows this better please fix it for nsec */
		iap->ia_mtime.tv_sec = v_mtime;
		iap->ia_atime.tv_sec = v_atime;
		iap->ia_mtime.tv_nsec = 0;
		iap->ia_atime.tv_nsec = 0;
	}

 set_attr:
	err = nfsd_create_setattr(rqstp, resfhp, iap);

	/*
	 * nfsd_create_setattr already committed the child
	 * (and possibly also the parent).
	 */
	if (!err)
		err = nfserrno(commit_metadata(fhp));

	/*
	 * Update the filehandle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);

 out:
	/* common exit: unlock parent, drop child ref and write access */
	fh_unlock(fhp);
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
	fh_drop_write(fhp);
 	return err;

 out_nfserr:
	err = nfserrno(host_err);
	goto out;
}
#endif /* CONFIG_NFSD_V3 */

/*
 * Read a symlink. On entry, *lenp must contain the maximum path length that
 * fits into the buffer. On return, it contains the true length.
 * Returns nfserr_inval if the inode has no ->readlink operation.
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
	struct inode	*inode;
	mm_segment_t	oldfs;
	__be32		err;
	int		host_err;
	struct path path;

	err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
	if (err)
		goto out;

	path.mnt = fhp->fh_export->ex_path.mnt;
	path.dentry = fhp->fh_dentry;
	inode = d_inode(path.dentry);

	err = nfserr_inval;
	if (!inode->i_op->readlink)
		goto out;

	touch_atime(&path);
	/* N.B. Why does this call need a get_fs()??
	 * Remove the set_fs and watch the fireworks:-) --okir
	 */

	/* buf is a kernel buffer handed to an interface typed __user */
	oldfs = get_fs(); set_fs(KERNEL_DS);
	host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
	set_fs(oldfs);

	if (host_err < 0)
		goto out_nfserr;
	/* a non-negative return is the length of the link target */
	*lenp = host_err;
	err = 0;
out:
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}

/*
 * Create a symlink and look up its inode
 * N.B.
After this call _both_ fhp and resfhp need an fh_put 1470 */ 1471 __be32 1472 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1473 char *fname, int flen, 1474 char *path, 1475 struct svc_fh *resfhp) 1476 { 1477 struct dentry *dentry, *dnew; 1478 __be32 err, cerr; 1479 int host_err; 1480 1481 err = nfserr_noent; 1482 if (!flen || path[0] == '\0') 1483 goto out; 1484 err = nfserr_exist; 1485 if (isdotent(fname, flen)) 1486 goto out; 1487 1488 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); 1489 if (err) 1490 goto out; 1491 1492 host_err = fh_want_write(fhp); 1493 if (host_err) 1494 goto out_nfserr; 1495 1496 fh_lock(fhp); 1497 dentry = fhp->fh_dentry; 1498 dnew = lookup_one_len(fname, dentry, flen); 1499 host_err = PTR_ERR(dnew); 1500 if (IS_ERR(dnew)) 1501 goto out_nfserr; 1502 1503 host_err = vfs_symlink(d_inode(dentry), dnew, path); 1504 err = nfserrno(host_err); 1505 if (!err) 1506 err = nfserrno(commit_metadata(fhp)); 1507 fh_unlock(fhp); 1508 1509 fh_drop_write(fhp); 1510 1511 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1512 dput(dnew); 1513 if (err==0) err = cerr; 1514 out: 1515 return err; 1516 1517 out_nfserr: 1518 err = nfserrno(host_err); 1519 goto out; 1520 } 1521 1522 /* 1523 * Create a hardlink 1524 * N.B. 
After this call _both_ ffhp and tfhp need an fh_put 1525 */ 1526 __be32 1527 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, 1528 char *name, int len, struct svc_fh *tfhp) 1529 { 1530 struct dentry *ddir, *dnew, *dold; 1531 struct inode *dirp; 1532 __be32 err; 1533 int host_err; 1534 1535 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); 1536 if (err) 1537 goto out; 1538 err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); 1539 if (err) 1540 goto out; 1541 err = nfserr_isdir; 1542 if (d_is_dir(tfhp->fh_dentry)) 1543 goto out; 1544 err = nfserr_perm; 1545 if (!len) 1546 goto out; 1547 err = nfserr_exist; 1548 if (isdotent(name, len)) 1549 goto out; 1550 1551 host_err = fh_want_write(tfhp); 1552 if (host_err) { 1553 err = nfserrno(host_err); 1554 goto out; 1555 } 1556 1557 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1558 ddir = ffhp->fh_dentry; 1559 dirp = d_inode(ddir); 1560 1561 dnew = lookup_one_len(name, ddir, len); 1562 host_err = PTR_ERR(dnew); 1563 if (IS_ERR(dnew)) 1564 goto out_nfserr; 1565 1566 dold = tfhp->fh_dentry; 1567 1568 err = nfserr_noent; 1569 if (d_really_is_negative(dold)) 1570 goto out_dput; 1571 host_err = vfs_link(dold, dirp, dnew, NULL); 1572 if (!host_err) { 1573 err = nfserrno(commit_metadata(ffhp)); 1574 if (!err) 1575 err = nfserrno(commit_metadata(tfhp)); 1576 } else { 1577 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1578 err = nfserr_acces; 1579 else 1580 err = nfserrno(host_err); 1581 } 1582 out_dput: 1583 dput(dnew); 1584 out_unlock: 1585 fh_unlock(ffhp); 1586 fh_drop_write(tfhp); 1587 out: 1588 return err; 1589 1590 out_nfserr: 1591 err = nfserrno(host_err); 1592 goto out_unlock; 1593 } 1594 1595 /* 1596 * Rename a file 1597 * N.B. 
After this call _both_ ffhp and tfhp need an fh_put 1598 */ 1599 __be32 1600 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, 1601 struct svc_fh *tfhp, char *tname, int tlen) 1602 { 1603 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; 1604 struct inode *fdir, *tdir; 1605 __be32 err; 1606 int host_err; 1607 1608 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); 1609 if (err) 1610 goto out; 1611 err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); 1612 if (err) 1613 goto out; 1614 1615 fdentry = ffhp->fh_dentry; 1616 fdir = d_inode(fdentry); 1617 1618 tdentry = tfhp->fh_dentry; 1619 tdir = d_inode(tdentry); 1620 1621 err = nfserr_perm; 1622 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1623 goto out; 1624 1625 host_err = fh_want_write(ffhp); 1626 if (host_err) { 1627 err = nfserrno(host_err); 1628 goto out; 1629 } 1630 1631 /* cannot use fh_lock as we need deadlock protective ordering 1632 * so do it by hand */ 1633 trap = lock_rename(tdentry, fdentry); 1634 ffhp->fh_locked = tfhp->fh_locked = 1; 1635 fill_pre_wcc(ffhp); 1636 fill_pre_wcc(tfhp); 1637 1638 odentry = lookup_one_len(fname, fdentry, flen); 1639 host_err = PTR_ERR(odentry); 1640 if (IS_ERR(odentry)) 1641 goto out_nfserr; 1642 1643 host_err = -ENOENT; 1644 if (d_really_is_negative(odentry)) 1645 goto out_dput_old; 1646 host_err = -EINVAL; 1647 if (odentry == trap) 1648 goto out_dput_old; 1649 1650 ndentry = lookup_one_len(tname, tdentry, tlen); 1651 host_err = PTR_ERR(ndentry); 1652 if (IS_ERR(ndentry)) 1653 goto out_dput_old; 1654 host_err = -ENOTEMPTY; 1655 if (ndentry == trap) 1656 goto out_dput_new; 1657 1658 host_err = -EXDEV; 1659 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1660 goto out_dput_new; 1661 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1662 goto out_dput_new; 1663 1664 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); 1665 if (!host_err) { 1666 host_err 
= commit_metadata(tfhp); 1667 if (!host_err) 1668 host_err = commit_metadata(ffhp); 1669 } 1670 out_dput_new: 1671 dput(ndentry); 1672 out_dput_old: 1673 dput(odentry); 1674 out_nfserr: 1675 err = nfserrno(host_err); 1676 /* 1677 * We cannot rely on fh_unlock on the two filehandles, 1678 * as that would do the wrong thing if the two directories 1679 * were the same, so again we do it by hand. 1680 */ 1681 fill_post_wcc(ffhp); 1682 fill_post_wcc(tfhp); 1683 unlock_rename(tdentry, fdentry); 1684 ffhp->fh_locked = tfhp->fh_locked = 0; 1685 fh_drop_write(ffhp); 1686 1687 out: 1688 return err; 1689 } 1690 1691 /* 1692 * Unlink a file or directory 1693 * N.B. After this call fhp needs an fh_put 1694 */ 1695 __be32 1696 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 1697 char *fname, int flen) 1698 { 1699 struct dentry *dentry, *rdentry; 1700 struct inode *dirp; 1701 __be32 err; 1702 int host_err; 1703 1704 err = nfserr_acces; 1705 if (!flen || isdotent(fname, flen)) 1706 goto out; 1707 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); 1708 if (err) 1709 goto out; 1710 1711 host_err = fh_want_write(fhp); 1712 if (host_err) 1713 goto out_nfserr; 1714 1715 fh_lock_nested(fhp, I_MUTEX_PARENT); 1716 dentry = fhp->fh_dentry; 1717 dirp = d_inode(dentry); 1718 1719 rdentry = lookup_one_len(fname, dentry, flen); 1720 host_err = PTR_ERR(rdentry); 1721 if (IS_ERR(rdentry)) 1722 goto out_nfserr; 1723 1724 if (d_really_is_negative(rdentry)) { 1725 dput(rdentry); 1726 err = nfserr_noent; 1727 goto out; 1728 } 1729 1730 if (!type) 1731 type = d_inode(rdentry)->i_mode & S_IFMT; 1732 1733 if (type != S_IFDIR) 1734 host_err = vfs_unlink(dirp, rdentry, NULL); 1735 else 1736 host_err = vfs_rmdir(dirp, rdentry); 1737 if (!host_err) 1738 host_err = commit_metadata(fhp); 1739 dput(rdentry); 1740 1741 out_nfserr: 1742 err = nfserrno(host_err); 1743 out: 1744 return err; 1745 } 1746 1747 /* 1748 * We do this buffering because we must not call back into the file 1749 * 
system's ->lookup() method from the filldir callback. That may well 1750 * deadlock a number of file systems. 1751 * 1752 * This is based heavily on the implementation of same in XFS. 1753 */ 1754 struct buffered_dirent { 1755 u64 ino; 1756 loff_t offset; 1757 int namlen; 1758 unsigned int d_type; 1759 char name[]; 1760 }; 1761 1762 struct readdir_data { 1763 struct dir_context ctx; 1764 char *dirent; 1765 size_t used; 1766 int full; 1767 }; 1768 1769 static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, 1770 int namlen, loff_t offset, u64 ino, 1771 unsigned int d_type) 1772 { 1773 struct readdir_data *buf = 1774 container_of(ctx, struct readdir_data, ctx); 1775 struct buffered_dirent *de = (void *)(buf->dirent + buf->used); 1776 unsigned int reclen; 1777 1778 reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); 1779 if (buf->used + reclen > PAGE_SIZE) { 1780 buf->full = 1; 1781 return -EINVAL; 1782 } 1783 1784 de->namlen = namlen; 1785 de->offset = offset; 1786 de->ino = ino; 1787 de->d_type = d_type; 1788 memcpy(de->name, name, namlen); 1789 buf->used += reclen; 1790 1791 return 0; 1792 } 1793 1794 static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, 1795 struct readdir_cd *cdp, loff_t *offsetp) 1796 { 1797 struct buffered_dirent *de; 1798 int host_err; 1799 int size; 1800 loff_t offset; 1801 struct readdir_data buf = { 1802 .ctx.actor = nfsd_buffered_filldir, 1803 .dirent = (void *)__get_free_page(GFP_KERNEL) 1804 }; 1805 1806 if (!buf.dirent) 1807 return nfserrno(-ENOMEM); 1808 1809 offset = *offsetp; 1810 1811 while (1) { 1812 struct inode *dir_inode = file_inode(file); 1813 unsigned int reclen; 1814 1815 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1816 buf.used = 0; 1817 buf.full = 0; 1818 1819 host_err = iterate_dir(file, &buf.ctx); 1820 if (buf.full) 1821 host_err = 0; 1822 1823 if (host_err < 0) 1824 break; 1825 1826 size = buf.used; 1827 1828 if (!size) 1829 break; 1830 1831 
/* 1832 * Various filldir functions may end up calling back into 1833 * lookup_one_len() and the file system's ->lookup() method. 1834 * These expect i_mutex to be held, as it would within readdir. 1835 */ 1836 host_err = mutex_lock_killable(&dir_inode->i_mutex); 1837 if (host_err) 1838 break; 1839 1840 de = (struct buffered_dirent *)buf.dirent; 1841 while (size > 0) { 1842 offset = de->offset; 1843 1844 if (func(cdp, de->name, de->namlen, de->offset, 1845 de->ino, de->d_type)) 1846 break; 1847 1848 if (cdp->err != nfs_ok) 1849 break; 1850 1851 reclen = ALIGN(sizeof(*de) + de->namlen, 1852 sizeof(u64)); 1853 size -= reclen; 1854 de = (struct buffered_dirent *)((char *)de + reclen); 1855 } 1856 mutex_unlock(&dir_inode->i_mutex); 1857 if (size > 0) /* We bailed out early */ 1858 break; 1859 1860 offset = vfs_llseek(file, 0, SEEK_CUR); 1861 } 1862 1863 free_page((unsigned long)(buf.dirent)); 1864 1865 if (host_err) 1866 return nfserrno(host_err); 1867 1868 *offsetp = offset; 1869 return cdp->err; 1870 } 1871 1872 /* 1873 * Read entries from a directory. 1874 * The NFSv3/4 verifier we ignore for now. 
1875 */ 1876 __be32 1877 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 1878 struct readdir_cd *cdp, nfsd_filldir_t func) 1879 { 1880 __be32 err; 1881 struct file *file; 1882 loff_t offset = *offsetp; 1883 int may_flags = NFSD_MAY_READ; 1884 1885 /* NFSv2 only supports 32 bit cookies */ 1886 if (rqstp->rq_vers > 2) 1887 may_flags |= NFSD_MAY_64BIT_COOKIE; 1888 1889 err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); 1890 if (err) 1891 goto out; 1892 1893 offset = vfs_llseek(file, offset, SEEK_SET); 1894 if (offset < 0) { 1895 err = nfserrno((int)offset); 1896 goto out_close; 1897 } 1898 1899 err = nfsd_buffered_readdir(file, func, cdp, offsetp); 1900 1901 if (err == nfserr_eof || err == nfserr_toosmall) 1902 err = nfs_ok; /* can still be found in ->err */ 1903 out_close: 1904 fput(file); 1905 out: 1906 return err; 1907 } 1908 1909 /* 1910 * Get file system stats 1911 * N.B. After this call fhp needs an fh_put 1912 */ 1913 __be32 1914 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) 1915 { 1916 __be32 err; 1917 1918 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); 1919 if (!err) { 1920 struct path path = { 1921 .mnt = fhp->fh_export->ex_path.mnt, 1922 .dentry = fhp->fh_dentry, 1923 }; 1924 if (vfs_statfs(&path, stat)) 1925 err = nfserr_io; 1926 } 1927 return err; 1928 } 1929 1930 static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) 1931 { 1932 return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; 1933 } 1934 1935 /* 1936 * Check for a user's access permissions to this inode. 1937 */ 1938 __be32 1939 nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, 1940 struct dentry *dentry, int acc) 1941 { 1942 struct inode *inode = d_inode(dentry); 1943 int err; 1944 1945 if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) 1946 return 0; 1947 #if 0 1948 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", 1949 acc, 1950 (acc & NFSD_MAY_READ)? 
" read" : "", 1951 (acc & NFSD_MAY_WRITE)? " write" : "", 1952 (acc & NFSD_MAY_EXEC)? " exec" : "", 1953 (acc & NFSD_MAY_SATTR)? " sattr" : "", 1954 (acc & NFSD_MAY_TRUNC)? " trunc" : "", 1955 (acc & NFSD_MAY_LOCK)? " lock" : "", 1956 (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", 1957 inode->i_mode, 1958 IS_IMMUTABLE(inode)? " immut" : "", 1959 IS_APPEND(inode)? " append" : "", 1960 __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); 1961 dprintk(" owner %d/%d user %d/%d\n", 1962 inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid()); 1963 #endif 1964 1965 /* Normally we reject any write/sattr etc access on a read-only file 1966 * system. But if it is IRIX doing check on write-access for a 1967 * device special file, we ignore rofs. 1968 */ 1969 if (!(acc & NFSD_MAY_LOCAL_ACCESS)) 1970 if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { 1971 if (exp_rdonly(rqstp, exp) || 1972 __mnt_is_readonly(exp->ex_path.mnt)) 1973 return nfserr_rofs; 1974 if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1975 return nfserr_perm; 1976 } 1977 if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) 1978 return nfserr_perm; 1979 1980 if (acc & NFSD_MAY_LOCK) { 1981 /* If we cannot rely on authentication in NLM requests, 1982 * just allow locks, otherwise require read permission, or 1983 * ownership 1984 */ 1985 if (exp->ex_flags & NFSEXP_NOAUTHNLM) 1986 return 0; 1987 else 1988 acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; 1989 } 1990 /* 1991 * The file owner always gets access permission for accesses that 1992 * would normally be checked at open time. This is to make 1993 * file access work even when the client has done a fchmod(fd, 0). 1994 * 1995 * However, `cp foo bar' should fail nevertheless when bar is 1996 * readonly. A sensible way to do this might be to reject all 1997 * attempts to truncate a read-only file, because a creat() call 1998 * always implies file truncation. 1999 * ... but this isn't really fair. 
A process may reasonably call 2000 * ftruncate on an open file descriptor on a file with perm 000. 2001 * We must trust the client to do permission checking - using "ACCESS" 2002 * with NFSv3. 2003 */ 2004 if ((acc & NFSD_MAY_OWNER_OVERRIDE) && 2005 uid_eq(inode->i_uid, current_fsuid())) 2006 return 0; 2007 2008 /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ 2009 err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); 2010 2011 /* Allow read access to binaries even when mode 111 */ 2012 if (err == -EACCES && S_ISREG(inode->i_mode) && 2013 (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || 2014 acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) 2015 err = inode_permission(inode, MAY_EXEC); 2016 2017 return err? nfserrno(err) : 0; 2018 } 2019 2020 void 2021 nfsd_racache_shutdown(void) 2022 { 2023 struct raparms *raparm, *last_raparm; 2024 unsigned int i; 2025 2026 dprintk("nfsd: freeing readahead buffers.\n"); 2027 2028 for (i = 0; i < RAPARM_HASH_SIZE; i++) { 2029 raparm = raparm_hash[i].pb_head; 2030 while(raparm) { 2031 last_raparm = raparm; 2032 raparm = raparm->p_next; 2033 kfree(last_raparm); 2034 } 2035 raparm_hash[i].pb_head = NULL; 2036 } 2037 } 2038 /* 2039 * Initialize readahead param cache 2040 */ 2041 int 2042 nfsd_racache_init(int cache_size) 2043 { 2044 int i; 2045 int j = 0; 2046 int nperbucket; 2047 struct raparms **raparm = NULL; 2048 2049 2050 if (raparm_hash[0].pb_head) 2051 return 0; 2052 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); 2053 nperbucket = max(2, nperbucket); 2054 cache_size = nperbucket * RAPARM_HASH_SIZE; 2055 2056 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 2057 2058 for (i = 0; i < RAPARM_HASH_SIZE; i++) { 2059 spin_lock_init(&raparm_hash[i].pb_lock); 2060 2061 raparm = &raparm_hash[i].pb_head; 2062 for (j = 0; j < nperbucket; j++) { 2063 *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); 2064 if (!*raparm) 2065 goto out_nomem; 2066 raparm = 
&(*raparm)->p_next; 2067 } 2068 *raparm = NULL; 2069 } 2070 2071 nfsdstats.ra_size = cache_size; 2072 return 0; 2073 2074 out_nomem: 2075 dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); 2076 nfsd_racache_shutdown(); 2077 return -ENOMEM; 2078 } 2079