1 #include "ceph_debug.h" 2 3 #include <linux/spinlock.h> 4 #include <linux/fs_struct.h> 5 #include <linux/namei.h> 6 #include <linux/slab.h> 7 #include <linux/sched.h> 8 9 #include "super.h" 10 11 /* 12 * Directory operations: readdir, lookup, create, link, unlink, 13 * rename, etc. 14 */ 15 16 /* 17 * Ceph MDS operations are specified in terms of a base ino and 18 * relative path. Thus, the client can specify an operation on a 19 * specific inode (e.g., a getattr due to fstat(2)), or as a path 20 * relative to, say, the root directory. 21 * 22 * Normally, we limit ourselves to strict inode ops (no path component) 23 * or dentry operations (a single path component relative to an ino). The 24 * exception to this is open_root_dentry(), which will open the mount 25 * point by name. 26 */ 27 28 const struct inode_operations ceph_dir_iops; 29 const struct file_operations ceph_dir_fops; 30 struct dentry_operations ceph_dentry_ops; 31 32 /* 33 * Initialize ceph dentry state. 34 */ 35 int ceph_init_dentry(struct dentry *dentry) 36 { 37 struct ceph_dentry_info *di; 38 39 if (dentry->d_fsdata) 40 return 0; 41 42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 43 dentry->d_op = &ceph_dentry_ops; 44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 45 dentry->d_op = &ceph_snapdir_dentry_ops; 46 else 47 dentry->d_op = &ceph_snap_dentry_ops; 48 49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 50 if (!di) 51 return -ENOMEM; /* oh well */ 52 53 spin_lock(&dentry->d_lock); 54 if (dentry->d_fsdata) { 55 /* lost a race */ 56 kmem_cache_free(ceph_dentry_cachep, di); 57 goto out_unlock; 58 } 59 di->dentry = dentry; 60 di->lease_session = NULL; 61 dentry->d_fsdata = di; 62 dentry->d_time = jiffies; 63 ceph_dentry_lru_add(dentry); 64 out_unlock: 65 spin_unlock(&dentry->d_lock); 66 return 0; 67 } 68 69 70 71 /* 72 * for readdir, we encode the directory frag and offset within that 73 * frag into f_pos. 74 */ 75 static unsigned fpos_frag(loff_t p) 76 { 77 return p >> 32; 78 } 79 static unsigned fpos_off(loff_t p) 80 { 81 return p & 0xffffffff; 82 } 83 84 /* 85 * When possible, we try to satisfy a readdir by peeking at the 86 * dcache. We make this work by carefully ordering dentries on 87 * d_u.d_child when we initially get results back from the MDS, and 88 * falling back to a "normal" sync readdir if any dentries in the dir 89 * are dropped. 90 * 91 * I_COMPLETE tells indicates we have all dentries in the dir. It is 92 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 93 * the MDS if/when the directory is modified). 94 */ 95 static int __dcache_readdir(struct file *filp, 96 void *dirent, filldir_t filldir) 97 { 98 struct inode *inode = filp->f_dentry->d_inode; 99 struct ceph_file_info *fi = filp->private_data; 100 struct dentry *parent = filp->f_dentry; 101 struct inode *dir = parent->d_inode; 102 struct list_head *p; 103 struct dentry *dentry, *last; 104 struct ceph_dentry_info *di; 105 int err = 0; 106 107 /* claim ref on last dentry we returned */ 108 last = fi->dentry; 109 fi->dentry = NULL; 110 111 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 112 last); 113 114 spin_lock(&dcache_lock); 115 116 /* start at beginning? */ 117 if (filp->f_pos == 2 || (last && 118 filp->f_pos < ceph_dentry(last)->offset)) { 119 if (list_empty(&parent->d_subdirs)) 120 goto out_unlock; 121 p = parent->d_subdirs.prev; 122 dout(" initial p %p/%p\n", p->prev, p->next); 123 } else { 124 p = last->d_u.d_child.prev; 125 } 126 127 more: 128 dentry = list_entry(p, struct dentry, d_u.d_child); 129 di = ceph_dentry(dentry); 130 while (1) { 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, 132 d_unhashed(dentry) ? "!hashed" : "hashed", 133 parent->d_subdirs.prev, parent->d_subdirs.next); 134 if (p == &parent->d_subdirs) { 135 fi->at_end = 1; 136 goto out_unlock; 137 } 138 if (!d_unhashed(dentry) && dentry->d_inode && 139 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 140 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 141 filp->f_pos <= di->offset) 142 break; 143 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 144 dentry->d_name.len, dentry->d_name.name, di->offset, 145 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 146 !dentry->d_inode ? " null" : ""); 147 p = p->prev; 148 dentry = list_entry(p, struct dentry, d_u.d_child); 149 di = ceph_dentry(dentry); 150 } 151 152 atomic_inc(&dentry->d_count); 153 spin_unlock(&dcache_lock); 154 spin_unlock(&inode->i_lock); 155 156 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 157 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 158 filp->f_pos = di->offset; 159 err = filldir(dirent, dentry->d_name.name, 160 dentry->d_name.len, di->offset, 161 dentry->d_inode->i_ino, 162 dentry->d_inode->i_mode >> 12); 163 164 if (last) { 165 if (err < 0) { 166 /* remember our position */ 167 fi->dentry = last; 168 fi->next_offset = di->offset; 169 } else { 170 dput(last); 171 } 172 last = NULL; 173 } 174 175 spin_lock(&inode->i_lock); 176 spin_lock(&dcache_lock); 177 178 last = dentry; 179 180 if (err < 0) 181 goto out_unlock; 182 183 p = p->prev; 184 filp->f_pos++; 185 186 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 187 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 188 goto more; 189 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 190 err = -EAGAIN; 191 192 out_unlock: 193 spin_unlock(&dcache_lock); 194 195 if (last) { 196 spin_unlock(&inode->i_lock); 197 dput(last); 198 spin_lock(&inode->i_lock); 199 } 200 201 return err; 202 } 203 204 /* 205 * make note of the last dentry we read, so we can 206 * continue at the same lexicographical point, 207 * regardless of what dir changes take place on the 208 * server. 209 */ 210 static int note_last_dentry(struct ceph_file_info *fi, const char *name, 211 int len) 212 { 213 kfree(fi->last_name); 214 fi->last_name = kmalloc(len+1, GFP_NOFS); 215 if (!fi->last_name) 216 return -ENOMEM; 217 memcpy(fi->last_name, name, len); 218 fi->last_name[len] = 0; 219 dout("note_last_dentry '%s'\n", fi->last_name); 220 return 0; 221 } 222 223 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 224 { 225 struct ceph_file_info *fi = filp->private_data; 226 struct inode *inode = filp->f_dentry->d_inode; 227 struct ceph_inode_info *ci = ceph_inode(inode); 228 struct ceph_client *client = ceph_inode_to_client(inode); 229 struct ceph_mds_client *mdsc = &client->mdsc; 230 unsigned frag = fpos_frag(filp->f_pos); 231 int off = fpos_off(filp->f_pos); 232 int err; 233 u32 ftype; 234 struct ceph_mds_reply_info_parsed *rinfo; 235 const int max_entries = client->mount_args->max_readdir; 236 const int max_bytes = client->mount_args->max_readdir_bytes; 237 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 239 if (fi->at_end) 240 return 0; 241 242 /* always start with . and .. */ 243 if (filp->f_pos == 0) { 244 /* note dir version at start of readdir so we can tell 245 * if any dentries get dropped */ 246 fi->dir_release_count = ci->i_release_count; 247 248 dout("readdir off 0 -> '.'\n"); 249 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 250 inode->i_ino, inode->i_mode >> 12) < 0) 251 return 0; 252 filp->f_pos = 1; 253 off = 1; 254 } 255 if (filp->f_pos == 1) { 256 dout("readdir off 1 -> '..'\n"); 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 258 filp->f_dentry->d_parent->d_inode->i_ino, 259 inode->i_mode >> 12) < 0) 260 return 0; 261 filp->f_pos = 2; 262 off = 2; 263 } 264 265 /* can we use the dcache? */ 266 spin_lock(&inode->i_lock); 267 if ((filp->f_pos == 2 || fi->dentry) && 268 !ceph_test_opt(client, NOASYNCREADDIR) && 269 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 270 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 271 err = __dcache_readdir(filp, dirent, filldir); 272 if (err != -EAGAIN) { 273 spin_unlock(&inode->i_lock); 274 return err; 275 } 276 } 277 spin_unlock(&inode->i_lock); 278 if (fi->dentry) { 279 err = note_last_dentry(fi, fi->dentry->d_name.name, 280 fi->dentry->d_name.len); 281 if (err) 282 return err; 283 dput(fi->dentry); 284 fi->dentry = NULL; 285 } 286 287 /* proceed with a normal readdir */ 288 289 more: 290 /* do we have the correct frag content buffered? */ 291 if (fi->frag != frag || fi->last_readdir == NULL) { 292 struct ceph_mds_request *req; 293 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 294 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 295 296 /* discard old result, if any */ 297 if (fi->last_readdir) { 298 ceph_mdsc_put_request(fi->last_readdir); 299 fi->last_readdir = NULL; 300 } 301 302 /* requery frag tree, as the frag topology may have changed */ 303 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 304 305 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 306 ceph_vinop(inode), frag, fi->last_name); 307 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 308 if (IS_ERR(req)) 309 return PTR_ERR(req); 310 req->r_inode = igrab(inode); 311 req->r_dentry = dget(filp->f_dentry); 312 /* hints to request -> mds selection code */ 313 req->r_direct_mode = USE_AUTH_MDS; 314 req->r_direct_hash = ceph_frag_value(frag); 315 req->r_direct_is_hash = true; 316 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 317 req->r_readdir_offset = fi->next_offset; 318 req->r_args.readdir.frag = cpu_to_le32(frag); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 321 req->r_num_caps = max_entries + 1; 322 err = ceph_mdsc_do_request(mdsc, NULL, req); 323 if (err < 0) { 324 ceph_mdsc_put_request(req); 325 return err; 326 } 327 dout("readdir got and parsed readdir result=%d" 328 " on frag %x, end=%d, complete=%d\n", err, frag, 329 (int)req->r_reply_info.dir_end, 330 (int)req->r_reply_info.dir_complete); 331 332 if (!req->r_did_prepopulate) { 333 dout("readdir !did_prepopulate"); 334 fi->dir_release_count--; /* preclude I_COMPLETE */ 335 } 336 337 /* note next offset and last dentry name */ 338 fi->offset = fi->next_offset; 339 fi->last_readdir = req; 340 341 if (req->r_reply_info.dir_end) { 342 kfree(fi->last_name); 343 fi->last_name = NULL; 344 fi->next_offset = 2; 345 } else { 346 rinfo = &req->r_reply_info; 347 err = note_last_dentry(fi, 348 rinfo->dir_dname[rinfo->dir_nr-1], 349 rinfo->dir_dname_len[rinfo->dir_nr-1]); 350 if (err) 351 return err; 352 fi->next_offset += rinfo->dir_nr; 353 } 354 } 355 356 rinfo = &fi->last_readdir->r_reply_info; 357 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 358 rinfo->dir_nr, off, fi->offset); 359 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { 360 u64 pos = ceph_make_fpos(frag, off); 361 struct ceph_mds_reply_inode *in = 362 rinfo->dir_in[off - fi->offset].in; 363 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 364 off, off - fi->offset, rinfo->dir_nr, pos, 365 rinfo->dir_dname_len[off - fi->offset], 366 rinfo->dir_dname[off - fi->offset], in); 367 BUG_ON(!in); 368 ftype = le32_to_cpu(in->mode) >> 12; 369 if (filldir(dirent, 370 rinfo->dir_dname[off - fi->offset], 371 rinfo->dir_dname_len[off - fi->offset], 372 pos, 373 le64_to_cpu(in->ino), 374 ftype) < 0) { 375 dout("filldir stopping us...\n"); 376 return 0; 377 } 378 off++; 379 filp->f_pos = pos + 1; 380 } 381 382 if (fi->last_name) { 383 ceph_mdsc_put_request(fi->last_readdir); 384 fi->last_readdir = NULL; 385 goto more; 386 } 387 388 /* more frags? */ 389 if (!ceph_frag_is_rightmost(frag)) { 390 frag = ceph_frag_next(frag); 391 off = 0; 392 filp->f_pos = ceph_make_fpos(frag, off); 393 dout("readdir next frag is %x\n", frag); 394 goto more; 395 } 396 fi->at_end = 1; 397 398 /* 399 * if dir_release_count still matches the dir, no dentries 400 * were released during the whole readdir, and we should have 401 * the complete dir contents in our cache. 402 */ 403 spin_lock(&inode->i_lock); 404 if (ci->i_release_count == fi->dir_release_count) { 405 dout(" marking %p complete\n", inode); 406 ci->i_ceph_flags |= CEPH_I_COMPLETE; 407 ci->i_max_offset = filp->f_pos; 408 } 409 spin_unlock(&inode->i_lock); 410 411 dout("readdir %p filp %p done.\n", inode, filp); 412 return 0; 413 } 414 415 static void reset_readdir(struct ceph_file_info *fi) 416 { 417 if (fi->last_readdir) { 418 ceph_mdsc_put_request(fi->last_readdir); 419 fi->last_readdir = NULL; 420 } 421 kfree(fi->last_name); 422 fi->next_offset = 2; /* compensate for . and .. */ 423 if (fi->dentry) { 424 dput(fi->dentry); 425 fi->dentry = NULL; 426 } 427 fi->at_end = 0; 428 } 429 430 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 431 { 432 struct ceph_file_info *fi = file->private_data; 433 struct inode *inode = file->f_mapping->host; 434 loff_t old_offset = offset; 435 loff_t retval; 436 437 mutex_lock(&inode->i_mutex); 438 switch (origin) { 439 case SEEK_END: 440 offset += inode->i_size + 2; /* FIXME */ 441 break; 442 case SEEK_CUR: 443 offset += file->f_pos; 444 } 445 retval = -EINVAL; 446 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 447 if (offset != file->f_pos) { 448 file->f_pos = offset; 449 file->f_version = 0; 450 fi->at_end = 0; 451 } 452 retval = offset; 453 454 /* 455 * discard buffered readdir content on seekdir(0), or 456 * seek to new frag, or seek prior to current chunk. 457 */ 458 if (offset == 0 || 459 fpos_frag(offset) != fpos_frag(old_offset) || 460 fpos_off(offset) < fi->offset) { 461 dout("dir_llseek dropping %p content\n", file); 462 reset_readdir(fi); 463 } 464 465 /* bump dir_release_count if we did a forward seek */ 466 if (offset > old_offset) 467 fi->dir_release_count--; 468 } 469 mutex_unlock(&inode->i_mutex); 470 return retval; 471 } 472 473 /* 474 * Process result of a lookup/open request. 475 * 476 * Mainly, make sure we return the final req->r_dentry (if it already 477 * existed) in place of the original VFS-provided dentry when they 478 * differ. 479 * 480 * Gracefully handle the case where the MDS replies with -ENOENT and 481 * no trace (which it may do, at its discretion, e.g., if it doesn't 482 * care to issue a lease on the negative dentry). 483 */ 484 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 485 struct dentry *dentry, int err) 486 { 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 488 struct inode *parent = dentry->d_parent->d_inode; 489 490 /* .snap dir? */ 491 if (err == -ENOENT && 492 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ 493 strcmp(dentry->d_name.name, 494 client->mount_args->snapdir_name) == 0) { 495 struct inode *inode = ceph_get_snapdir(parent); 496 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 497 dentry, dentry->d_name.len, dentry->d_name.name, inode); 498 BUG_ON(!d_unhashed(dentry)); 499 d_add(dentry, inode); 500 err = 0; 501 } 502 503 if (err == -ENOENT) { 504 /* no trace? */ 505 err = 0; 506 if (!req->r_reply_info.head->is_dentry) { 507 dout("ENOENT and no trace, dentry %p inode %p\n", 508 dentry, dentry->d_inode); 509 if (dentry->d_inode) { 510 d_drop(dentry); 511 err = -ENOENT; 512 } else { 513 d_add(dentry, NULL); 514 } 515 } 516 } 517 if (err) 518 dentry = ERR_PTR(err); 519 else if (dentry != req->r_dentry) 520 dentry = dget(req->r_dentry); /* we got spliced */ 521 else 522 dentry = NULL; 523 return dentry; 524 } 525 526 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 527 { 528 return ceph_ino(inode) == CEPH_INO_ROOT && 529 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 530 } 531 532 /* 533 * Look up a single dir entry. If there is a lookup intent, inform 534 * the MDS so that it gets our 'caps wanted' value in a single op. 535 */ 536 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 537 struct nameidata *nd) 538 { 539 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 540 struct ceph_mds_client *mdsc = &client->mdsc; 541 struct ceph_mds_request *req; 542 int op; 543 int err; 544 545 dout("lookup %p dentry %p '%.*s'\n", 546 dir, dentry, dentry->d_name.len, dentry->d_name.name); 547 548 if (dentry->d_name.len > NAME_MAX) 549 return ERR_PTR(-ENAMETOOLONG); 550 551 err = ceph_init_dentry(dentry); 552 if (err < 0) 553 return ERR_PTR(err); 554 555 /* open (but not create!) intent? */ 556 if (nd && 557 (nd->flags & LOOKUP_OPEN) && 558 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ 559 !(nd->intent.open.flags & O_CREAT)) { 560 int mode = nd->intent.open.create_mode & ~current->fs->umask; 561 return ceph_lookup_open(dir, dentry, nd, mode, 1); 562 } 563 564 /* can we conclude ENOENT locally? */ 565 if (dentry->d_inode == NULL) { 566 struct ceph_inode_info *ci = ceph_inode(dir); 567 struct ceph_dentry_info *di = ceph_dentry(dentry); 568 569 spin_lock(&dir->i_lock); 570 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 571 if (strncmp(dentry->d_name.name, 572 client->mount_args->snapdir_name, 573 dentry->d_name.len) && 574 !is_root_ceph_dentry(dir, dentry) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 577 spin_unlock(&dir->i_lock); 578 dout(" dir %p complete, -ENOENT\n", dir); 579 d_add(dentry, NULL); 580 di->lease_shared_gen = ci->i_shared_gen; 581 return NULL; 582 } 583 spin_unlock(&dir->i_lock); 584 } 585 586 op = ceph_snap(dir) == CEPH_SNAPDIR ? 587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 589 if (IS_ERR(req)) 590 return ERR_CAST(req); 591 req->r_dentry = dget(dentry); 592 req->r_num_caps = 2; 593 /* we only need inode linkage */ 594 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 595 req->r_locked_dir = dir; 596 err = ceph_mdsc_do_request(mdsc, NULL, req); 597 dentry = ceph_finish_lookup(req, dentry, err); 598 ceph_mdsc_put_request(req); /* will dput(dentry) */ 599 dout("lookup result=%p\n", dentry); 600 return dentry; 601 } 602 603 /* 604 * If we do a create but get no trace back from the MDS, follow up with 605 * a lookup (the VFS expects us to link up the provided dentry). 606 */ 607 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) 608 { 609 struct dentry *result = ceph_lookup(dir, dentry, NULL); 610 611 if (result && !IS_ERR(result)) { 612 /* 613 * We created the item, then did a lookup, and found 614 * it was already linked to another inode we already 615 * had in our cache (and thus got spliced). Link our 616 * dentry to that inode, but don't hash it, just in 617 * case the VFS wants to dereference it. 618 */ 619 BUG_ON(!result->d_inode); 620 d_instantiate(dentry, result->d_inode); 621 return 0; 622 } 623 return PTR_ERR(result); 624 } 625 626 static int ceph_mknod(struct inode *dir, struct dentry *dentry, 627 int mode, dev_t rdev) 628 { 629 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 630 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_request *req; 632 int err; 633 634 if (ceph_snap(dir) != CEPH_NOSNAP) 635 return -EROFS; 636 637 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", 638 dir, dentry, mode, rdev); 639 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 640 if (IS_ERR(req)) { 641 d_drop(dentry); 642 return PTR_ERR(req); 643 } 644 req->r_dentry = dget(dentry); 645 req->r_num_caps = 2; 646 req->r_locked_dir = dir; 647 req->r_args.mknod.mode = cpu_to_le32(mode); 648 req->r_args.mknod.rdev = cpu_to_le32(rdev); 649 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 650 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 651 err = ceph_mdsc_do_request(mdsc, dir, req); 652 if (!err && !req->r_reply_info.head->is_dentry) 653 err = ceph_handle_notrace_create(dir, dentry); 654 ceph_mdsc_put_request(req); 655 if (err) 656 d_drop(dentry); 657 return err; 658 } 659 660 static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, 661 struct nameidata *nd) 662 { 663 dout("create in dir %p dentry %p name '%.*s'\n", 664 dir, dentry, dentry->d_name.len, dentry->d_name.name); 665 666 if (ceph_snap(dir) != CEPH_NOSNAP) 667 return -EROFS; 668 669 if (nd) { 670 BUG_ON((nd->flags & LOOKUP_OPEN) == 0); 671 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); 672 /* hrm, what should i do here if we get aliased? */ 673 if (IS_ERR(dentry)) 674 return PTR_ERR(dentry); 675 return 0; 676 } 677 678 /* fall back to mknod */ 679 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); 680 } 681 682 static int ceph_symlink(struct inode *dir, struct dentry *dentry, 683 const char *dest) 684 { 685 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 686 struct ceph_mds_client *mdsc = &client->mdsc; 687 struct ceph_mds_request *req; 688 int err; 689 690 if (ceph_snap(dir) != CEPH_NOSNAP) 691 return -EROFS; 692 693 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 694 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 695 if (IS_ERR(req)) { 696 d_drop(dentry); 697 return PTR_ERR(req); 698 } 699 req->r_dentry = dget(dentry); 700 req->r_num_caps = 2; 701 req->r_path2 = kstrdup(dest, GFP_NOFS); 702 req->r_locked_dir = dir; 703 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 704 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 705 err = ceph_mdsc_do_request(mdsc, dir, req); 706 if (!err && !req->r_reply_info.head->is_dentry) 707 err = ceph_handle_notrace_create(dir, dentry); 708 ceph_mdsc_put_request(req); 709 if (err) 710 d_drop(dentry); 711 return err; 712 } 713 714 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 715 { 716 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 717 struct ceph_mds_client *mdsc = &client->mdsc; 718 struct ceph_mds_request *req; 719 int err = -EROFS; 720 int op; 721 722 if (ceph_snap(dir) == CEPH_SNAPDIR) { 723 /* mkdir .snap/foo is a MKSNAP */ 724 op = CEPH_MDS_OP_MKSNAP; 725 dout("mksnap dir %p snap '%.*s' dn %p\n", dir, 726 dentry->d_name.len, dentry->d_name.name, dentry); 727 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 728 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); 729 op = CEPH_MDS_OP_MKDIR; 730 } else { 731 goto out; 732 } 733 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 734 if (IS_ERR(req)) { 735 err = PTR_ERR(req); 736 goto out; 737 } 738 739 req->r_dentry = dget(dentry); 740 req->r_num_caps = 2; 741 req->r_locked_dir = dir; 742 req->r_args.mkdir.mode = cpu_to_le32(mode); 743 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 744 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 745 err = ceph_mdsc_do_request(mdsc, dir, req); 746 if (!err && !req->r_reply_info.head->is_dentry) 747 err = ceph_handle_notrace_create(dir, dentry); 748 ceph_mdsc_put_request(req); 749 out: 750 if (err < 0) 751 d_drop(dentry); 752 return err; 753 } 754 755 static int ceph_link(struct dentry *old_dentry, struct inode *dir, 756 struct dentry *dentry) 757 { 758 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 759 struct ceph_mds_client *mdsc = &client->mdsc; 760 struct ceph_mds_request *req; 761 int err; 762 763 if (ceph_snap(dir) != CEPH_NOSNAP) 764 return -EROFS; 765 766 dout("link in dir %p old_dentry %p dentry %p\n", dir, 767 old_dentry, dentry); 768 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); 769 if (IS_ERR(req)) { 770 d_drop(dentry); 771 return PTR_ERR(req); 772 } 773 req->r_dentry = dget(dentry); 774 req->r_num_caps = 2; 775 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 776 req->r_locked_dir = dir; 777 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 778 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 779 err = ceph_mdsc_do_request(mdsc, dir, req); 780 if (err) 781 d_drop(dentry); 782 else if (!req->r_reply_info.head->is_dentry) 783 d_instantiate(dentry, igrab(old_dentry->d_inode)); 784 ceph_mdsc_put_request(req); 785 return err; 786 } 787 788 /* 789 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it 790 * looks like the link count will hit 0, drop any other caps (other 791 * than PIN) we don't specifically want (due to the file still being 792 * open). 793 */ 794 static int drop_caps_for_unlink(struct inode *inode) 795 { 796 struct ceph_inode_info *ci = ceph_inode(inode); 797 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 798 799 spin_lock(&inode->i_lock); 800 if (inode->i_nlink == 1) { 801 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 802 ci->i_ceph_flags |= CEPH_I_NODELAY; 803 } 804 spin_unlock(&inode->i_lock); 805 return drop; 806 } 807 808 /* 809 * rmdir and unlink are differ only by the metadata op code 810 */ 811 static int ceph_unlink(struct inode *dir, struct dentry *dentry) 812 { 813 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 814 struct ceph_mds_client *mdsc = &client->mdsc; 815 struct inode *inode = dentry->d_inode; 816 struct ceph_mds_request *req; 817 int err = -EROFS; 818 int op; 819 820 if (ceph_snap(dir) == CEPH_SNAPDIR) { 821 /* rmdir .snap/foo is RMSNAP */ 822 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, 823 dentry->d_name.name, dentry); 824 op = CEPH_MDS_OP_RMSNAP; 825 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 826 dout("unlink/rmdir dir %p dn %p inode %p\n", 827 dir, dentry, inode); 828 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? 829 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; 830 } else 831 goto out; 832 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 833 if (IS_ERR(req)) { 834 err = PTR_ERR(req); 835 goto out; 836 } 837 req->r_dentry = dget(dentry); 838 req->r_num_caps = 2; 839 req->r_locked_dir = dir; 840 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 841 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 842 req->r_inode_drop = drop_caps_for_unlink(inode); 843 err = ceph_mdsc_do_request(mdsc, dir, req); 844 if (!err && !req->r_reply_info.head->is_dentry) 845 d_delete(dentry); 846 ceph_mdsc_put_request(req); 847 out: 848 return err; 849 } 850 851 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 852 struct inode *new_dir, struct dentry *new_dentry) 853 { 854 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 855 struct ceph_mds_client *mdsc = &client->mdsc; 856 struct ceph_mds_request *req; 857 int err; 858 859 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 860 return -EXDEV; 861 if (ceph_snap(old_dir) != CEPH_NOSNAP || 862 ceph_snap(new_dir) != CEPH_NOSNAP) 863 return -EROFS; 864 dout("rename dir %p dentry %p to dir %p dentry %p\n", 865 old_dir, old_dentry, new_dir, new_dentry); 866 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 867 if (IS_ERR(req)) 868 return PTR_ERR(req); 869 req->r_dentry = dget(new_dentry); 870 req->r_num_caps = 2; 871 req->r_old_dentry = dget(old_dentry); 872 req->r_locked_dir = new_dir; 873 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 874 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 875 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 876 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 877 /* release LINK_RDCACHE on source inode (mds will lock it) */ 878 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; 879 if (new_dentry->d_inode) 880 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); 881 err = ceph_mdsc_do_request(mdsc, old_dir, req); 882 if (!err && !req->r_reply_info.head->is_dentry) { 883 /* 884 * Normally d_move() is done by fill_trace (called by 885 * do_request, above). If there is no trace, we need 886 * to do it here. 887 */ 888 889 /* d_move screws up d_subdirs order */ 890 ceph_i_clear(new_dir, CEPH_I_COMPLETE); 891 892 d_move(old_dentry, new_dentry); 893 894 /* ensure target dentry is invalidated, despite 895 rehashing bug in vfs_rename_dir */ 896 ceph_invalidate_dentry_lease(new_dentry); 897 } 898 ceph_mdsc_put_request(req); 899 return err; 900 } 901 902 /* 903 * Ensure a dentry lease will no longer revalidate. 904 */ 905 void ceph_invalidate_dentry_lease(struct dentry *dentry) 906 { 907 spin_lock(&dentry->d_lock); 908 dentry->d_time = jiffies; 909 ceph_dentry(dentry)->lease_shared_gen = 0; 910 spin_unlock(&dentry->d_lock); 911 } 912 913 /* 914 * Check if dentry lease is valid. If not, delete the lease. Try to 915 * renew if the least is more than half up. 916 */ 917 static int dentry_lease_is_valid(struct dentry *dentry) 918 { 919 struct ceph_dentry_info *di; 920 struct ceph_mds_session *s; 921 int valid = 0; 922 u32 gen; 923 unsigned long ttl; 924 struct ceph_mds_session *session = NULL; 925 struct inode *dir = NULL; 926 u32 seq = 0; 927 928 spin_lock(&dentry->d_lock); 929 di = ceph_dentry(dentry); 930 if (di && di->lease_session) { 931 s = di->lease_session; 932 spin_lock(&s->s_cap_lock); 933 gen = s->s_cap_gen; 934 ttl = s->s_cap_ttl; 935 spin_unlock(&s->s_cap_lock); 936 937 if (di->lease_gen == gen && 938 time_before(jiffies, dentry->d_time) && 939 time_before(jiffies, ttl)) { 940 valid = 1; 941 if (di->lease_renew_after && 942 time_after(jiffies, di->lease_renew_after)) { 943 /* we should renew */ 944 dir = dentry->d_parent->d_inode; 945 session = ceph_get_mds_session(s); 946 seq = di->lease_seq; 947 di->lease_renew_after = 0; 948 di->lease_renew_from = jiffies; 949 } 950 } 951 } 952 spin_unlock(&dentry->d_lock); 953 954 if (session) { 955 ceph_mdsc_lease_send_msg(session, dir, dentry, 956 CEPH_MDS_LEASE_RENEW, seq); 957 ceph_put_mds_session(session); 958 } 959 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); 960 return valid; 961 } 962 963 /* 964 * Check if directory-wide content lease/cap is valid. 965 */ 966 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 967 { 968 struct ceph_inode_info *ci = ceph_inode(dir); 969 struct ceph_dentry_info *di = ceph_dentry(dentry); 970 int valid = 0; 971 972 spin_lock(&dir->i_lock); 973 if (ci->i_shared_gen == di->lease_shared_gen) 974 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 975 spin_unlock(&dir->i_lock); 976 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 977 dir, (unsigned)ci->i_shared_gen, dentry, 978 (unsigned)di->lease_shared_gen, valid); 979 return valid; 980 } 981 982 /* 983 * Check if cached dentry can be trusted. 984 */ 985 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) 986 { 987 struct inode *dir = dentry->d_parent->d_inode; 988 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 991 ceph_dentry(dentry)->offset); 992 993 /* always trust cached snapped dentries, snapdir dentry */ 994 if (ceph_snap(dir) != CEPH_NOSNAP) { 995 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, 996 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 997 goto out_touch; 998 } 999 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) 1000 goto out_touch; 1001 1002 if (dentry_lease_is_valid(dentry) || 1003 dir_lease_is_valid(dir, dentry)) 1004 goto out_touch; 1005 1006 dout("d_revalidate %p invalid\n", dentry); 1007 d_drop(dentry); 1008 return 0; 1009 out_touch: 1010 ceph_dentry_lru_touch(dentry); 1011 return 1; 1012 } 1013 1014 /* 1015 * When a dentry is released, clear the dir I_COMPLETE if it was part 1016 * of the current dir gen. 1017 */ 1018 static void ceph_dentry_release(struct dentry *dentry) 1019 { 1020 struct ceph_dentry_info *di = ceph_dentry(dentry); 1021 struct inode *parent_inode = dentry->d_parent->d_inode; 1022 1023 if (parent_inode) { 1024 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1025 1026 spin_lock(&parent_inode->i_lock); 1027 if (ci->i_shared_gen == di->lease_shared_gen) { 1028 dout(" clearing %p complete (d_release)\n", 1029 parent_inode); 1030 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 1031 ci->i_release_count++; 1032 } 1033 spin_unlock(&parent_inode->i_lock); 1034 } 1035 if (di) { 1036 ceph_dentry_lru_del(dentry); 1037 if (di->lease_session) 1038 ceph_put_mds_session(di->lease_session); 1039 kmem_cache_free(ceph_dentry_cachep, di); 1040 dentry->d_fsdata = NULL; 1041 } 1042 } 1043 1044 static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1045 struct nameidata *nd) 1046 { 1047 /* 1048 * Eventually, we'll want to revalidate snapped metadata 1049 * too... probably... 1050 */ 1051 return 1; 1052 } 1053 1054 1055 1056 /* 1057 * read() on a dir. This weird interface hack only works if mounted 1058 * with '-o dirstat'. 1059 */ 1060 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1061 loff_t *ppos) 1062 { 1063 struct ceph_file_info *cf = file->private_data; 1064 struct inode *inode = file->f_dentry->d_inode; 1065 struct ceph_inode_info *ci = ceph_inode(inode); 1066 int left; 1067 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1069 return -EISDIR; 1070 1071 if (!cf->dir_info) { 1072 cf->dir_info = kmalloc(1024, GFP_NOFS); 1073 if (!cf->dir_info) 1074 return -ENOMEM; 1075 cf->dir_info_len = 1076 sprintf(cf->dir_info, 1077 "entries: %20lld\n" 1078 " files: %20lld\n" 1079 " subdirs: %20lld\n" 1080 "rentries: %20lld\n" 1081 " rfiles: %20lld\n" 1082 " rsubdirs: %20lld\n" 1083 "rbytes: %20lld\n" 1084 "rctime: %10ld.%09ld\n", 1085 ci->i_files + ci->i_subdirs, 1086 ci->i_files, 1087 ci->i_subdirs, 1088 ci->i_rfiles + ci->i_rsubdirs, 1089 ci->i_rfiles, 1090 ci->i_rsubdirs, 1091 ci->i_rbytes, 1092 (long)ci->i_rctime.tv_sec, 1093 (long)ci->i_rctime.tv_nsec); 1094 } 1095 1096 if (*ppos >= cf->dir_info_len) 1097 return 0; 1098 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1099 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1100 if (left == size) 1101 return -EFAULT; 1102 *ppos += (size - left); 1103 return size - left; 1104 } 1105 1106 /* 1107 * an fsync() on a dir will wait for any uncommitted directory 1108 * operations to commit. 1109 */ 1110 static int ceph_dir_fsync(struct file *file, int datasync) 1111 { 1112 struct inode *inode = file->f_path.dentry->d_inode; 1113 struct ceph_inode_info *ci = ceph_inode(inode); 1114 struct list_head *head = &ci->i_unsafe_dirops; 1115 struct ceph_mds_request *req; 1116 u64 last_tid; 1117 int ret = 0; 1118 1119 dout("dir_fsync %p\n", inode); 1120 spin_lock(&ci->i_unsafe_lock); 1121 if (list_empty(head)) 1122 goto out; 1123 1124 req = list_entry(head->prev, 1125 struct ceph_mds_request, r_unsafe_dir_item); 1126 last_tid = req->r_tid; 1127 1128 do { 1129 ceph_mdsc_get_request(req); 1130 spin_unlock(&ci->i_unsafe_lock); 1131 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1132 inode, req->r_tid, last_tid); 1133 if (req->r_timeout) { 1134 ret = wait_for_completion_timeout( 1135 &req->r_safe_completion, req->r_timeout); 1136 if (ret > 0) 1137 ret = 0; 1138 else if (ret == 0) 1139 ret = -EIO; /* timed out */ 1140 } else { 1141 wait_for_completion(&req->r_safe_completion); 1142 } 1143 spin_lock(&ci->i_unsafe_lock); 1144 ceph_mdsc_put_request(req); 1145 1146 if (ret || list_empty(head)) 1147 break; 1148 req = list_entry(head->next, 1149 struct ceph_mds_request, r_unsafe_dir_item); 1150 } while (req->r_tid < last_tid); 1151 out: 1152 spin_unlock(&ci->i_unsafe_lock); 1153 return ret; 1154 } 1155 1156 /* 1157 * We maintain a private dentry LRU. 1158 * 1159 * FIXME: this needs to be changed to a per-mds lru to be useful. 1160 */ 1161 void ceph_dentry_lru_add(struct dentry *dn) 1162 { 1163 struct ceph_dentry_info *di = ceph_dentry(dn); 1164 struct ceph_mds_client *mdsc; 1165 1166 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1167 dn->d_name.len, dn->d_name.name); 1168 if (di) { 1169 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1170 spin_lock(&mdsc->dentry_lru_lock); 1171 list_add_tail(&di->lru, &mdsc->dentry_lru); 1172 mdsc->num_dentry++; 1173 spin_unlock(&mdsc->dentry_lru_lock); 1174 } 1175 } 1176 1177 void ceph_dentry_lru_touch(struct dentry *dn) 1178 { 1179 struct ceph_dentry_info *di = ceph_dentry(dn); 1180 struct ceph_mds_client *mdsc; 1181 1182 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1183 dn->d_name.len, dn->d_name.name, di->offset); 1184 if (di) { 1185 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1186 spin_lock(&mdsc->dentry_lru_lock); 1187 list_move_tail(&di->lru, &mdsc->dentry_lru); 1188 spin_unlock(&mdsc->dentry_lru_lock); 1189 } 1190 } 1191 1192 void ceph_dentry_lru_del(struct dentry *dn) 1193 { 1194 struct ceph_dentry_info *di = ceph_dentry(dn); 1195 struct ceph_mds_client *mdsc; 1196 1197 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1198 dn->d_name.len, dn->d_name.name); 1199 if (di) { 1200 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1201 spin_lock(&mdsc->dentry_lru_lock); 1202 list_del_init(&di->lru); 1203 mdsc->num_dentry--; 1204 spin_unlock(&mdsc->dentry_lru_lock); 1205 } 1206 } 1207 1208 const struct file_operations ceph_dir_fops = { 1209 .read = ceph_read_dir, 1210 .readdir = ceph_readdir, 1211 .llseek = ceph_dir_llseek, 1212 .open = ceph_open, 1213 .release = ceph_release, 1214 .unlocked_ioctl = ceph_ioctl, 1215 .fsync = ceph_dir_fsync, 1216 }; 1217 1218 const struct inode_operations ceph_dir_iops = { 1219 .lookup = ceph_lookup, 1220 .permission = ceph_permission, 1221 .getattr = ceph_getattr, 1222 .setattr = ceph_setattr, 1223 .setxattr = ceph_setxattr, 1224 .getxattr = ceph_getxattr, 1225 .listxattr = ceph_listxattr, 1226 .removexattr = ceph_removexattr, 1227 .mknod = ceph_mknod, 1228 .symlink = ceph_symlink, 1229 .mkdir = ceph_mkdir, 1230 .link = ceph_link, 1231 .unlink = ceph_unlink, 1232 .rmdir = ceph_unlink, 1233 .rename = ceph_rename, 1234 .create = ceph_create, 1235 }; 1236 1237 struct dentry_operations ceph_dentry_ops = { 1238 .d_revalidate = ceph_d_revalidate, 1239 .d_release = ceph_dentry_release, 1240 }; 1241 1242 struct dentry_operations ceph_snapdir_dentry_ops = { 1243 .d_revalidate = ceph_snapdir_d_revalidate, 1244 }; 1245 1246 struct dentry_operations ceph_snap_dentry_ops = { 1247 }; 1248