#include <linux/ceph/ceph_debug.h>

#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/xattr.h>

#include "super.h"
#include "mds_client.h"

/*
 * Directory operations: readdir, lookup, create, link, unlink,
 * rename, etc.
 */

/*
 * Ceph MDS operations are specified in terms of a base ino and
 * relative path.  Thus, the client can specify an operation on a
 * specific inode (e.g., a getattr due to fstat(2)), or as a path
 * relative to, say, the root directory.
 *
 * Normally, we limit ourselves to strict inode ops (no path component)
 * or dentry operations (a single path component relative to an ino).  The
 * exception to this is open_root_dentry(), which will open the mount
 * point by name.
 */

const struct dentry_operations ceph_dentry_ops;

/*
 * Initialize ceph dentry state.
 */
int ceph_init_dentry(struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	if (dentry->d_fsdata)
		return 0;

	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
	if (!di)
		return -ENOMEM;          /* oh well */

	spin_lock(&dentry->d_lock);
	if (dentry->d_fsdata) {
		/* lost a race */
		kmem_cache_free(ceph_dentry_cachep, di);
		goto out_unlock;
	}

	if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP)
		d_set_d_op(dentry, &ceph_dentry_ops);
	else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
	else
		d_set_d_op(dentry, &ceph_snap_dentry_ops);

	di->dentry = dentry;
	di->lease_session = NULL;
	di->time = jiffies;
	/* avoid reordering d_fsdata setup so that the check above is safe */
	smp_mb();
	dentry->d_fsdata = di;
	ceph_dentry_lru_add(dentry);
out_unlock:
	spin_unlock(&dentry->d_lock);
	return 0;
}

/*
 * for f_pos for readdir:
 * - hash order:
 *	(0xff << 52) | ((24 bits hash) << 28) |
 *	(the nth entry has hash collision);
 * - frag+name order:
 *	((frag value) << 28) | (the nth entry in frag);
 */
#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
	loff_t fpos = ((loff_t)high << OFFSET_BITS) | (loff_t)off;
	if (hash_order)
		fpos |= HASH_ORDER;
	return fpos;
}

static bool is_hash_order(loff_t p)
{
	return (p & HASH_ORDER) == HASH_ORDER;
}

static unsigned fpos_frag(loff_t p)
{
	return p >> OFFSET_BITS;
}

static unsigned fpos_hash(loff_t p)
{
	return ceph_frag_value(fpos_frag(p));
}

static unsigned fpos_off(loff_t p)
{
	return p & OFFSET_MASK;
}

static int fpos_cmp(loff_t l, loff_t r)
{
	int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
	if (v)
		return v;
	return (int)(fpos_off(l) - fpos_off(r));
}
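/*
 * Illustrative example (editor's note, not in the original source):
 * decoding the two f_pos layouts above.  In frag+name order, the frag
 * occupies bits 28..59 and the in-frag index bits 0..27; e.g. for
 * frag 0x01800000 (1 split bit, value 0x800000) and the 5th entry:
 *
 *	fpos = ((loff_t)0x01800000 << 28) | 5 = 0x0018000000000005
 *
 * In hash order the top byte is forced to 0xff; since a valid frag
 * stores its split-bit count (at most 24) in that byte, is_hash_order()
 * can distinguish the two layouts unambiguously.
 */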
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
			    int len, unsigned next_offset)
{
	char *buf = kmalloc(len+1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	kfree(fi->last_name);
	fi->last_name = buf;
	memcpy(fi->last_name, name, len);
	fi->last_name[len] = 0;
	fi->next_offset = next_offset;
	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}


static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	if (ptr_pos >= i_size_read(dir))
		return NULL;

	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		   i_mutex, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_mutex. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}
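/*
 * Illustrative note (editor's note, not in the original source): the
 * readdir cache consulted above is an array of dentry pointers stored
 * in the page cache of the directory inode itself, with i_size
 * tracking the number of cached slots.  With 4K pages and 8-byte
 * pointers each page holds 512 slots, so e.g. idx 1000 gives
 * ptr_pos = 8000, page 1 (8000 >> 12), slot 488 (1000 & 511).
 */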
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * Complete dir indicates that we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 */
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
			    u32 shared_gen)
{
	struct ceph_file_info *fi = file->private_data;
	struct dentry *parent = file->f_path.dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *dentry, *last = NULL;
	struct ceph_dentry_info *di;
	struct ceph_readdir_cache_control cache_ctl = {};
	u64 idx = 0;
	int err = 0;

	dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);

	/* search start position */
	if (ctx->pos > 2) {
		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
		while (count > 0) {
			u64 step = count >> 1;
			dentry = __dcache_find_get_entry(parent, idx + step,
							 &cache_ctl);
			if (!dentry) {
				/* use linear search */
				idx = 0;
				break;
			}
			if (IS_ERR(dentry)) {
				err = PTR_ERR(dentry);
				goto out;
			}
			di = ceph_dentry(dentry);
			spin_lock(&dentry->d_lock);
			if (fpos_cmp(di->offset, ctx->pos) < 0) {
				idx += step + 1;
				count -= step + 1;
			} else {
				count = step;
			}
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}

		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
	}


	for (;;) {
		bool emit_dentry = false;
		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
		if (!dentry) {
			fi->flags |= CEPH_F_ATEND;
			err = 0;
			break;
		}
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out;
		}

		di = ceph_dentry(dentry);
		spin_lock(&dentry->d_lock);
		if (di->lease_shared_gen == shared_gen &&
		    d_really_is_positive(dentry) &&
		    fpos_cmp(ctx->pos, di->offset) <= 0) {
			emit_dentry = true;
		}
		spin_unlock(&dentry->d_lock);

		if (emit_dentry) {
			dout(" %llx dentry %p %pd %p\n", di->offset,
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len,
				      ceph_translate_ino(dentry->d_sb,
							 d_inode(dentry)->i_ino),
				      d_inode(dentry)->i_mode >> 12)) {
				dput(dentry);
				err = 0;
				break;
			}
			ctx->pos++;

			if (last)
				dput(last);
			last = dentry;
		} else {
			dput(dentry);
		}
	}
out:
	ceph_readdir_cache_release(&cache_ctl);
	if (last) {
		int ret;
		di = ceph_dentry(last);
		ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
				       fpos_off(di->offset) + 1);
		if (ret < 0)
			err = ret;
		dput(last);
	}
	return err;
}

static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
{
	if (!fi->last_readdir)
		return true;
	if (is_hash_order(pos))
		return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
	else
		return fi->frag != fpos_frag(pos);
}
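/*
 * Illustrative note (editor's note, not in the original source):
 * need_send_readdir() decides whether the buffered MDS reply still
 * covers ctx->pos.  For example, with a buffered reply for frag
 * 0x01000000 (bits=1, value=0, i.e. hashes 0x000000..0x7fffff):
 *
 *	frag+name order: reuse iff fpos_frag(pos) == 0x01000000;
 *	hash order:      reuse iff the frag contains fpos_hash(pos),
 *	                 so pos with hash 0x900000 needs a new request.
 */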
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	int i;
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;

	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
	if (fi->flags & CEPH_F_ATEND)
		return 0;

	/* always start with . and .. */
	if (ctx->pos == 0) {
		dout("readdir off 0 -> '.'\n");
		if (!dir_emit(ctx, ".", 1,
			      ceph_translate_ino(inode->i_sb, inode->i_ino),
			      inode->i_mode >> 12))
			return 0;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		ino_t ino = parent_ino(file->f_path.dentry);
		dout("readdir off 1 -> '..'\n");
		if (!dir_emit(ctx, "..", 2,
			      ceph_translate_ino(inode->i_sb, ino),
			      inode->i_mode >> 12))
			return 0;
		ctx->pos = 2;
	}

	/* can we use the dcache? */
	spin_lock(&ci->i_ceph_lock);
	if (ceph_test_mount_opt(fsc, DCACHE) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete_ordered(ci) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
		u32 shared_gen = ci->i_shared_gen;
		spin_unlock(&ci->i_ceph_lock);
		err = __dcache_readdir(file, ctx, shared_gen);
		if (err != -EAGAIN)
			return err;
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}

	/* proceed with a normal readdir */
more:
	/* do we have the correct frag content buffered? */
	if (need_send_readdir(fi, ctx->pos)) {
		struct ceph_mds_request *req;
		unsigned frag;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (fi->last_readdir) {
			ceph_mdsc_put_request(fi->last_readdir);
			fi->last_readdir = NULL;
		}

		if (is_hash_order(ctx->pos)) {
			frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
						NULL, NULL);
		} else {
			frag = fpos_frag(ctx->pos);
		}

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
		err = ceph_alloc_readdir_reply_buffer(req, inode);
		if (err) {
			ceph_mdsc_put_request(req);
			return err;
		}
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		req->r_direct_hash = ceph_frag_value(frag);
		req->r_direct_is_hash = true;
		if (fi->last_name) {
			req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}
		}
		req->r_dir_release_cnt = fi->dir_release_count;
		req->r_dir_ordered_cnt = fi->dir_ordered_count;
		req->r_readdir_cache_idx = fi->readdir_cache_idx;
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.flags =
				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d on "
		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
		     err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete,
		     (int)req->r_reply_info.hash_order);

		rinfo = &req->r_reply_info;
		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			frag = le32_to_cpu(rinfo->dir_dir->frag);
			if (!rinfo->hash_order) {
				fi->next_offset = req->r_readdir_offset;
				/* adjust ctx->pos to beginning of frag */
				ctx->pos = ceph_make_fpos(frag,
							  fi->next_offset,
							  false);
			}
		}

		fi->frag = frag;
		fi->last_readdir = req;

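		/*
		 * Illustrative note (editor's note, not in the original
		 * source): when the reply was "prepopulated", the reply
		 * handling has already stored dentry pointers into the
		 * dir's readdir cache and r_readdir_cache_idx is the next
		 * free slot; a negative value means caching was aborted
		 * mid-chunk, so this scan must not later mark the dir
		 * complete/ordered.
		 */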
		if (req->r_did_prepopulate) {
			fi->readdir_cache_idx = req->r_readdir_cache_idx;
			if (fi->readdir_cache_idx < 0) {
				/* preclude from marking dir ordered */
				fi->dir_ordered_count = 0;
			} else if (ceph_frag_is_leftmost(frag) &&
				   fi->next_offset == 2) {
				/* note dir version at start of readdir so
				 * we can tell if any dentries get dropped */
				fi->dir_release_count = req->r_dir_release_cnt;
				fi->dir_ordered_count = req->r_dir_ordered_cnt;
			}
		} else {
			dout("readdir !did_prepopulate");
			/* disable readdir cache */
			fi->readdir_cache_idx = -1;
			/* preclude from marking dir complete */
			fi->dir_release_count = 0;
		}

		/* note next offset and last dentry name */
		if (rinfo->dir_nr > 0) {
			struct ceph_mds_reply_dir_entry *rde =
				rinfo->dir_entries + (rinfo->dir_nr-1);
			unsigned next_offset = req->r_reply_info.dir_end ?
				2 : (fpos_off(rde->offset) + 1);
			err = note_last_dentry(fi, rde->name, rde->name_len,
					       next_offset);
			if (err)
				return err;
		} else if (req->r_reply_info.dir_end) {
			fi->next_offset = 2;
			/* keep last name */
		}
	}

	rinfo = &fi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
	     fi->frag, rinfo->dir_nr, ctx->pos,
	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);

	i = 0;
	/* search start position */
	if (rinfo->dir_nr > 0) {
		int step, nr = rinfo->dir_nr;
		while (nr > 0) {
			step = nr >> 1;
			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
				i += step + 1;
				nr -= step + 1;
			} else {
				nr = step;
			}
		}
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		ino_t ino;

		BUG_ON(rde->offset < ctx->pos);

		ctx->pos = rde->offset;
		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
		     i, rinfo->dir_nr, ctx->pos,
		     rde->name_len, rde->name, &rde->inode.in);

		BUG_ON(!rde->inode.in);
		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);
		ino = ceph_vino_to_ino(vino);

		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
			dout("filldir stopping us...\n");
			return 0;
		}
		ctx->pos++;
	}

	if (fi->next_offset > 2) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(fi->frag)) {
		unsigned frag = ceph_frag_next(fi->frag);
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							fi->next_offset, true);
			if (new_pos > ctx->pos)
				ctx->pos = new_pos;
			/* keep last_name */
		} else {
			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
			kfree(fi->last_name);
			fi->last_name = NULL;
		}
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	fi->flags |= CEPH_F_ATEND;

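	/*
	 * Illustrative note (editor's note, not in the original source):
	 * i_release_count is bumped whenever the dir's completeness must
	 * be invalidated (e.g. a child dentry is pruned or the dir is
	 * modified).  We sampled it into dir_release_count when the scan
	 * started at offset 2, so equality below means no dentry was
	 * dropped across the entire multi-request scan and the cached
	 * contents can be trusted.
	 */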
	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
		spin_lock(&ci->i_ceph_lock);
		if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
			dout(" marking %p complete and ordered\n", inode);
			/* use i_size to track number of entries in
			 * readdir cache */
			BUG_ON(fi->readdir_cache_idx < 0);
			i_size_write(inode, fi->readdir_cache_idx *
				     sizeof(struct dentry*));
		} else {
			dout(" marking %p complete\n", inode);
		}
		__ceph_dir_set_complete(ci, fi->dir_release_count,
					fi->dir_ordered_count);
		spin_unlock(&ci->i_ceph_lock);
	}

	dout("readdir %p file %p done.\n", inode, file);
	return 0;
}

static void reset_readdir(struct ceph_file_info *fi)
{
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
	}
	kfree(fi->last_name);
	fi->last_name = NULL;
	fi->dir_release_count = 0;
	fi->readdir_cache_idx = -1;
	fi->next_offset = 2;  /* compensate for . and .. */
	fi->flags &= ~CEPH_F_ATEND;
}

/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
{
	struct ceph_mds_reply_info_parsed *rinfo;
	loff_t chunk_offset;
	if (new_pos == 0)
		return true;
	if (is_hash_order(new_pos)) {
		/* no need to reset last_name for a forward seek when
		 * dentries are sorted in hash order */
	} else if (fi->frag != fpos_frag(new_pos)) {
		return true;
	}
	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
	if (!rinfo || !rinfo->dir_nr)
		return true;
	chunk_offset = rinfo->dir_entries[0].offset;
	return new_pos < chunk_offset ||
	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
}

static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t retval;

	inode_lock(inode);
	retval = -EINVAL;
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		/* fall through */
	case SEEK_SET:
		break;
	case SEEK_END:
		retval = -EOPNOTSUPP;
		/* fall through */
	default:
		goto out;
	}

	if (offset >= 0) {
		if (need_reset_readdir(fi, offset)) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(fi);
		} else if (is_hash_order(offset) && offset > file->f_pos) {
			/* for hash offset, we don't know if a forward seek
			 * is within same frag */
			fi->dir_release_count = 0;
			fi->readdir_cache_idx = -1;
		}

		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			fi->flags &= ~CEPH_F_ATEND;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
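/*
 * Illustrative summary (editor's note, not in the original source) of
 * the llseek policy above, assuming a buffered chunk starting at
 * chunk_offset:
 *
 *	seekdir(0)                -> reset all buffered readdir state
 *	seek to a different frag  -> reset (frag+name order only)
 *	seek before chunk_offset  -> reset
 *	forward seek, hash order  -> keep the buffer, but disable
 *	                             completeness tracking, since the
 *	                             target may lie in another frag
 */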
/*
 * Handle lookups for the hidden .snap directory.
 */
int ceph_handle_snapdir(struct ceph_mds_request *req,
			struct dentry *dentry, int err)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */

	/* .snap dir? */
	if (err == -ENOENT &&
	    ceph_snap(parent) == CEPH_NOSNAP &&
	    strcmp(dentry->d_name.name,
		   fsc->mount_options->snapdir_name) == 0) {
		struct inode *inode = ceph_get_snapdir(parent);
		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
		     dentry, dentry, inode);
		BUG_ON(!d_unhashed(dentry));
		d_add(dentry, inode);
		err = 0;
	}
	return err;
}

/*
 * Figure out final result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, d_inode(dentry));
			if (d_really_is_positive(dentry)) {
				d_drop(dentry);
				err = -ENOENT;
			} else {
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry);   /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}

static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
	return ceph_ino(inode) == CEPH_INO_ROOT &&
		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}
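/*
 * Illustrative note (editor's note, not in the original source):
 * ceph_lookup() below can answer a negative lookup without contacting
 * the MDS.  If the dir is complete (we have seen every entry and still
 * hold CEPH_CAP_FILE_SHARED) and the name is neither the snapdir nor a
 * root ".ceph" name, a miss in the dcache is authoritative, so a NULL
 * dentry is added locally and no request is sent.
 */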
/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op;
	int mask;
	int err;

	dout("lookup %p dentry %p '%pd'\n",
	     dir, dentry, dentry);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	err = ceph_init_dentry(dentry);
	if (err < 0)
		return ERR_PTR(err);

	/* can we conclude ENOENT locally? */
	if (d_really_is_negative(dentry)) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&ci->i_ceph_lock);
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		if (strncmp(dentry->d_name.name,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_test_mount_opt(fsc, DCACHE) &&
		    __ceph_dir_is_complete(ci) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			spin_unlock(&ci->i_ceph_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
			di->lease_shared_gen = ci->i_shared_gen;
			return NULL;
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	req->r_locked_dir = dir;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_handle_snapdir(req, dentry, err);
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}

/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
	struct dentry *result = ceph_lookup(dir, dentry, 0);

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
		 * had in our cache (and thus got spliced).  To not
		 * confuse VFS (especially when inode is a directory),
		 * we don't link our dentry to that inode, return an
		 * error instead.
		 *
		 * This event should be rare and it happens only when
		 * we talk to old MDS.  Recent MDS does not send traceless
		 * reply for request that creates new inode.
		 */
		d_drop(result);
		return -ESTALE;
	}
	return PTR_ERR(result);
}

static int ceph_mknod(struct inode *dir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_acls_info acls = {};
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		return err;

	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	if (acls.pagelist) {
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &acls);
	else
		d_drop(dentry);
	ceph_release_acls_info(&acls);
	return err;
}

static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		       bool excl)
{
	return ceph_mknod(dir, dentry, mode, 0);
}
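/*
 * Illustrative note (editor's note, not in the original source): the
 * directory-mutating ops in this file all follow the same cap-release
 * pattern seen in ceph_mknod() above,
 *
 *	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 *	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 *
 * i.e. proactively release the dir's FILE_SHARED cap along with the
 * request (the MDS would revoke it anyway, since the dir is changing)
 * unless we hold FILE_EXCL and our cached view stays authoritative.
 */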
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
			const char *dest)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_path2 = kstrdup(dest, GFP_KERNEL);
	if (!req->r_path2) {
		err = -ENOMEM;
		ceph_mdsc_put_request(req);
		goto out;
	}
	req->r_locked_dir = dir;
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (err)
		d_drop(dentry);
	return err;
}

static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_acls_info acls = {};
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%pd' dn %p\n", dir,
		     dentry, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		goto out;
	}

	mode |= S_IFDIR;
	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		goto out;

	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	if (acls.pagelist) {
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err &&
	    !req->r_reply_info.head->is_target &&
	    !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &acls);
	else
		d_drop(dentry);
	ceph_release_acls_info(&acls);
	return err;
}

static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_SHARED on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err) {
		d_drop(dentry);
	} else if (!req->r_reply_info.head->is_dentry) {
		ihold(d_inode(old_dentry));
		d_instantiate(dentry, d_inode(old_dentry));
	}
	ceph_mdsc_put_request(req);
	return err;
}
/*
 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
 * looks like the link count will hit 0, drop any other caps (other
 * than PIN) we don't specifically want (due to the file still being
 * open).
 */
static int drop_caps_for_unlink(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

	spin_lock(&ci->i_ceph_lock);
	if (inode->i_nlink == 1) {
		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
		ci->i_ceph_flags |= CEPH_I_NODELAY;
	}
	spin_unlock(&ci->i_ceph_lock);
	return drop;
}

/*
 * rmdir and unlink differ only in the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct inode *inode = d_inode(dentry);
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
		dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
		op = d_is_dir(dentry) ?
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		goto out;
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_inode_drop = drop_caps_for_unlink(inode);
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		d_delete(dentry);
	ceph_mdsc_put_request(req);
out:
	return err;
}
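/*
 * Illustrative example (editor's note, not in the original source) of
 * how the drop_caps_for_unlink() mask composes.  The baseline is
 * LINK_SHARED|LINK_EXCL, since the link count is changing.  If
 * i_nlink == 1 the inode is going away, so the mask widens to every
 * cap not still wanted by open file handles, keeping only PIN:
 *
 *	drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
 *	drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
 *
 * e.g. with nothing open (wanted == 0), drop becomes ~CEPH_CAP_PIN,
 * i.e. "release everything but the pin".
 */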
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry,
		       unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op = CEPH_MDS_OP_RENAME;
	int err;

	if (flags)
		return -EINVAL;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
			op = CEPH_MDS_OP_RENAMESNAP;
		else
			return -EROFS;
	}
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	ihold(old_dir);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = old_dir;
	req->r_locked_dir = new_dir;
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	if (d_really_is_positive(new_dentry))
		req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */

		/* d_move screws up sibling dentries' offsets */
		ceph_dir_clear_complete(old_dir);
		ceph_dir_clear_complete(new_dir);

		d_move(old_dentry, new_dentry);

		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	ceph_dentry(dentry)->time = jiffies;
	ceph_dentry(dentry)->lease_shared_gen = 0;
	spin_unlock(&dentry->d_lock);
}

/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 */
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
				 struct inode *dir)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di && di->lease_session) {
		s = di->lease_session;
		spin_lock(&s->s_gen_ttl_lock);
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
		spin_unlock(&s->s_gen_ttl_lock);

		if (di->lease_gen == gen &&
		    time_before(jiffies, di->time) &&
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
				/*
				 * We should renew.  If we're in RCU walk mode
				 * though, we can't do that so just return
				 * -ECHILD.
				 */
				if (flags & LOOKUP_RCU) {
					valid = -ECHILD;
				} else {
					session = ceph_get_mds_session(s);
					seq = di->lease_seq;
					di->lease_renew_after = 0;
					di->lease_renew_from = jiffies;
				}
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}

/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_shared_gen == di->lease_shared_gen)
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&ci->i_ceph_lock);
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
	     dir, (unsigned)ci->i_shared_gen, dentry,
	     (unsigned)di->lease_shared_gen, valid);
	return valid;
}
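/*
 * Illustrative timeline (editor's note, not in the original source) for
 * the lease checks above.  A dentry lease granted at time T with
 * duration D is typically recorded as di->time = T + D and
 * di->lease_renew_after = T + D/2, so:
 *
 *	jiffies <  T + D/2   -> valid, no action
 *	T + D/2 .. T + D     -> valid, and fire off a LEASE_RENEW
 *	                        (unless in RCU walk, where -ECHILD is
 *	                        returned so renewal can happen later)
 *	jiffies >= T + D     -> invalid
 *
 * The session's s_cap_gen/s_cap_ttl gate everything: if the session
 * went stale (gen mismatch or ttl passed), the lease is invalid
 * regardless of di->time.
 */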
/*
 * Check if cached dentry can be trusted.
 */
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	int valid = 0;
	struct dentry *parent;
	struct inode *dir;

	if (flags & LOOKUP_RCU) {
		parent = ACCESS_ONCE(dentry->d_parent);
		dir = d_inode_rcu(parent);
		if (!dir)
			return -ECHILD;
	} else {
		parent = dget_parent(dentry);
		dir = d_inode(parent);
	}

	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
		     dentry, d_inode(dentry));
		valid = 1;
	} else if (d_really_is_positive(dentry) &&
		   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
		valid = 1;
	} else {
		valid = dentry_lease_is_valid(dentry, flags, dir);
		if (valid == -ECHILD)
			return valid;
		if (valid || dir_lease_is_valid(dir, dentry)) {
			if (d_really_is_positive(dentry))
				valid = ceph_is_any_caps(d_inode(dentry));
			else
				valid = 1;
		}
	}

	if (!valid) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(dir->i_sb)->mdsc;
		struct ceph_mds_request *req;
		int op, mask, err;

		if (flags & LOOKUP_RCU)
			return -ECHILD;

		op = ceph_snap(dir) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
		if (!IS_ERR(req)) {
			req->r_dentry = dget(dentry);
			req->r_num_caps = 2;

			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
			if (ceph_security_xattr_wanted(dir))
				mask |= CEPH_CAP_XATTR_SHARED;
			/* getattr.mask is little-endian on the wire; use
			 * cpu_to_le32() as in ceph_lookup() above */
			req->r_args.getattr.mask = cpu_to_le32(mask);

			req->r_locked_dir = dir;
			err = ceph_mdsc_do_request(mdsc, NULL, req);
			if (err == 0 || err == -ENOENT) {
				if (dentry == req->r_dentry) {
					valid = !d_unhashed(dentry);
				} else {
					d_invalidate(req->r_dentry);
					err = -EAGAIN;
				}
			}
			ceph_mdsc_put_request(req);
			dout("d_revalidate %p lookup result=%d\n",
			     dentry, err);
		}
	}

	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	if (valid) {
		ceph_dentry_lru_touch(dentry);
	} else {
		ceph_dir_clear_complete(dir);
	}

	if (!(flags & LOOKUP_RCU))
		dput(parent);
	return valid;
}

/*
 * Release our ceph_dentry_info.
 */
static void ceph_d_release(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);

	dout("d_release %p\n", dentry);
	ceph_dentry_lru_del(dentry);

	spin_lock(&dentry->d_lock);
	dentry->d_fsdata = NULL;
	spin_unlock(&dentry->d_lock);

	if (di->lease_session)
		ceph_put_mds_session(di->lease_session);
	kmem_cache_free(ceph_dentry_cachep, di);
}

static int ceph_snapdir_d_revalidate(struct dentry *dentry,
				     unsigned int flags)
{
	/*
	 * Eventually, we'll want to revalidate snapped metadata
	 * too... probably...
	 */
	return 1;
}
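/*
 * Illustrative note (editor's note, not in the original source): dir
 * completeness and the dcache must stay in sync.  ceph_d_prune() below
 * enforces one direction of this: once any hashed child is evicted, a
 * dcache miss in the parent is no longer authoritative, so the parent's
 * complete flag is cleared and lookups/readdir fall back to the MDS.
 */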
/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
	dout("ceph_d_prune %p\n", dentry);

	/* do we have a valid parent? */
	if (IS_ROOT(dentry))
		return;

	/* if we are not hashed, we don't affect dir's completeness */
	if (d_unhashed(dentry))
		return;

	/*
	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
	 * cleared until d_release
	 */
	ceph_dir_clear_complete(d_inode(dentry->d_parent));
}

/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *cf = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;
	const int bufsize = 1024;

	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	if (!cf->dir_info) {
		cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
		if (!cf->dir_info)
			return -ENOMEM;
		cf->dir_info_len =
			snprintf(cf->dir_info, bufsize,
				 "entries:   %20lld\n"
				 " files:    %20lld\n"
				 " subdirs:  %20lld\n"
				 "rentries:  %20lld\n"
				 " rfiles:   %20lld\n"
				 " rsubdirs: %20lld\n"
				 "rbytes:    %20lld\n"
				 "rctime:    %10ld.%09ld\n",
				 ci->i_files + ci->i_subdirs,
				 ci->i_files,
				 ci->i_subdirs,
				 ci->i_rfiles + ci->i_rsubdirs,
				 ci->i_rfiles,
				 ci->i_rsubdirs,
				 ci->i_rbytes,
				 (long)ci->i_rctime.tv_sec,
				 (long)ci->i_rctime.tv_nsec);
	}

	if (*ppos >= cf->dir_info_len)
		return 0;
	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
	left = copy_to_user(buf, cf->dir_info + *ppos, size);
	if (left == size)
		return -EFAULT;
	*ppos += (size - left);
	return size - left;
}

/*
 * We maintain a private dentry LRU.
 *
 * FIXME: this needs to be changed to a per-mds lru to be useful.
 */
void ceph_dentry_lru_add(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_add_tail(&di->lru, &mdsc->dentry_lru);
	mdsc->num_dentry++;
	spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_touch(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
	     di->offset);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_move_tail(&di->lru, &mdsc->dentry_lru);
	spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_del(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_del_init(&di->lru);
	mdsc->num_dentry--;
	spin_unlock(&mdsc->dentry_lru_lock);
}
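/*
 * Illustrative note (editor's note, not in the original source): the
 * hash computed by ceph_dentry_hash() below feeds the hash-order
 * readdir offsets and the r_direct_hash MDS-selection hint used
 * earlier in this file, so it must match the hash the MDS uses to
 * place names in fragments; hence it is derived from the directory's
 * own dl_dir_hash layout rather than the VFS name hash alone.
 */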
/*
 * Return name hash for a given dentry.  This is dependent on
 * the parent directory's hash function.
 */
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
	struct ceph_inode_info *dci = ceph_inode(dir);

	switch (dci->i_dir_layout.dl_dir_hash) {
	case 0:	/* for backward compat */
	case CEPH_STR_HASH_LINUX:
		return dn->d_name.hash;

	default:
		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
				     dn->d_name.name, dn->d_name.len);
	}
}

const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
	.iterate = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.fsync = ceph_fsync,
};

const struct file_operations ceph_snapdir_fops = {
	.iterate = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
};

const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.listxattr = ceph_listxattr,
	.get_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
	.atomic_open = ceph_atomic_open,
};

const struct inode_operations ceph_snapdir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
};

const struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
};

const struct dentry_operations ceph_snapdir_dentry_ops = {
	.d_revalidate = ceph_snapdir_d_revalidate,
	.d_release = ceph_d_release,
};

const struct dentry_operations ceph_snap_dentry_ops = {
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
};