// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/xattr.h>

#include "super.h"
#include "mds_client.h"

/*
 * Directory operations: readdir, lookup, create, link, unlink,
 * rename, etc.
 */

/*
 * Ceph MDS operations are specified in terms of a base ino and
 * relative path.  Thus, the client can specify an operation on a
 * specific inode (e.g., a getattr due to fstat(2)), or as a path
 * relative to, say, the root directory.
 *
 * Normally, we limit ourselves to strict inode ops (no path component)
 * or dentry operations (a single path component relative to an ino).  The
 * exception to this is open_root_dentry(), which will open the mount
 * point by name.
 */

const struct dentry_operations ceph_dentry_ops;

/*
 * Initialize ceph dentry state.
 */
static int ceph_d_init(struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
	if (!di)
		return -ENOMEM;          /* oh well */

	di->dentry = dentry;
	di->lease_session = NULL;
	di->time = jiffies;
	dentry->d_fsdata = di;
	ceph_dentry_lru_add(dentry);
	return 0;
}

/*
 * for f_pos for readdir:
 * - hash order:
 *	(0xff << 52) | ((24 bits hash) << 28) |
 *	(the nth entry has hash collision);
 * - frag+name order;
 *	((frag value) << 28) | (the nth entry in frag);
 */
#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
	loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
	if (hash_order)
		fpos |= HASH_ORDER;
	return fpos;
}

static bool is_hash_order(loff_t p)
{
	return (p & HASH_ORDER) == HASH_ORDER;
}

static unsigned fpos_frag(loff_t p)
{
	return p >> OFFSET_BITS;
}

static unsigned fpos_hash(loff_t p)
{
	return ceph_frag_value(fpos_frag(p));
}

static unsigned fpos_off(loff_t p)
{
	return p & OFFSET_MASK;
}

static int fpos_cmp(loff_t l, loff_t r)
{
	int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
	if (v)
		return v;
	return (int)(fpos_off(l) - fpos_off(r));
}
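/*
 * Worked example of the f_pos encoding above (illustrative values,
 * not taken from the original source):
 *
 *   frag+name order, frag 0x01000000 (1 split bit, value 0), entry 2:
 *	ceph_make_fpos(0x01000000, 2, false) == 0x10000000000002
 *
 *   hash order, 24-bit hash 0xc0ffee, 5th entry with that hash:
 *	ceph_make_fpos(0xc0ffee, 5, true)
 *	  == HASH_ORDER | (0xc0ffeeULL << 28) | 5 == 0xffc0ffee0000005
 */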
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
			    int len, unsigned next_offset)
{
	char *buf = kmalloc(len+1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	kfree(fi->last_name);
	fi->last_name = buf;
	memcpy(fi->last_name, name, len);
	fi->last_name[len] = 0;
	fi->next_offset = next_offset;
	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}


static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	if (ptr_pos >= i_size_read(dir))
		return NULL;

	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		 * i_mutex, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_mutex. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}
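/*
 * A quick sketch of the index math above, assuming 4 KiB pages and
 * 8-byte pointers (so 512 dentry pointers per page):
 *
 *	idx = 1000  ->  ptr_pos = 8000
 *	            ->  ptr_pgoff = 1    (8000 >> PAGE_SHIFT)
 *	            ->  slot 488         (1000 & idx_mask)
 *
 * i_size on the directory tracks how many of these slots are in use.
 */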
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * Complete dir indicates that we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 */
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
			    u32 shared_gen)
{
	struct ceph_file_info *fi = file->private_data;
	struct dentry *parent = file->f_path.dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *dentry, *last = NULL;
	struct ceph_dentry_info *di;
	struct ceph_readdir_cache_control cache_ctl = {};
	u64 idx = 0;
	int err = 0;

	dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);

	/* search start position */
	if (ctx->pos > 2) {
		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
		while (count > 0) {
			u64 step = count >> 1;
			dentry = __dcache_find_get_entry(parent, idx + step,
							 &cache_ctl);
			if (!dentry) {
				/* use linear search */
				idx = 0;
				break;
			}
			if (IS_ERR(dentry)) {
				err = PTR_ERR(dentry);
				goto out;
			}
			di = ceph_dentry(dentry);
			spin_lock(&dentry->d_lock);
			if (fpos_cmp(di->offset, ctx->pos) < 0) {
				idx += step + 1;
				count -= step + 1;
			} else {
				count = step;
			}
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}

		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
	}


	for (;;) {
		bool emit_dentry = false;
		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
		if (!dentry) {
			fi->flags |= CEPH_F_ATEND;
			err = 0;
			break;
		}
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out;
		}

		di = ceph_dentry(dentry);
		spin_lock(&dentry->d_lock);
		if (di->lease_shared_gen == shared_gen &&
		    d_really_is_positive(dentry) &&
		    fpos_cmp(ctx->pos, di->offset) <= 0) {
			emit_dentry = true;
		}
		spin_unlock(&dentry->d_lock);

		if (emit_dentry) {
			dout(" %llx dentry %p %pd %p\n", di->offset,
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len,
				      ceph_translate_ino(dentry->d_sb,
							 d_inode(dentry)->i_ino),
				      d_inode(dentry)->i_mode >> 12)) {
				dput(dentry);
				err = 0;
				break;
			}
			ctx->pos++;

			if (last)
				dput(last);
			last = dentry;
		} else {
			dput(dentry);
		}
	}
out:
	ceph_readdir_cache_release(&cache_ctl);
	if (last) {
		int ret;
		di = ceph_dentry(last);
		ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
				       fpos_off(di->offset) + 1);
		if (ret < 0)
			err = ret;
		dput(last);
		/* last_name no longer matches cache index */
		if (fi->readdir_cache_idx >= 0) {
			fi->readdir_cache_idx = -1;
			fi->dir_release_count = 0;
		}
	}
	return err;
}

static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
{
	if (!fi->last_readdir)
		return true;
	if (is_hash_order(pos))
		return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
	else
		return fi->frag != fpos_frag(pos);
}
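/*
 * Readdir flow, in brief:
 *  1. synthesize "." and ".." for ctx->pos 0 and 1;
 *  2. if the dir is known complete and ordered in our cache, try the
 *     dcache shortcut above;
 *  3. otherwise fetch one chunk per MDS round trip (resuming after
 *     fi->last_name) and walk the frags until the rightmost frag is
 *     exhausted.
 */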
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	int i;
	int err;
	unsigned frag = -1;
	struct ceph_mds_reply_info_parsed *rinfo;

	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
	if (fi->flags & CEPH_F_ATEND)
		return 0;

	/* always start with . and .. */
	if (ctx->pos == 0) {
		dout("readdir off 0 -> '.'\n");
		if (!dir_emit(ctx, ".", 1,
			    ceph_translate_ino(inode->i_sb, inode->i_ino),
			    inode->i_mode >> 12))
			return 0;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		ino_t ino = parent_ino(file->f_path.dentry);
		dout("readdir off 1 -> '..'\n");
		if (!dir_emit(ctx, "..", 2,
			    ceph_translate_ino(inode->i_sb, ino),
			    inode->i_mode >> 12))
			return 0;
		ctx->pos = 2;
	}

	/* can we use the dcache? */
	spin_lock(&ci->i_ceph_lock);
	if (ceph_test_mount_opt(fsc, DCACHE) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete_ordered(ci) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
		u32 shared_gen = ci->i_shared_gen;
		spin_unlock(&ci->i_ceph_lock);
		err = __dcache_readdir(file, ctx, shared_gen);
		if (err != -EAGAIN)
			return err;
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}

	/* proceed with a normal readdir */
more:
	/* do we have the correct frag content buffered? */
	if (need_send_readdir(fi, ctx->pos)) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (fi->last_readdir) {
			ceph_mdsc_put_request(fi->last_readdir);
			fi->last_readdir = NULL;
		}

		if (is_hash_order(ctx->pos)) {
			/* fragtree isn't always accurate. choose frag
			 * based on previous reply when possible. */
			if (frag == (unsigned)-1)
				frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
							NULL, NULL);
		} else {
			frag = fpos_frag(ctx->pos);
		}

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
		err = ceph_alloc_readdir_reply_buffer(req, inode);
		if (err) {
			ceph_mdsc_put_request(req);
			return err;
		}
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		if (op == CEPH_MDS_OP_READDIR) {
			req->r_direct_hash = ceph_frag_value(frag);
			__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
		}
		if (fi->last_name) {
			req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}
		} else if (is_hash_order(ctx->pos)) {
			req->r_args.readdir.offset_hash =
				cpu_to_le32(fpos_hash(ctx->pos));
		}

		req->r_dir_release_cnt = fi->dir_release_count;
		req->r_dir_ordered_cnt = fi->dir_ordered_count;
		req->r_readdir_cache_idx = fi->readdir_cache_idx;
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.flags =
				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d on "
		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
		     err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete,
		     (int)req->r_reply_info.hash_order);
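		/* The MDS may answer for a different frag than we asked
		 * for if the fragtree changed underneath us; adopt the
		 * frag it actually replied for.  In name order we also
		 * rewind ctx->pos to the beginning of that frag. */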
		rinfo = &req->r_reply_info;
		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			frag = le32_to_cpu(rinfo->dir_dir->frag);
			if (!rinfo->hash_order) {
				fi->next_offset = req->r_readdir_offset;
				/* adjust ctx->pos to beginning of frag */
				ctx->pos = ceph_make_fpos(frag,
							  fi->next_offset,
							  false);
			}
		}

		fi->frag = frag;
		fi->last_readdir = req;

		if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
			fi->readdir_cache_idx = req->r_readdir_cache_idx;
			if (fi->readdir_cache_idx < 0) {
				/* preclude from marking dir ordered */
				fi->dir_ordered_count = 0;
			} else if (ceph_frag_is_leftmost(frag) &&
				   fi->next_offset == 2) {
				/* note dir version at start of readdir so
				 * we can tell if any dentries get dropped */
				fi->dir_release_count = req->r_dir_release_cnt;
				fi->dir_ordered_count = req->r_dir_ordered_cnt;
			}
		} else {
			dout("readdir !did_prepopulate");
			/* disable readdir cache */
			fi->readdir_cache_idx = -1;
			/* preclude from marking dir complete */
			fi->dir_release_count = 0;
		}

		/* note next offset and last dentry name */
		if (rinfo->dir_nr > 0) {
			struct ceph_mds_reply_dir_entry *rde =
					rinfo->dir_entries + (rinfo->dir_nr-1);
			unsigned next_offset = req->r_reply_info.dir_end ?
					2 : (fpos_off(rde->offset) + 1);
			err = note_last_dentry(fi, rde->name, rde->name_len,
					       next_offset);
			if (err)
				return err;
		} else if (req->r_reply_info.dir_end) {
			fi->next_offset = 2;
			/* keep last name */
		}
	}

	rinfo = &fi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
	     fi->frag, rinfo->dir_nr, ctx->pos,
	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);

	i = 0;
	/* search start position */
	if (rinfo->dir_nr > 0) {
		int step, nr = rinfo->dir_nr;
		while (nr > 0) {
			step = nr >> 1;
			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
				i += step + 1;
				nr -= step + 1;
			} else {
				nr = step;
			}
		}
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		ino_t ino;
		u32 ftype;

		BUG_ON(rde->offset < ctx->pos);

		ctx->pos = rde->offset;
		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
		     i, rinfo->dir_nr, ctx->pos,
		     rde->name_len, rde->name, &rde->inode.in);

		BUG_ON(!rde->inode.in);
		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);
		ino = ceph_vino_to_ino(vino);

		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
			dout("filldir stopping us...\n");
			return 0;
		}
		ctx->pos++;
	}

	ceph_mdsc_put_request(fi->last_readdir);
	fi->last_readdir = NULL;

	if (fi->next_offset > 2) {
		frag = fi->frag;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(fi->frag)) {
		frag = ceph_frag_next(fi->frag);
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							fi->next_offset, true);
			if (new_pos > ctx->pos)
				ctx->pos = new_pos;
			/* keep last_name */
		} else {
			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
			kfree(fi->last_name);
			fi->last_name = NULL;
		}
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	fi->flags |= CEPH_F_ATEND;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
		spin_lock(&ci->i_ceph_lock);
		if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
			dout(" marking %p complete and ordered\n", inode);
			/* use i_size to track number of entries in
			 * readdir cache */
			BUG_ON(fi->readdir_cache_idx < 0);
			i_size_write(inode, fi->readdir_cache_idx *
				     sizeof(struct dentry*));
		} else {
			dout(" marking %p complete\n", inode);
		}
		__ceph_dir_set_complete(ci, fi->dir_release_count,
					fi->dir_ordered_count);
		spin_unlock(&ci->i_ceph_lock);
	}

	dout("readdir %p file %p done.\n", inode, file);
	return 0;
}

static void reset_readdir(struct ceph_file_info *fi)
{
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
	}
	kfree(fi->last_name);
	fi->last_name = NULL;
	fi->dir_release_count = 0;
	fi->readdir_cache_idx = -1;
	fi->next_offset = 2;  /* compensate for . and .. */
	fi->flags &= ~CEPH_F_ATEND;
}

/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
{
	struct ceph_mds_reply_info_parsed *rinfo;
	loff_t chunk_offset;
	if (new_pos == 0)
		return true;
	if (is_hash_order(new_pos)) {
		/* no need to reset last_name for a forward seek when
		 * dentries are sorted in hash order */
	} else if (fi->frag != fpos_frag(new_pos)) {
		return true;
	}
	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
	if (!rinfo || !rinfo->dir_nr)
		return true;
	chunk_offset = rinfo->dir_entries[0].offset;
	return new_pos < chunk_offset ||
	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
}
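/*
 * Illustrative userspace view of these offsets (a sketch, assuming a
 * glibc-style readdir API; not part of the original file):
 *
 *	DIR *d = opendir("/mnt/cephfs/somedir");
 *	long pos = telldir(d);	// opaque fpos built by ceph_make_fpos()
 *	seekdir(d, 0);		// forces need_reset_readdir() == true below
 */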
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t retval;

	inode_lock(inode);
	retval = -EINVAL;
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		/* fall through */
	case SEEK_SET:
		break;
	case SEEK_END:
		retval = -EOPNOTSUPP;
		/* fall through */
	default:
		goto out;
	}

	if (offset >= 0) {
		if (need_reset_readdir(fi, offset)) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(fi);
		} else if (is_hash_order(offset) && offset > file->f_pos) {
			/* for hash offset, we don't know if a forward seek
			 * is within same frag */
			fi->dir_release_count = 0;
			fi->readdir_cache_idx = -1;
		}

		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			fi->flags &= ~CEPH_F_ATEND;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}

/*
 * Handle lookups for the hidden .snap directory.
 */
int ceph_handle_snapdir(struct ceph_mds_request *req,
			struct dentry *dentry, int err)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */

	/* .snap dir? */
	if (err == -ENOENT &&
	    ceph_snap(parent) == CEPH_NOSNAP &&
	    strcmp(dentry->d_name.name,
		   fsc->mount_options->snapdir_name) == 0) {
		struct inode *inode = ceph_get_snapdir(parent);
		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
		     dentry, dentry, inode);
		BUG_ON(!d_unhashed(dentry));
		d_add(dentry, inode);
		err = 0;
	}
	return err;
}

/*
 * Figure out final result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, d_inode(dentry));
			if (d_really_is_positive(dentry)) {
				d_drop(dentry);
				err = -ENOENT;
			} else {
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry);   /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}

static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
	return ceph_ino(inode) == CEPH_INO_ROOT &&
		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}

/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op;
	int mask;
	int err;

	dout("lookup %p dentry %p '%pd'\n",
	     dir, dentry, dentry);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	/* can we conclude ENOENT locally? */
	if (d_really_is_negative(dentry)) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&ci->i_ceph_lock);
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		if (strncmp(dentry->d_name.name,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_test_mount_opt(fsc, DCACHE) &&
		    __ceph_dir_is_complete(ci) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			spin_unlock(&ci->i_ceph_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
			di->lease_shared_gen = ci->i_shared_gen;
			return NULL;
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_handle_snapdir(req, dentry, err);
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}

/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
	struct dentry *result = ceph_lookup(dir, dentry, 0);

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
		 * had in our cache (and thus got spliced).  To not
		 * confuse VFS (especially when inode is a directory),
		 * we don't link our dentry to that inode, return an
		 * error instead.
		 *
		 * This event should be rare and it happens only when
		 * we talk to old MDS.  Recent MDS does not send traceless
		 * reply for request that creates new inode.
		 */
		d_drop(result);
		return -ESTALE;
	}
	return PTR_ERR(result);
}
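/*
 * The create-type operations below (mknod, symlink, mkdir, link,
 * unlink, rename) all follow the same pattern: build an MDS request,
 * attach the dentry and parent, and offer to release our
 * CEPH_CAP_FILE_SHARED on the directory (unless we also hold
 * CEPH_CAP_FILE_EXCL) so the MDS does not have to revoke it.  The
 * reply trace, if any, instantiates the dentry; traceless creates
 * fall back to ceph_handle_notrace_create() above.
 */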
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_acls_info acls = {};
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		return err;

	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	if (acls.pagelist) {
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &acls);
	else
		d_drop(dentry);
	ceph_release_acls_info(&acls);
	return err;
}

static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		       bool excl)
{
	return ceph_mknod(dir, dentry, mode, 0);
}
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
			const char *dest)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_path2 = kstrdup(dest, GFP_KERNEL);
	if (!req->r_path2) {
		err = -ENOMEM;
		ceph_mdsc_put_request(req);
		goto out;
	}
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (err)
		d_drop(dentry);
	return err;
}

static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_acls_info acls = {};
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%pd' dn %p\n", dir,
		     dentry, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		goto out;
	}

	mode |= S_IFDIR;
	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		goto out;

	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	if (acls.pagelist) {
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err &&
	    !req->r_reply_info.head->is_target &&
	    !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &acls);
	else
		d_drop(dentry);
	ceph_release_acls_info(&acls);
	return err;
}

static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_SHARED on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err) {
		d_drop(dentry);
	} else if (!req->r_reply_info.head->is_dentry) {
		ihold(d_inode(old_dentry));
		d_instantiate(dentry, d_inode(old_dentry));
	}
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
 * looks like the link count will hit 0, drop any other caps (other
 * than PIN) we don't specifically want (due to the file still being
 * open).
 */
static int drop_caps_for_unlink(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

	spin_lock(&ci->i_ceph_lock);
	if (inode->i_nlink == 1) {
		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
		ci->i_ceph_flags |= CEPH_I_NODELAY;
	}
	spin_unlock(&ci->i_ceph_lock);
	return drop;
}

/*
 * rmdir and unlink differ only by the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct inode *inode = d_inode(dentry);
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
		dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
		op = d_is_dir(dentry) ?
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		goto out;
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_inode_drop = drop_caps_for_unlink(inode);
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		d_delete(dentry);
	ceph_mdsc_put_request(req);
out:
	return err;
}

static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry,
		       unsigned int flags)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op = CEPH_MDS_OP_RENAME;
	int err;

	if (flags)
		return -EINVAL;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
			op = CEPH_MDS_OP_RENAMESNAP;
		else
			return -EROFS;
	}
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	ihold(old_dir);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = old_dir;
	req->r_parent = new_dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	if (d_really_is_positive(new_dentry))
		req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */

		/* d_move screws up sibling dentries' offsets */
		ceph_dir_clear_complete(old_dir);
		ceph_dir_clear_complete(new_dir);

		d_move(old_dentry, new_dentry);

		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	ceph_dentry(dentry)->time = jiffies;
	ceph_dentry(dentry)->lease_shared_gen = 0;
	spin_unlock(&dentry->d_lock);
}

/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 */
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
				 struct inode *dir)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di && di->lease_session) {
		s = di->lease_session;
		spin_lock(&s->s_gen_ttl_lock);
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
		spin_unlock(&s->s_gen_ttl_lock);

		if (di->lease_gen == gen &&
		    time_before(jiffies, di->time) &&
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
				/*
				 * We should renew.  If we're in RCU walk mode
				 * though, we can't do that so just return
				 * -ECHILD.
				 */
				if (flags & LOOKUP_RCU) {
					valid = -ECHILD;
				} else {
					session = ceph_get_mds_session(s);
					seq = di->lease_seq;
					di->lease_renew_after = 0;
					di->lease_renew_from = jiffies;
				}
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}
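/*
 * Lease timing, by way of example (illustrative numbers): with a 60s
 * lease issued at time T, lease_renew_after lands around T+30s, so a
 * revalidation between T+30s and lease expiry both succeeds and kicks
 * off an async CEPH_MDS_LEASE_RENEW above.
 */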
/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_shared_gen == di->lease_shared_gen)
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&ci->i_ceph_lock);
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
	     dir, (unsigned)ci->i_shared_gen, dentry,
	     (unsigned)di->lease_shared_gen, valid);
	return valid;
}

/*
 * Check if cached dentry can be trusted.
 */
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	int valid = 0;
	struct dentry *parent;
	struct inode *dir;

	if (flags & LOOKUP_RCU) {
		parent = READ_ONCE(dentry->d_parent);
		dir = d_inode_rcu(parent);
		if (!dir)
			return -ECHILD;
	} else {
		parent = dget_parent(dentry);
		dir = d_inode(parent);
	}

	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
		     dentry, d_inode(dentry));
		valid = 1;
	} else if (d_really_is_positive(dentry) &&
		   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
		valid = 1;
	} else {
		valid = dentry_lease_is_valid(dentry, flags, dir);
		if (valid == -ECHILD)
			return valid;
		if (valid || dir_lease_is_valid(dir, dentry)) {
			if (d_really_is_positive(dentry))
				valid = ceph_is_any_caps(d_inode(dentry));
			else
				valid = 1;
		}
	}

	if (!valid) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(dir->i_sb)->mdsc;
		struct ceph_mds_request *req;
		int op, err;
		u32 mask;

		if (flags & LOOKUP_RCU)
			return -ECHILD;

		op = ceph_snap(dir) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
		if (!IS_ERR(req)) {
			req->r_dentry = dget(dentry);
			req->r_num_caps = 2;
			req->r_parent = dir;

			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
			if (ceph_security_xattr_wanted(dir))
				mask |= CEPH_CAP_XATTR_SHARED;
			req->r_args.getattr.mask = cpu_to_le32(mask);

			err = ceph_mdsc_do_request(mdsc, NULL, req);
			switch (err) {
			case 0:
				if (d_really_is_positive(dentry) &&
				    d_inode(dentry) == req->r_target_inode)
					valid = 1;
				break;
			case -ENOENT:
				if (d_really_is_negative(dentry))
					valid = 1;
				/* Fallthrough */
			default:
				break;
			}
			ceph_mdsc_put_request(req);
			dout("d_revalidate %p lookup result=%d\n",
			     dentry, err);
		}
	}

	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	if (valid) {
		ceph_dentry_lru_touch(dentry);
	} else {
		ceph_dir_clear_complete(dir);
	}

	if (!(flags & LOOKUP_RCU))
		dput(parent);
	return valid;
}
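/*
 * Dentry callbacks wired into ceph_dentry_ops at the bottom of this
 * file: d_release frees our per-dentry state once the VFS is done with
 * the dentry, and d_prune keeps the parent's "complete" flag honest
 * when the VFS evicts a hashed child behind our back.
 */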
/*
 * Release our ceph_dentry_info.
 */
static void ceph_d_release(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);

	dout("d_release %p\n", dentry);
	ceph_dentry_lru_del(dentry);

	spin_lock(&dentry->d_lock);
	dentry->d_fsdata = NULL;
	spin_unlock(&dentry->d_lock);

	if (di->lease_session)
		ceph_put_mds_session(di->lease_session);
	kmem_cache_free(ceph_dentry_cachep, di);
}

/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
	dout("ceph_d_prune %p\n", dentry);

	/* do we have a valid parent? */
	if (IS_ROOT(dentry))
		return;

	/* if we are not hashed, we don't affect dir's completeness */
	if (d_unhashed(dentry))
		return;

	if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
		return;

	/*
	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
	 * cleared until d_release
	 */
	ceph_dir_clear_complete(d_inode(dentry->d_parent));
}

/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *cf = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;
	const int bufsize = 1024;

	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	if (!cf->dir_info) {
		cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
		if (!cf->dir_info)
			return -ENOMEM;
		cf->dir_info_len =
			snprintf(cf->dir_info, bufsize,
				 "entries:   %20lld\n"
				 " files:    %20lld\n"
				 " subdirs:  %20lld\n"
				 "rentries:  %20lld\n"
				 " rfiles:   %20lld\n"
				 " rsubdirs: %20lld\n"
				 "rbytes:    %20lld\n"
				 "rctime:    %10ld.%09ld\n",
				 ci->i_files + ci->i_subdirs,
				 ci->i_files,
				 ci->i_subdirs,
				 ci->i_rfiles + ci->i_rsubdirs,
				 ci->i_rfiles,
				 ci->i_rsubdirs,
				 ci->i_rbytes,
				 (long)ci->i_rctime.tv_sec,
				 (long)ci->i_rctime.tv_nsec);
	}

	if (*ppos >= cf->dir_info_len)
		return 0;
	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
	left = copy_to_user(buf, cf->dir_info + *ppos, size);
	if (left == size)
		return -EFAULT;
	*ppos += (size - left);
	return size - left;
}
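/*
 * Example of the dirstat output above (illustrative, on a filesystem
 * mounted with -o dirstat; the numbers are made up):
 *
 *	$ cat /mnt/cephfs/somedir
 *	entries:                    3
 *	 files:                     2
 *	 subdirs:                   1
 *	...
 */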
/*
 * We maintain a private dentry LRU.
 *
 * FIXME: this needs to be changed to a per-mds lru to be useful.
 */
void ceph_dentry_lru_add(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_add_tail(&di->lru, &mdsc->dentry_lru);
	mdsc->num_dentry++;
	spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_touch(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
	     di->offset);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_move_tail(&di->lru, &mdsc->dentry_lru);
	spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_del(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_del_init(&di->lru);
	mdsc->num_dentry--;
	spin_unlock(&mdsc->dentry_lru_lock);
}

/*
 * Return name hash for a given dentry.  This is dependent on
 * the parent directory's hash function.
 */
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
	struct ceph_inode_info *dci = ceph_inode(dir);

	switch (dci->i_dir_layout.dl_dir_hash) {
	case 0:	/* for backward compat */
	case CEPH_STR_HASH_LINUX:
		return dn->d_name.hash;

	default:
		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
				     dn->d_name.name, dn->d_name.len);
	}
}

const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
	.iterate = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.fsync = ceph_fsync,
};

const struct file_operations ceph_snapdir_fops = {
	.iterate = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
};

const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.listxattr = ceph_listxattr,
	.get_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
	.atomic_open = ceph_atomic_open,
};

const struct inode_operations ceph_snapdir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
};

const struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
	.d_init = ceph_d_init,
};