1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _FS_CEPH_MDS_CLIENT_H 3 #define _FS_CEPH_MDS_CLIENT_H 4 5 #include <linux/completion.h> 6 #include <linux/kref.h> 7 #include <linux/list.h> 8 #include <linux/mutex.h> 9 #include <linux/rbtree.h> 10 #include <linux/spinlock.h> 11 #include <linux/refcount.h> 12 #include <linux/utsname.h> 13 #include <linux/ktime.h> 14 15 #include <linux/ceph/types.h> 16 #include <linux/ceph/messenger.h> 17 #include <linux/ceph/mdsmap.h> 18 #include <linux/ceph/auth.h> 19 20 #include "metric.h" 21 #include "super.h" 22 23 /* The first 8 bits are reserved for old ceph releases */ 24 enum ceph_feature_type { 25 CEPHFS_FEATURE_MIMIC = 8, 26 CEPHFS_FEATURE_REPLY_ENCODING, 27 CEPHFS_FEATURE_RECLAIM_CLIENT, 28 CEPHFS_FEATURE_LAZY_CAP_WANTED, 29 CEPHFS_FEATURE_MULTI_RECONNECT, 30 CEPHFS_FEATURE_DELEG_INO, 31 CEPHFS_FEATURE_METRIC_COLLECT, 32 CEPHFS_FEATURE_ALTERNATE_NAME, 33 CEPHFS_FEATURE_NOTIFY_SESSION_STATE, 34 CEPHFS_FEATURE_OP_GETVXATTR, 35 CEPHFS_FEATURE_32BITS_RETRY_FWD, 36 37 CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, 38 }; 39 40 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 41 0, 1, 2, 3, 4, 5, 6, 7, \ 42 CEPHFS_FEATURE_MIMIC, \ 43 CEPHFS_FEATURE_REPLY_ENCODING, \ 44 CEPHFS_FEATURE_LAZY_CAP_WANTED, \ 45 CEPHFS_FEATURE_MULTI_RECONNECT, \ 46 CEPHFS_FEATURE_DELEG_INO, \ 47 CEPHFS_FEATURE_METRIC_COLLECT, \ 48 CEPHFS_FEATURE_ALTERNATE_NAME, \ 49 CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ 50 CEPHFS_FEATURE_OP_GETVXATTR, \ 51 CEPHFS_FEATURE_32BITS_RETRY_FWD, \ 52 } 53 54 /* 55 * Some lock dependencies: 56 * 57 * session->s_mutex 58 * mdsc->mutex 59 * 60 * mdsc->snap_rwsem 61 * 62 * ci->i_ceph_lock 63 * mdsc->snap_flush_lock 64 * mdsc->cap_delay_lock 65 * 66 */ 67 68 struct ceph_fs_client; 69 struct ceph_cap; 70 71 /* 72 * parsed info about a single inode. pointers are into the encoded 73 * on-wire structures within the mds reply message payload. 74 */ 75 struct ceph_mds_reply_info_in { 76 struct ceph_mds_reply_inode *in; 77 struct ceph_dir_layout dir_layout; 78 u32 symlink_len; 79 char *symlink; 80 u32 xattr_len; 81 char *xattr_data; 82 u64 inline_version; 83 u32 inline_len; 84 char *inline_data; 85 u32 pool_ns_len; 86 char *pool_ns_data; 87 u64 max_bytes; 88 u64 max_files; 89 s32 dir_pin; 90 struct ceph_timespec btime; 91 struct ceph_timespec snap_btime; 92 u8 *fscrypt_auth; 93 u8 *fscrypt_file; 94 u32 fscrypt_auth_len; 95 u32 fscrypt_file_len; 96 u64 rsnaps; 97 u64 change_attr; 98 }; 99 100 struct ceph_mds_reply_dir_entry { 101 bool is_nokey; 102 char *name; 103 u32 name_len; 104 u32 raw_hash; 105 struct ceph_mds_reply_lease *lease; 106 struct ceph_mds_reply_info_in inode; 107 loff_t offset; 108 }; 109 110 struct ceph_mds_reply_xattr { 111 char *xattr_value; 112 size_t xattr_value_len; 113 }; 114 115 /* 116 * parsed info about an mds reply, including information about 117 * either: 1) the target inode and/or its parent directory and dentry, 118 * and directory contents (for readdir results), or 119 * 2) the file range lock info (for fcntl F_GETLK results). 120 */ 121 struct ceph_mds_reply_info_parsed { 122 struct ceph_mds_reply_head *head; 123 124 /* trace */ 125 struct ceph_mds_reply_info_in diri, targeti; 126 struct ceph_mds_reply_dirfrag *dirfrag; 127 char *dname; 128 u8 *altname; 129 u32 dname_len; 130 u32 altname_len; 131 struct ceph_mds_reply_lease *dlease; 132 struct ceph_mds_reply_xattr xattr_info; 133 134 /* extra */ 135 union { 136 /* for fcntl F_GETLK results */ 137 struct ceph_filelock *filelock_reply; 138 139 /* for readdir results */ 140 struct { 141 struct ceph_mds_reply_dirfrag *dir_dir; 142 size_t dir_buf_size; 143 int dir_nr; 144 bool dir_end; 145 bool dir_complete; 146 bool hash_order; 147 bool offset_hash; 148 struct ceph_mds_reply_dir_entry *dir_entries; 149 }; 150 151 /* for create results */ 152 struct { 153 bool has_create_ino; 154 u64 ino; 155 }; 156 }; 157 158 /* encoded blob describing snapshot contexts for certain 159 operations (e.g., open) */ 160 void *snapblob; 161 int snapblob_len; 162 }; 163 164 165 /* 166 * cap releases are batched and sent to the MDS en masse. 167 * 168 * Account for per-message overhead of mds_cap_release header 169 * and __le32 for osd epoch barrier trailing field. 170 */ 171 #define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ 172 sizeof(struct ceph_mds_cap_release)) / \ 173 sizeof(struct ceph_mds_cap_item)) 174 175 176 /* 177 * state associated with each MDS<->client session 178 */ 179 enum { 180 CEPH_MDS_SESSION_NEW = 1, 181 CEPH_MDS_SESSION_OPENING = 2, 182 CEPH_MDS_SESSION_OPEN = 3, 183 CEPH_MDS_SESSION_HUNG = 4, 184 CEPH_MDS_SESSION_RESTARTING = 5, 185 CEPH_MDS_SESSION_RECONNECTING = 6, 186 CEPH_MDS_SESSION_CLOSING = 7, 187 CEPH_MDS_SESSION_CLOSED = 8, 188 CEPH_MDS_SESSION_REJECTED = 9, 189 }; 190 191 struct ceph_mds_session { 192 struct ceph_mds_client *s_mdsc; 193 int s_mds; 194 int s_state; 195 unsigned long s_ttl; /* time until mds kills us */ 196 unsigned long s_features; 197 u64 s_seq; /* incoming msg seq # */ 198 struct mutex s_mutex; /* serialize session messages */ 199 200 struct ceph_connection s_con; 201 202 struct ceph_auth_handshake s_auth; 203 204 atomic_t s_cap_gen; /* inc each time we get mds stale msg */ 205 unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ 206 207 /* protected by s_cap_lock */ 208 spinlock_t s_cap_lock; 209 refcount_t s_ref; 210 struct list_head s_caps; /* all caps issued by this session */ 211 struct ceph_cap *s_cap_iterator; 212 int s_nr_caps; 213 int s_num_cap_releases; 214 int s_cap_reconnect; 215 int s_readonly; 216 struct list_head s_cap_releases; /* waiting cap_release messages */ 217 struct work_struct s_cap_release_work; 218 219 /* See ceph_inode_info->i_dirty_item. */ 220 struct list_head s_cap_dirty; /* inodes w/ dirty caps */ 221 222 /* See ceph_inode_info->i_flushing_item. */ 223 struct list_head s_cap_flushing; /* inodes w/ flushing caps */ 224 225 unsigned long s_renew_requested; /* last time we sent a renew req */ 226 u64 s_renew_seq; 227 228 struct list_head s_waiting; /* waiting requests */ 229 struct list_head s_unsafe; /* unsafe requests */ 230 struct xarray s_delegated_inos; 231 }; 232 233 /* 234 * modes of choosing which MDS to send a request to 235 */ 236 enum { 237 USE_ANY_MDS, 238 USE_RANDOM_MDS, 239 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ 240 }; 241 242 struct ceph_mds_request; 243 struct ceph_mds_client; 244 245 /* 246 * request completion callback 247 */ 248 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, 249 struct ceph_mds_request *req); 250 /* 251 * wait for request completion callback 252 */ 253 typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, 254 struct ceph_mds_request *req); 255 256 /* 257 * an in-flight mds request 258 */ 259 struct ceph_mds_request { 260 u64 r_tid; /* transaction id */ 261 struct rb_node r_node; 262 struct ceph_mds_client *r_mdsc; 263 264 struct kref r_kref; 265 int r_op; /* mds op code */ 266 267 /* operation on what? */ 268 struct inode *r_inode; /* arg1 */ 269 struct dentry *r_dentry; /* arg1 */ 270 struct dentry *r_old_dentry; /* arg2: rename from or link from */ 271 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ 272 char *r_path1, *r_path2; 273 struct ceph_vino r_ino1, r_ino2; 274 275 struct inode *r_parent; /* parent dir inode */ 276 struct inode *r_target_inode; /* resulting inode */ 277 struct inode *r_new_inode; /* new inode (for creates) */ 278 279 #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ 280 #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ 281 #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ 282 #define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ 283 #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ 284 #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ 285 #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ 286 #define CEPH_MDS_R_ASYNC (8) /* async request */ 287 #define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ 288 unsigned long r_req_flags; 289 290 struct mutex r_fill_mutex; 291 292 union ceph_mds_request_args r_args; 293 294 struct ceph_fscrypt_auth *r_fscrypt_auth; 295 u64 r_fscrypt_file; 296 297 u8 *r_altname; /* fscrypt binary crypttext for long filenames */ 298 u32 r_altname_len; /* length of r_altname */ 299 300 int r_fmode; /* file mode, if expecting cap */ 301 int r_request_release_offset; 302 const struct cred *r_cred; 303 struct timespec64 r_stamp; 304 305 /* for choosing which mds to send this request to */ 306 int r_direct_mode; 307 u32 r_direct_hash; /* choose dir frag based on this dentry hash */ 308 309 /* data payload is used for xattr ops */ 310 struct ceph_pagelist *r_pagelist; 311 312 /* what caps shall we drop? */ 313 int r_inode_drop, r_inode_unless; 314 int r_dentry_drop, r_dentry_unless; 315 int r_old_dentry_drop, r_old_dentry_unless; 316 struct inode *r_old_inode; 317 int r_old_inode_drop, r_old_inode_unless; 318 319 struct ceph_msg *r_request; /* original request */ 320 struct ceph_msg *r_reply; 321 struct ceph_mds_reply_info_parsed r_reply_info; 322 int r_err; 323 u32 r_readdir_offset; 324 325 struct page *r_locked_page; 326 int r_dir_caps; 327 int r_num_caps; 328 329 unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ 330 unsigned long r_started; /* start time to measure timeout against */ 331 unsigned long r_start_latency; /* start time to measure latency */ 332 unsigned long r_end_latency; /* finish time to measure latency */ 333 unsigned long r_request_started; /* start time for mds request only, 334 used to measure lease durations */ 335 336 /* link unsafe requests to parent directory, for fsync */ 337 struct inode *r_unsafe_dir; 338 struct list_head r_unsafe_dir_item; 339 340 /* unsafe requests that modify the target inode */ 341 struct list_head r_unsafe_target_item; 342 343 struct ceph_mds_session *r_session; 344 345 int r_attempts; /* resend attempts */ 346 int r_num_fwd; /* number of forward attempts */ 347 int r_resend_mds; /* mds to resend to next, if any*/ 348 u32 r_sent_on_mseq; /* cap mseq request was sent at*/ 349 u64 r_deleg_ino; 350 351 struct list_head r_wait; 352 struct completion r_completion; 353 struct completion r_safe_completion; 354 ceph_mds_request_callback_t r_callback; 355 struct list_head r_unsafe_item; /* per-session unsafe list item */ 356 357 long long r_dir_release_cnt; 358 long long r_dir_ordered_cnt; 359 int r_readdir_cache_idx; 360 361 int r_feature_needed; 362 363 struct ceph_cap_reservation r_caps_reservation; 364 }; 365 366 struct ceph_pool_perm { 367 struct rb_node node; 368 int perm; 369 s64 pool; 370 size_t pool_ns_len; 371 char pool_ns[]; 372 }; 373 374 struct ceph_snapid_map { 375 struct rb_node node; 376 struct list_head lru; 377 atomic_t ref; 378 dev_t dev; 379 u64 snap; 380 unsigned long last_used; 381 }; 382 383 /* 384 * node for list of quotarealm inodes that are not visible from the filesystem 385 * mountpoint, but required to handle, e.g. quotas. 386 */ 387 struct ceph_quotarealm_inode { 388 struct rb_node node; 389 u64 ino; 390 unsigned long timeout; /* last time a lookup failed for this inode */ 391 struct mutex mutex; 392 struct inode *inode; 393 }; 394 395 struct cap_wait { 396 struct list_head list; 397 u64 ino; 398 pid_t tgid; 399 int need; 400 int want; 401 }; 402 403 enum { 404 CEPH_MDSC_STOPPING_BEGIN = 1, 405 CEPH_MDSC_STOPPING_FLUSHING = 2, 406 CEPH_MDSC_STOPPING_FLUSHED = 3, 407 }; 408 409 /* 410 * mds client state 411 */ 412 struct ceph_mds_client { 413 struct ceph_fs_client *fsc; 414 struct mutex mutex; /* all nested structures */ 415 416 struct ceph_mdsmap *mdsmap; 417 struct completion safe_umount_waiters; 418 wait_queue_head_t session_close_wq; 419 struct list_head waiting_for_map; 420 int mdsmap_err; 421 422 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 423 atomic_t num_sessions; 424 int max_sessions; /* len of sessions array */ 425 426 spinlock_t stopping_lock; /* protect snap_empty */ 427 int stopping; /* the stage of shutting down */ 428 atomic_t stopping_blockers; 429 struct completion stopping_waiter; 430 431 atomic64_t quotarealms_count; /* # realms with quota */ 432 /* 433 * We keep a list of inodes we don't see in the mountpoint but that we 434 * need to track quota realms. 435 */ 436 struct rb_root quotarealms_inodes; 437 struct mutex quotarealms_inodes_mutex; 438 439 /* 440 * snap_rwsem will cover cap linkage into snaprealms, and 441 * realm snap contexts. (later, we can do per-realm snap 442 * contexts locks..) the empty list contains realms with no 443 * references (implying they contain no inodes with caps) that 444 * should be destroyed. 445 */ 446 u64 last_snap_seq; 447 struct rw_semaphore snap_rwsem; 448 struct rb_root snap_realms; 449 struct list_head snap_empty; 450 int num_snap_realms; 451 spinlock_t snap_empty_lock; /* protect snap_empty */ 452 453 u64 last_tid; /* most recent mds request */ 454 u64 oldest_tid; /* oldest incomplete mds request, 455 excluding setfilelock requests */ 456 struct rb_root request_tree; /* pending mds requests */ 457 struct delayed_work delayed_work; /* delayed work */ 458 unsigned long last_renew_caps; /* last time we renewed our caps */ 459 struct list_head cap_delay_list; /* caps with delayed release */ 460 spinlock_t cap_delay_lock; /* protects cap_delay_list */ 461 struct list_head snap_flush_list; /* cap_snaps ready to flush */ 462 spinlock_t snap_flush_lock; 463 464 u64 last_cap_flush_tid; 465 struct list_head cap_flush_list; 466 struct list_head cap_dirty_migrating; /* ...that are migration... */ 467 int num_cap_flushing; /* # caps we are flushing */ 468 spinlock_t cap_dirty_lock; /* protects above items */ 469 wait_queue_head_t cap_flushing_wq; 470 471 struct work_struct cap_reclaim_work; 472 atomic_t cap_reclaim_pending; 473 474 /* 475 * Cap reservations 476 * 477 * Maintain a global pool of preallocated struct ceph_caps, referenced 478 * by struct ceph_caps_reservations. This ensures that we preallocate 479 * memory needed to successfully process an MDS response. (If an MDS 480 * sends us cap information and we fail to process it, we will have 481 * problems due to the client and MDS being out of sync.) 482 * 483 * Reservations are 'owned' by a ceph_cap_reservation context. 484 */ 485 spinlock_t caps_list_lock; 486 struct list_head caps_list; /* unused (reserved or 487 unreserved) */ 488 struct list_head cap_wait_list; 489 int caps_total_count; /* total caps allocated */ 490 int caps_use_count; /* in use */ 491 int caps_use_max; /* max used caps */ 492 int caps_reserve_count; /* unused, reserved */ 493 int caps_avail_count; /* unused, unreserved */ 494 int caps_min_count; /* keep at least this many 495 (unreserved) */ 496 spinlock_t dentry_list_lock; 497 struct list_head dentry_leases; /* fifo list */ 498 struct list_head dentry_dir_leases; /* lru list */ 499 500 struct ceph_client_metric metric; 501 502 spinlock_t snapid_map_lock; 503 struct rb_root snapid_map_tree; 504 struct list_head snapid_map_lru; 505 506 struct rw_semaphore pool_perm_rwsem; 507 struct rb_root pool_perm_tree; 508 509 char nodename[__NEW_UTS_LEN + 1]; 510 }; 511 512 extern const char *ceph_mds_op_name(int op); 513 514 extern bool check_session_state(struct ceph_mds_session *s); 515 void inc_session_sequence(struct ceph_mds_session *s); 516 517 extern struct ceph_mds_session * 518 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); 519 520 extern const char *ceph_session_state_name(int s); 521 522 extern struct ceph_mds_session * 523 ceph_get_mds_session(struct ceph_mds_session *s); 524 extern void ceph_put_mds_session(struct ceph_mds_session *s); 525 526 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 527 struct ceph_msg *msg, int mds); 528 529 extern int ceph_mdsc_init(struct ceph_fs_client *fsc); 530 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 531 extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); 532 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); 533 534 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 535 536 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 537 extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, 538 struct inode *dir); 539 extern struct ceph_mds_request * 540 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 541 extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 542 struct inode *dir, 543 struct ceph_mds_request *req); 544 int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, 545 struct ceph_mds_request *req, 546 ceph_mds_request_wait_callback_t wait_func); 547 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, 548 struct inode *dir, 549 struct ceph_mds_request *req); 550 extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); 551 extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); 552 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) 553 { 554 kref_get(&req->r_kref); 555 } 556 extern void ceph_mdsc_release_request(struct kref *kref); 557 static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) 558 { 559 kref_put(&req->r_kref, ceph_mdsc_release_request); 560 } 561 562 extern void send_flush_mdlog(struct ceph_mds_session *s); 563 extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, 564 void (*cb)(struct ceph_mds_session *), 565 bool check_state); 566 extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); 567 extern void __ceph_queue_cap_release(struct ceph_mds_session *session, 568 struct ceph_cap *cap); 569 extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, 570 struct ceph_mds_session *session); 571 extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); 572 extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); 573 extern int ceph_iterate_session_caps(struct ceph_mds_session *session, 574 int (*cb)(struct inode *, int mds, void *), 575 void *arg); 576 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 577 578 static inline void ceph_mdsc_free_path(char *path, int len) 579 { 580 if (!IS_ERR_OR_NULL(path)) 581 __putname(path - (PATH_MAX - 1 - len)); 582 } 583 584 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 585 int for_wire); 586 587 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); 588 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 589 struct dentry *dentry, char action, 590 u32 seq); 591 592 extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, 593 struct ceph_msg *msg); 594 extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, 595 struct ceph_msg *msg); 596 597 extern struct ceph_mds_session * 598 ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); 599 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 600 struct ceph_mds_session *session); 601 602 extern int ceph_trim_caps(struct ceph_mds_client *mdsc, 603 struct ceph_mds_session *session, 604 int max_caps); 605 606 static inline int ceph_wait_on_async_create(struct inode *inode) 607 { 608 struct ceph_inode_info *ci = ceph_inode(inode); 609 610 return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, 611 TASK_KILLABLE); 612 } 613 614 extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); 615 extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); 616 extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); 617 #endif 618