#ifndef _FS_CEPH_MDS_CLIENT_H
#define _FS_CEPH_MDS_CLIENT_H

#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/utsname.h>

#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>

/*
 * Some lock dependencies (outer lock listed first; indented locks may
 * be taken while the one above is held):
 *
 * session->s_mutex
 *         mdsc->mutex
 *
 *         mdsc->snap_rwsem
 *
 *         ci->i_ceph_lock
 *                 mdsc->snap_flush_lock
 *                 mdsc->cap_delay_lock
 *
 */

struct ceph_fs_client;
struct ceph_cap;

/*
 * parsed info about a single inode.  pointers are into the encoded
 * on-wire structures within the mds reply message payload.
 */
struct ceph_mds_reply_info_in {
	struct ceph_mds_reply_inode *in;
	struct ceph_dir_layout dir_layout;
	u32 symlink_len;
	char *symlink;			/* symlink target (not NUL-terminated;
					   length is symlink_len) */
	u32 xattr_len;
	char *xattr_data;		/* encoded xattr blob in the reply */
	u64 inline_version;
	u32 inline_len;
	char *inline_data;		/* inline file data, if any */
	u32 pool_ns_len;
	char *pool_ns_data;		/* pool namespace name, if any */
};

/*
 * a single parsed directory entry from a readdir reply; name/lease
 * point into the reply message payload.
 */
struct ceph_mds_reply_dir_entry {
	char                          *name;
	u32                           name_len;
	struct ceph_mds_reply_lease   *lease;
	struct ceph_mds_reply_info_in inode;
	loff_t                        offset;
};

/*
 * parsed info about an mds reply, including information about
 * either: 1) the target inode and/or its parent directory and dentry,
 * and directory contents (for readdir results), or
 * 2) the file range lock info (for fcntl F_GETLK results).
 */
struct ceph_mds_reply_info_parsed {
	struct ceph_mds_reply_head    *head;

	/* trace */
	struct ceph_mds_reply_info_in diri, targeti;
	struct ceph_mds_reply_dirfrag *dirfrag;
	char                          *dname;
	u32                           dname_len;
	struct ceph_mds_reply_lease   *dlease;

	/* extra; interpretation depends on the request op */
	union {
		/* for fcntl F_GETLK results */
		struct ceph_filelock *filelock_reply;

		/* for readdir results */
		struct {
			struct ceph_mds_reply_dirfrag *dir_dir;
			size_t                        dir_buf_size;
			int                           dir_nr;
			bool                          dir_end;
			bool                          dir_complete;
			bool                          hash_order;
			bool                          offset_hash;
			struct ceph_mds_reply_dir_entry *dir_entries;
		};

		/* for create results */
		struct {
			bool has_create_ino;
			u64 ino;
		};
	};

	/* encoded blob describing snapshot contexts for certain
	   operations (e.g., open) */
	void *snapblob;
	int snapblob_len;
};


/*
 * cap releases are batched and sent to the MDS en masse.
 *
 * Account for per-message overhead of mds_cap_release header
 * and __le32 for osd epoch barrier trailing field.
 */
#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) -		\
				sizeof(struct ceph_mds_cap_release)) /	\
			       sizeof(struct ceph_mds_cap_item))


/*
 * state associated with each MDS<->client session
 */
enum {
	CEPH_MDS_SESSION_NEW = 1,
	CEPH_MDS_SESSION_OPENING = 2,
	CEPH_MDS_SESSION_OPEN = 3,
	CEPH_MDS_SESSION_HUNG = 4,
	CEPH_MDS_SESSION_CLOSING = 5,
	CEPH_MDS_SESSION_RESTARTING = 6,
	CEPH_MDS_SESSION_RECONNECTING = 7,
	CEPH_MDS_SESSION_REJECTED = 8,
};

struct ceph_mds_session {
	struct ceph_mds_client *s_mdsc;
	int               s_mds;	/* mds rank this session talks to */
	int               s_state;	/* CEPH_MDS_SESSION_* */
	unsigned long     s_ttl;	/* time until mds kills us */
	u64               s_seq;	/* incoming msg seq # */
	struct mutex      s_mutex;	/* serialize session messages */

	struct ceph_connection s_con;

	struct ceph_auth_handshake s_auth;

	/* protected by s_gen_ttl_lock */
	spinlock_t        s_gen_ttl_lock;
	u32               s_cap_gen;	/* inc each time we get mds stale msg */
	unsigned long     s_cap_ttl;	/* when session caps expire */

	/* protected by s_cap_lock */
	spinlock_t        s_cap_lock;
	struct list_head  s_caps;	/* all caps issued by this session */
	int               s_nr_caps, s_trim_caps;
	int               s_num_cap_releases;
	int               s_cap_reconnect;
	int               s_readonly;
	struct list_head  s_cap_releases; /* waiting cap_release messages */
	struct ceph_cap  *s_cap_iterator;

	/* protected by mutex */
	struct list_head  s_cap_flushing;	/* inodes w/ flushing caps */
	unsigned long     s_renew_requested;	/* last time we sent a renew req */
	u64               s_renew_seq;

	refcount_t        s_ref;	/* see ceph_get_mds_session() /
					   ceph_put_mds_session() */
	struct list_head  s_waiting;	/* waiting requests */
	struct list_head  s_unsafe;	/* unsafe requests */
};

/*
 * modes of choosing which MDS to send a request to
 */
enum {
	USE_ANY_MDS,
	USE_RANDOM_MDS,
	USE_AUTH_MDS,	/* prefer authoritative mds for this metadata item */
};

struct ceph_mds_request;
struct ceph_mds_client;

/*
 * request completion callback
 */
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
					     struct ceph_mds_request *req);
/*
 * wait for request completion callback
 */
typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
						 struct ceph_mds_request *req);

/*
 * an in-flight mds request
 */
struct ceph_mds_request {
	u64 r_tid;                   /* transaction id */
	struct rb_node r_node;       /* node in mdsc->request_tree */
	struct ceph_mds_client *r_mdsc;

	int r_op;                    /* mds op code */

	/* operation on what? */
	struct inode *r_inode;              /* arg1 */
	struct dentry *r_dentry;            /* arg1 */
	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
	struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */
	char *r_path1, *r_path2;
	struct ceph_vino r_ino1, r_ino2;

	struct inode *r_parent;             /* parent dir inode */
	struct inode *r_target_inode;       /* resulting inode */

/* bit numbers for the r_req_flags bitmask below */
#define CEPH_MDS_R_DIRECT_IS_HASH	(1) /* r_direct_hash is valid */
#define CEPH_MDS_R_ABORTED		(2) /* call was aborted */
#define CEPH_MDS_R_GOT_UNSAFE		(3) /* got an unsafe reply */
#define CEPH_MDS_R_GOT_SAFE		(4) /* got a safe reply */
#define CEPH_MDS_R_GOT_RESULT		(5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE	(6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED	(7) /* is r_parent->i_rwsem wlocked? */
	unsigned long	r_req_flags;

	struct mutex r_fill_mutex;

	union ceph_mds_request_args r_args;
	int r_fmode;        /* file mode, if expecting cap */
	kuid_t r_uid;
	kgid_t r_gid;
	struct timespec r_stamp;	/* NOTE(review): struct timespec is not
					   y2038-safe; later kernels switched
					   to timespec64 — confirm before
					   backporting changes here */

	/* for choosing which mds to send this request to */
	int r_direct_mode;	/* USE_ANY_MDS / USE_RANDOM_MDS / USE_AUTH_MDS */
	u32 r_direct_hash;	/* choose dir frag based on this dentry hash */

	/* data payload is used for xattr ops */
	struct ceph_pagelist *r_pagelist;

	/* what caps shall we drop? */
	int r_inode_drop, r_inode_unless;
	int r_dentry_drop, r_dentry_unless;
	int r_old_dentry_drop, r_old_dentry_unless;
	struct inode *r_old_inode;
	int r_old_inode_drop, r_old_inode_unless;

	struct ceph_msg  *r_request;  /* original request */
	int r_request_release_offset;
	struct ceph_msg  *r_reply;
	struct ceph_mds_reply_info_parsed r_reply_info;
	struct page *r_locked_page;
	int r_err;

	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
	unsigned long r_started;  /* start time to measure timeout against */
	unsigned long r_request_started;  /* start time for mds request only,
					     used to measure lease durations */

	/* link unsafe requests to parent directory, for fsync */
	struct inode     *r_unsafe_dir;
	struct list_head r_unsafe_dir_item;

	/* unsafe requests that modify the target inode */
	struct list_head r_unsafe_target_item;

	struct ceph_mds_session *r_session;

	int               r_attempts;     /* resend attempts */
	int               r_num_fwd;      /* number of forward attempts */
	int               r_resend_mds;   /* mds to resend to next, if any */
	u32               r_sent_on_mseq; /* cap mseq request was sent at */

	struct kref       r_kref;	/* see ceph_mdsc_get_request() /
					   ceph_mdsc_put_request() */
	struct list_head  r_wait;
	struct completion r_completion;
	struct completion r_safe_completion;
	ceph_mds_request_callback_t r_callback;
	ceph_mds_request_wait_callback_t r_wait_for_completion;
	struct list_head  r_unsafe_item;  /* per-session unsafe list item */

	long long         r_dir_release_cnt;
	long long         r_dir_ordered_cnt;
	int               r_readdir_cache_idx;
	u32               r_readdir_offset;

	struct ceph_cap_reservation r_caps_reservation;
	int r_num_caps;
};

/*
 * per-pool access permission, kept in mdsc->pool_perm_tree; pool_ns is
 * a flexible array member sized pool_ns_len.
 */
struct ceph_pool_perm {
	struct rb_node node;
	int perm;
	s64 pool;
	size_t pool_ns_len;
	char pool_ns[];
};

/*
 * mds client state
 */
struct ceph_mds_client {
	struct ceph_fs_client   *fsc;
	struct mutex            mutex;         /* all nested structures */

	struct ceph_mdsmap      *mdsmap;
	struct completion       safe_umount_waiters;
	wait_queue_head_t       session_close_wq;
	struct list_head        waiting_for_map;
	int                     mdsmap_err;

	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
	atomic_t                num_sessions;
	int                     max_sessions;  /* len of s_mds_sessions */
	int                     stopping;      /* true if shutting down */

	/*
	 * snap_rwsem will cover cap linkage into snaprealms, and
	 * realm snap contexts.  (later, we can do per-realm snap
	 * contexts locks..)  the empty list contains realms with no
	 * references (implying they contain no inodes with caps) that
	 * should be destroyed.
	 */
	u64                     last_snap_seq;
	struct rw_semaphore     snap_rwsem;
	struct rb_root          snap_realms;
	struct list_head        snap_empty;
	spinlock_t              snap_empty_lock;  /* protect snap_empty */

	u64                     last_tid;      /* most recent mds request */
	u64                     oldest_tid;    /* oldest incomplete mds request,
						  excluding setfilelock requests */
	struct rb_root          request_tree;  /* pending mds requests */
	struct delayed_work     delayed_work;  /* delayed work */
	unsigned long    last_renew_caps;  /* last time we renewed our caps */
	struct list_head cap_delay_list;   /* caps with delayed release */
	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
	spinlock_t       snap_flush_lock;  /* protects snap_flush_list */

	u64               last_cap_flush_tid;
	struct list_head  cap_flush_list;
	struct list_head  cap_dirty;           /* inodes with dirty caps */
	struct list_head  cap_dirty_migrating; /* ...that are migrating */
	int               num_cap_flushing;    /* # caps we are flushing */
	spinlock_t        cap_dirty_lock;      /* protects above items */
	wait_queue_head_t cap_flushing_wq;

	/*
	 * Cap reservations
	 *
	 * Maintain a global pool of preallocated struct ceph_caps, referenced
	 * by struct ceph_caps_reservations.  This ensures that we preallocate
	 * memory needed to successfully process an MDS response.  (If an MDS
	 * sends us cap information and we fail to process it, we will have
	 * problems due to the client and MDS being out of sync.)
	 *
	 * Reservations are 'owned' by a ceph_cap_reservation context.
	 */
	spinlock_t        caps_list_lock;
	struct list_head  caps_list;           /* unused (reserved or
						  unreserved) */
	int               caps_total_count;    /* total caps allocated */
	int               caps_use_count;      /* in use */
	int               caps_reserve_count;  /* unused, reserved */
	int               caps_avail_count;    /* unused, unreserved */
	int               caps_min_count;      /* keep at least this many
						  (unreserved) */
	spinlock_t        dentry_lru_lock;
	struct list_head  dentry_lru;
	int               num_dentry;

	struct rw_semaphore pool_perm_rwsem;
	struct rb_root      pool_perm_tree;	/* of struct ceph_pool_perm */

	char nodename[__NEW_UTS_LEN + 1];
};

extern const char *ceph_mds_op_name(int op);

extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);

/* take a reference on a session; drop with ceph_put_mds_session() */
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
	refcount_inc(&s->s_ref);
	return s;
}

extern const char *ceph_session_state_name(int s);

extern void ceph_put_mds_session(struct ceph_mds_session *s);

extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
			     struct ceph_msg *msg, int mds);

extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);

extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);

extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
					   struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				     struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				struct inode *dir,
				struct ceph_mds_request *req);
/* take a reference on a request; drop with ceph_mdsc_put_request() */
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
	kref_get(&req->r_kref);
}
extern void ceph_mdsc_release_request(struct kref *kref);
/* drop a request reference; frees via ceph_mdsc_release_request() on last put */
static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
	kref_put(&req->r_kref, ceph_mdsc_release_request);
}

extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
				   struct ceph_mds_session *session);

extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);

extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
				  int stop_on_nosnap);

extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
				     struct inode *inode,
				     struct dentry *dentry, char action,
				     u32 seq);

extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
				    struct ceph_msg *msg);
extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
				   struct ceph_msg *msg);

extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
						  struct ceph_mds_session *session);

#endif