xref: /openbmc/linux/fs/ceph/mds_client.c (revision 985b9ee8)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
23d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
32f2dc053SSage Weil 
4496e5955SSage Weil #include <linux/fs.h>
52f2dc053SSage Weil #include <linux/wait.h>
65a0e3ad6STejun Heo #include <linux/slab.h>
754008399SYan, Zheng #include <linux/gfp.h>
82f2dc053SSage Weil #include <linux/sched.h>
93d14c5d2SYehuda Sadeh #include <linux/debugfs.h>
103d14c5d2SYehuda Sadeh #include <linux/seq_file.h>
113e0708b9SYan, Zheng #include <linux/ratelimit.h>
129ba1e224SXiubo Li #include <linux/bits.h>
1370c94820SXiubo Li #include <linux/ktime.h>
14d517b398SXiubo Li #include <linux/bitmap.h>
152f2dc053SSage Weil 
162f2dc053SSage Weil #include "super.h"
173d14c5d2SYehuda Sadeh #include "mds_client.h"
182d332d5bSJeff Layton #include "crypto.h"
193d14c5d2SYehuda Sadeh 
201fe60e51SSage Weil #include <linux/ceph/ceph_features.h>
213d14c5d2SYehuda Sadeh #include <linux/ceph/messenger.h>
223d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
233d14c5d2SYehuda Sadeh #include <linux/ceph/pagelist.h>
243d14c5d2SYehuda Sadeh #include <linux/ceph/auth.h>
253d14c5d2SYehuda Sadeh #include <linux/ceph/debugfs.h>
262f2dc053SSage Weil 
2781c5a148SYan, Zheng #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
2881c5a148SYan, Zheng 
/*
 * A cluster of MDS (metadata server) daemons is responsible for
 * managing the file system namespace (the directory hierarchy and
 * inodes) and for coordinating shared access to storage.  Metadata is
 * partitioned hierarchically across a number of servers, and that
 * partition varies over time as the cluster adjusts the distribution
 * in order to balance load.
 *
 * The MDS client is primarily responsible for managing synchronous
 * metadata requests for operations like open, unlink, and so forth.
 * If there is an MDS failure, we find out about it when we (possibly
 * request and) receive a new MDS map, and can resubmit affected
 * requests.
 *
 * For the most part, though, we take advantage of a lossless
 * communications channel to the MDS, and do not need to worry about
 * timing out or resubmitting requests.
 *
 * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
 * the session times out and goes stale, our leases and capabilities
 * are no longer valid.
 */
532f2dc053SSage Weil 
5420cb34aeSSage Weil struct ceph_reconnect_state {
5581c5a148SYan, Zheng 	struct ceph_mds_session *session;
5681c5a148SYan, Zheng 	int nr_caps, nr_realms;
5720cb34aeSSage Weil 	struct ceph_pagelist *pagelist;
58121f22a1SYan, Zheng 	unsigned msg_version;
5981c5a148SYan, Zheng 	bool allow_multi;
6020cb34aeSSage Weil };
6120cb34aeSSage Weil 
622f2dc053SSage Weil static void __wake_requests(struct ceph_mds_client *mdsc,
632f2dc053SSage Weil 			    struct list_head *head);
64e3ec8d68SYan, Zheng static void ceph_cap_release_work(struct work_struct *work);
6537c4efc1SYan, Zheng static void ceph_cap_reclaim_work(struct work_struct *work);
662f2dc053SSage Weil 
679e32789fSTobias Klauser static const struct ceph_connection_operations mds_con_ops;
682f2dc053SSage Weil 
692f2dc053SSage Weil 
702f2dc053SSage Weil /*
712f2dc053SSage Weil  * mds reply parsing
722f2dc053SSage Weil  */
732f2dc053SSage Weil 
parse_reply_info_quota(void ** p,void * end,struct ceph_mds_reply_info_in * info)74b37fe1f9SYan, Zheng static int parse_reply_info_quota(void **p, void *end,
75b37fe1f9SYan, Zheng 				  struct ceph_mds_reply_info_in *info)
76b37fe1f9SYan, Zheng {
77b37fe1f9SYan, Zheng 	u8 struct_v, struct_compat;
78b37fe1f9SYan, Zheng 	u32 struct_len;
79b37fe1f9SYan, Zheng 
80b37fe1f9SYan, Zheng 	ceph_decode_8_safe(p, end, struct_v, bad);
81b37fe1f9SYan, Zheng 	ceph_decode_8_safe(p, end, struct_compat, bad);
82b37fe1f9SYan, Zheng 	/* struct_v is expected to be >= 1. we only
83b37fe1f9SYan, Zheng 	 * understand encoding with struct_compat == 1. */
84b37fe1f9SYan, Zheng 	if (!struct_v || struct_compat != 1)
85b37fe1f9SYan, Zheng 		goto bad;
86b37fe1f9SYan, Zheng 	ceph_decode_32_safe(p, end, struct_len, bad);
87b37fe1f9SYan, Zheng 	ceph_decode_need(p, end, struct_len, bad);
88b37fe1f9SYan, Zheng 	end = *p + struct_len;
89b37fe1f9SYan, Zheng 	ceph_decode_64_safe(p, end, info->max_bytes, bad);
90b37fe1f9SYan, Zheng 	ceph_decode_64_safe(p, end, info->max_files, bad);
91b37fe1f9SYan, Zheng 	*p = end;
92b37fe1f9SYan, Zheng 	return 0;
93b37fe1f9SYan, Zheng bad:
94b37fe1f9SYan, Zheng 	return -EIO;
95b37fe1f9SYan, Zheng }
96b37fe1f9SYan, Zheng 
972f2dc053SSage Weil /*
982f2dc053SSage Weil  * parse individual inode info
992f2dc053SSage Weil  */
parse_reply_info_in(void ** p,void * end,struct ceph_mds_reply_info_in * info,u64 features)1002f2dc053SSage Weil static int parse_reply_info_in(void **p, void *end,
10114303d20SSage Weil 			       struct ceph_mds_reply_info_in *info,
10212b4629aSIlya Dryomov 			       u64 features)
1032f2dc053SSage Weil {
104b37fe1f9SYan, Zheng 	int err = 0;
105b37fe1f9SYan, Zheng 	u8 struct_v = 0;
1062f2dc053SSage Weil 
107b37fe1f9SYan, Zheng 	if (features == (u64)-1) {
108b37fe1f9SYan, Zheng 		u32 struct_len;
109b37fe1f9SYan, Zheng 		u8 struct_compat;
110b37fe1f9SYan, Zheng 		ceph_decode_8_safe(p, end, struct_v, bad);
111b37fe1f9SYan, Zheng 		ceph_decode_8_safe(p, end, struct_compat, bad);
112b37fe1f9SYan, Zheng 		/* struct_v is expected to be >= 1. we only understand
113b37fe1f9SYan, Zheng 		 * encoding with struct_compat == 1. */
114b37fe1f9SYan, Zheng 		if (!struct_v || struct_compat != 1)
115b37fe1f9SYan, Zheng 			goto bad;
116b37fe1f9SYan, Zheng 		ceph_decode_32_safe(p, end, struct_len, bad);
117b37fe1f9SYan, Zheng 		ceph_decode_need(p, end, struct_len, bad);
118b37fe1f9SYan, Zheng 		end = *p + struct_len;
119b37fe1f9SYan, Zheng 	}
120b37fe1f9SYan, Zheng 
121b37fe1f9SYan, Zheng 	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
1222f2dc053SSage Weil 	info->in = *p;
1232f2dc053SSage Weil 	*p += sizeof(struct ceph_mds_reply_inode) +
1242f2dc053SSage Weil 		sizeof(*info->in->fragtree.splits) *
1252f2dc053SSage Weil 		le32_to_cpu(info->in->fragtree.nsplits);
1262f2dc053SSage Weil 
1272f2dc053SSage Weil 	ceph_decode_32_safe(p, end, info->symlink_len, bad);
1282f2dc053SSage Weil 	ceph_decode_need(p, end, info->symlink_len, bad);
1292f2dc053SSage Weil 	info->symlink = *p;
1302f2dc053SSage Weil 	*p += info->symlink_len;
1312f2dc053SSage Weil 
13214303d20SSage Weil 	ceph_decode_copy_safe(p, end, &info->dir_layout,
13314303d20SSage Weil 			      sizeof(info->dir_layout), bad);
1342f2dc053SSage Weil 	ceph_decode_32_safe(p, end, info->xattr_len, bad);
1352f2dc053SSage Weil 	ceph_decode_need(p, end, info->xattr_len, bad);
1362f2dc053SSage Weil 	info->xattr_data = *p;
1372f2dc053SSage Weil 	*p += info->xattr_len;
138fb01d1f8SYan, Zheng 
139b37fe1f9SYan, Zheng 	if (features == (u64)-1) {
140b37fe1f9SYan, Zheng 		/* inline data */
141b37fe1f9SYan, Zheng 		ceph_decode_64_safe(p, end, info->inline_version, bad);
142b37fe1f9SYan, Zheng 		ceph_decode_32_safe(p, end, info->inline_len, bad);
143b37fe1f9SYan, Zheng 		ceph_decode_need(p, end, info->inline_len, bad);
144b37fe1f9SYan, Zheng 		info->inline_data = *p;
145b37fe1f9SYan, Zheng 		*p += info->inline_len;
146b37fe1f9SYan, Zheng 		/* quota */
147b37fe1f9SYan, Zheng 		err = parse_reply_info_quota(p, end, info);
148b37fe1f9SYan, Zheng 		if (err < 0)
149b37fe1f9SYan, Zheng 			goto out_bad;
150b37fe1f9SYan, Zheng 		/* pool namespace */
151b37fe1f9SYan, Zheng 		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
152b37fe1f9SYan, Zheng 		if (info->pool_ns_len > 0) {
153b37fe1f9SYan, Zheng 			ceph_decode_need(p, end, info->pool_ns_len, bad);
154b37fe1f9SYan, Zheng 			info->pool_ns_data = *p;
155b37fe1f9SYan, Zheng 			*p += info->pool_ns_len;
156b37fe1f9SYan, Zheng 		}
157245ce991SJeff Layton 
158245ce991SJeff Layton 		/* btime */
159245ce991SJeff Layton 		ceph_decode_need(p, end, sizeof(info->btime), bad);
160245ce991SJeff Layton 		ceph_decode_copy(p, &info->btime, sizeof(info->btime));
161245ce991SJeff Layton 
162245ce991SJeff Layton 		/* change attribute */
163a35ead31SJeff Layton 		ceph_decode_64_safe(p, end, info->change_attr, bad);
164b37fe1f9SYan, Zheng 
16508796873SYan, Zheng 		/* dir pin */
16608796873SYan, Zheng 		if (struct_v >= 2) {
16708796873SYan, Zheng 			ceph_decode_32_safe(p, end, info->dir_pin, bad);
16808796873SYan, Zheng 		} else {
16908796873SYan, Zheng 			info->dir_pin = -ENODATA;
17008796873SYan, Zheng 		}
17108796873SYan, Zheng 
172193e7b37SDavid Disseldorp 		/* snapshot birth time, remains zero for v<=2 */
173193e7b37SDavid Disseldorp 		if (struct_v >= 3) {
174193e7b37SDavid Disseldorp 			ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
175193e7b37SDavid Disseldorp 			ceph_decode_copy(p, &info->snap_btime,
176193e7b37SDavid Disseldorp 					 sizeof(info->snap_btime));
177193e7b37SDavid Disseldorp 		} else {
178193e7b37SDavid Disseldorp 			memset(&info->snap_btime, 0, sizeof(info->snap_btime));
179193e7b37SDavid Disseldorp 		}
180193e7b37SDavid Disseldorp 
181e7f72952SYanhu Cao 		/* snapshot count, remains zero for v<=3 */
182e7f72952SYanhu Cao 		if (struct_v >= 4) {
183e7f72952SYanhu Cao 			ceph_decode_64_safe(p, end, info->rsnaps, bad);
184e7f72952SYanhu Cao 		} else {
185e7f72952SYanhu Cao 			info->rsnaps = 0;
186e7f72952SYanhu Cao 		}
187e7f72952SYanhu Cao 
1882d332d5bSJeff Layton 		if (struct_v >= 5) {
1892d332d5bSJeff Layton 			u32 alen;
1902d332d5bSJeff Layton 
1912d332d5bSJeff Layton 			ceph_decode_32_safe(p, end, alen, bad);
1922d332d5bSJeff Layton 
1932d332d5bSJeff Layton 			while (alen--) {
1942d332d5bSJeff Layton 				u32 len;
1952d332d5bSJeff Layton 
1962d332d5bSJeff Layton 				/* key */
1972d332d5bSJeff Layton 				ceph_decode_32_safe(p, end, len, bad);
1982d332d5bSJeff Layton 				ceph_decode_skip_n(p, end, len, bad);
1992d332d5bSJeff Layton 				/* value */
2002d332d5bSJeff Layton 				ceph_decode_32_safe(p, end, len, bad);
2012d332d5bSJeff Layton 				ceph_decode_skip_n(p, end, len, bad);
2022d332d5bSJeff Layton 			}
2032d332d5bSJeff Layton 		}
2042d332d5bSJeff Layton 
2052d332d5bSJeff Layton 		/* fscrypt flag -- ignore */
2062d332d5bSJeff Layton 		if (struct_v >= 6)
2072d332d5bSJeff Layton 			ceph_decode_skip_8(p, end, bad);
2082d332d5bSJeff Layton 
2092d332d5bSJeff Layton 		info->fscrypt_auth = NULL;
2102d332d5bSJeff Layton 		info->fscrypt_auth_len = 0;
2112d332d5bSJeff Layton 		info->fscrypt_file = NULL;
2122d332d5bSJeff Layton 		info->fscrypt_file_len = 0;
2132d332d5bSJeff Layton 		if (struct_v >= 7) {
2142d332d5bSJeff Layton 			ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
2152d332d5bSJeff Layton 			if (info->fscrypt_auth_len) {
2162d332d5bSJeff Layton 				info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
2172d332d5bSJeff Layton 							     GFP_KERNEL);
2182d332d5bSJeff Layton 				if (!info->fscrypt_auth)
2192d332d5bSJeff Layton 					return -ENOMEM;
2202d332d5bSJeff Layton 				ceph_decode_copy_safe(p, end, info->fscrypt_auth,
2212d332d5bSJeff Layton 						      info->fscrypt_auth_len, bad);
2222d332d5bSJeff Layton 			}
2232d332d5bSJeff Layton 			ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
2242d332d5bSJeff Layton 			if (info->fscrypt_file_len) {
2252d332d5bSJeff Layton 				info->fscrypt_file = kmalloc(info->fscrypt_file_len,
2262d332d5bSJeff Layton 							     GFP_KERNEL);
2272d332d5bSJeff Layton 				if (!info->fscrypt_file)
2282d332d5bSJeff Layton 					return -ENOMEM;
2292d332d5bSJeff Layton 				ceph_decode_copy_safe(p, end, info->fscrypt_file,
2302d332d5bSJeff Layton 						      info->fscrypt_file_len, bad);
2312d332d5bSJeff Layton 			}
2322d332d5bSJeff Layton 		}
233b37fe1f9SYan, Zheng 		*p = end;
234b37fe1f9SYan, Zheng 	} else {
2352d332d5bSJeff Layton 		/* legacy (unversioned) struct */
236fb01d1f8SYan, Zheng 		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
237fb01d1f8SYan, Zheng 			ceph_decode_64_safe(p, end, info->inline_version, bad);
238fb01d1f8SYan, Zheng 			ceph_decode_32_safe(p, end, info->inline_len, bad);
239fb01d1f8SYan, Zheng 			ceph_decode_need(p, end, info->inline_len, bad);
240fb01d1f8SYan, Zheng 			info->inline_data = *p;
241fb01d1f8SYan, Zheng 			*p += info->inline_len;
242fb01d1f8SYan, Zheng 		} else
243fb01d1f8SYan, Zheng 			info->inline_version = CEPH_INLINE_NONE;
244fb01d1f8SYan, Zheng 
245fb18a575SLuis Henriques 		if (features & CEPH_FEATURE_MDS_QUOTA) {
246b37fe1f9SYan, Zheng 			err = parse_reply_info_quota(p, end, info);
247b37fe1f9SYan, Zheng 			if (err < 0)
248b37fe1f9SYan, Zheng 				goto out_bad;
249fb18a575SLuis Henriques 		} else {
250fb18a575SLuis Henriques 			info->max_bytes = 0;
251fb18a575SLuis Henriques 			info->max_files = 0;
252fb18a575SLuis Henriques 		}
253fb18a575SLuis Henriques 
254779fe0fbSYan, Zheng 		info->pool_ns_len = 0;
255779fe0fbSYan, Zheng 		info->pool_ns_data = NULL;
2565ea5c5e0SYan, Zheng 		if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
2575ea5c5e0SYan, Zheng 			ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
258779fe0fbSYan, Zheng 			if (info->pool_ns_len > 0) {
2595ea5c5e0SYan, Zheng 				ceph_decode_need(p, end, info->pool_ns_len, bad);
260779fe0fbSYan, Zheng 				info->pool_ns_data = *p;
2615ea5c5e0SYan, Zheng 				*p += info->pool_ns_len;
262779fe0fbSYan, Zheng 			}
2635ea5c5e0SYan, Zheng 		}
26408796873SYan, Zheng 
265245ce991SJeff Layton 		if (features & CEPH_FEATURE_FS_BTIME) {
266245ce991SJeff Layton 			ceph_decode_need(p, end, sizeof(info->btime), bad);
267245ce991SJeff Layton 			ceph_decode_copy(p, &info->btime, sizeof(info->btime));
268a35ead31SJeff Layton 			ceph_decode_64_safe(p, end, info->change_attr, bad);
269245ce991SJeff Layton 		}
270245ce991SJeff Layton 
27108796873SYan, Zheng 		info->dir_pin = -ENODATA;
272e7f72952SYanhu Cao 		/* info->snap_btime and info->rsnaps remain zero */
273b37fe1f9SYan, Zheng 	}
2742f2dc053SSage Weil 	return 0;
2752f2dc053SSage Weil bad:
276b37fe1f9SYan, Zheng 	err = -EIO;
277b37fe1f9SYan, Zheng out_bad:
2782f2dc053SSage Weil 	return err;
2792f2dc053SSage Weil }
2802f2dc053SSage Weil 
/*
 * Locate a dirfrag record in the reply and point *dirfrag at it.
 *
 * When features == (u64)-1 the record is wrapped in a versioned header
 * (struct_v, struct_compat, length) and, on success, *p is left at the
 * end of that wrapper.  Otherwise *p is left right behind the fixed
 * part plus (*dirfrag)->ndist 32-bit entries.
 *
 * Returns 0 on success or -EIO if the data does not fit or the
 * version/compat bytes are not acceptable.
 */
static int parse_reply_info_dir(void **p, void *end,
				struct ceph_mds_reply_dirfrag **dirfrag,
				u64 features)
{
	const bool versioned = (features == (u64)-1);
	u32 ndist;

	if (versioned) {
		u32 body_len;
		u8 version;
		u8 compat;

		ceph_decode_8_safe(p, end, version, corrupt);
		ceph_decode_8_safe(p, end, compat, corrupt);
		/* version 0 is never valid; only compat 1 is understood */
		if (version == 0 || compat != 1)
			goto corrupt;
		ceph_decode_32_safe(p, end, body_len, corrupt);
		ceph_decode_need(p, end, body_len, corrupt);
		end = *p + body_len;
	}

	ceph_decode_need(p, end, sizeof(**dirfrag), corrupt);
	*dirfrag = *p;
	ndist = le32_to_cpu((*dirfrag)->ndist);

	/* fixed part followed by ndist u32 values */
	*p += sizeof(**dirfrag) + sizeof(u32) * ndist;
	if (unlikely(*p > end))
		goto corrupt;

	/* a versioned record may carry trailing bytes; step over them */
	if (versioned)
		*p = end;
	return 0;

corrupt:
	return -EIO;
}
310b37fe1f9SYan, Zheng 
/*
 * Parse a dentry lease and, if the encoding carries one, the dentry's
 * alternate name.
 *
 * @p:           cursor into the reply; moved past the lease on success
 * @end:         end of the readable buffer
 * @lease:       set to point at the lease inside the buffer
 * @features:    (u64)-1 selects the versioned encoding; anything else
 *               means a bare struct with no header and no altname
 * @altname_len: set to the alternate name length, 0 if there is none
 * @altname:     set to point at the alternate name, NULL if none
 *
 * Returns 0 on success or -EIO on a malformed buffer.
 */
static int parse_reply_info_lease(void **p, void *end,
				  struct ceph_mds_reply_lease **lease,
				  u64 features, u32 *altname_len, u8 **altname)
{
	u8 struct_v = 0;
	u32 struct_len;
	void *lend;

	/* no alternate name unless a v2+ versioned lease supplies one */
	*altname = NULL;
	*altname_len = 0;

	if (features == (u64)-1) {
		u8 struct_compat;

		ceph_decode_8_safe(p, end, struct_v, bad);
		ceph_decode_8_safe(p, end, struct_compat, bad);

		/* struct_v is expected to be >= 1. we only understand
		 * encoding whose struct_compat == 1. */
		if (!struct_v || struct_compat != 1)
			goto bad;

		ceph_decode_32_safe(p, end, struct_len, bad);

		/*
		 * *lease is handed out as a pointer to a complete struct,
		 * so a payload shorter than that struct cannot be valid.
		 */
		if (struct_len < sizeof(**lease))
			goto bad;
	} else {
		struct_len = sizeof(**lease);
	}

	/* check for room first, only then form the end-of-lease pointer */
	ceph_decode_need(p, end, struct_len, bad);
	lend = *p + struct_len;
	*lease = *p;
	*p += sizeof(**lease);

	/* struct_v stays 0 for the legacy layout, so this is versioned-only */
	if (struct_v >= 2) {
		/* the alternate name has to lie inside this lease's payload */
		ceph_decode_32_safe(p, lend, *altname_len, bad);
		ceph_decode_need(p, lend, *altname_len, bad);
		*altname = *p;
		*p += *altname_len;
	}

	/* skip anything a newer encoder appended */
	*p = lend;
	return 0;
bad:
	return -EIO;
}
358b37fe1f9SYan, Zheng 
3592f2dc053SSage Weil /*
3602f2dc053SSage Weil  * parse a normal reply, which may contain a (dir+)dentry and/or a
3612f2dc053SSage Weil  * target inode.
3622f2dc053SSage Weil  */
parse_reply_info_trace(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features)3632f2dc053SSage Weil static int parse_reply_info_trace(void **p, void *end,
36414303d20SSage Weil 				  struct ceph_mds_reply_info_parsed *info,
36512b4629aSIlya Dryomov 				  u64 features)
3662f2dc053SSage Weil {
3672f2dc053SSage Weil 	int err;
3682f2dc053SSage Weil 
3692f2dc053SSage Weil 	if (info->head->is_dentry) {
37014303d20SSage Weil 		err = parse_reply_info_in(p, end, &info->diri, features);
3712f2dc053SSage Weil 		if (err < 0)
3722f2dc053SSage Weil 			goto out_bad;
3732f2dc053SSage Weil 
374b37fe1f9SYan, Zheng 		err = parse_reply_info_dir(p, end, &info->dirfrag, features);
375b37fe1f9SYan, Zheng 		if (err < 0)
376b37fe1f9SYan, Zheng 			goto out_bad;
3772f2dc053SSage Weil 
3782f2dc053SSage Weil 		ceph_decode_32_safe(p, end, info->dname_len, bad);
3792f2dc053SSage Weil 		ceph_decode_need(p, end, info->dname_len, bad);
3802f2dc053SSage Weil 		info->dname = *p;
3812f2dc053SSage Weil 		*p += info->dname_len;
382b37fe1f9SYan, Zheng 
3834ac4c23eSJeff Layton 		err = parse_reply_info_lease(p, end, &info->dlease, features,
3844ac4c23eSJeff Layton 					     &info->altname_len, &info->altname);
385b37fe1f9SYan, Zheng 		if (err < 0)
386b37fe1f9SYan, Zheng 			goto out_bad;
3872f2dc053SSage Weil 	}
3882f2dc053SSage Weil 
3892f2dc053SSage Weil 	if (info->head->is_target) {
39014303d20SSage Weil 		err = parse_reply_info_in(p, end, &info->targeti, features);
3912f2dc053SSage Weil 		if (err < 0)
3922f2dc053SSage Weil 			goto out_bad;
3932f2dc053SSage Weil 	}
3942f2dc053SSage Weil 
3952f2dc053SSage Weil 	if (unlikely(*p != end))
3962f2dc053SSage Weil 		goto bad;
3972f2dc053SSage Weil 	return 0;
3982f2dc053SSage Weil 
3992f2dc053SSage Weil bad:
4002f2dc053SSage Weil 	err = -EIO;
4012f2dc053SSage Weil out_bad:
4022f2dc053SSage Weil 	pr_err("problem parsing mds trace %d\n", err);
4032f2dc053SSage Weil 	return err;
4042f2dc053SSage Weil }
4052f2dc053SSage Weil 
4062f2dc053SSage Weil /*
4072f2dc053SSage Weil  * parse readdir results
4082f2dc053SSage Weil  */
parse_reply_info_readdir(void ** p,void * end,struct ceph_mds_request * req,u64 features)409b37fe1f9SYan, Zheng static int parse_reply_info_readdir(void **p, void *end,
4103859af9eSXiubo Li 				    struct ceph_mds_request *req,
41112b4629aSIlya Dryomov 				    u64 features)
4122f2dc053SSage Weil {
4133859af9eSXiubo Li 	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
4142f2dc053SSage Weil 	u32 num, i = 0;
4152f2dc053SSage Weil 	int err;
4162f2dc053SSage Weil 
417b37fe1f9SYan, Zheng 	err = parse_reply_info_dir(p, end, &info->dir_dir, features);
418b37fe1f9SYan, Zheng 	if (err < 0)
419b37fe1f9SYan, Zheng 		goto out_bad;
4202f2dc053SSage Weil 
4212f2dc053SSage Weil 	ceph_decode_need(p, end, sizeof(num) + 2, bad);
422c89136eaSSage Weil 	num = ceph_decode_32(p);
423956d39d6SYan, Zheng 	{
424956d39d6SYan, Zheng 		u16 flags = ceph_decode_16(p);
425956d39d6SYan, Zheng 		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
426956d39d6SYan, Zheng 		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
427f3c4ebe6SYan, Zheng 		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
42879162547SYan, Zheng 		info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
429956d39d6SYan, Zheng 	}
4302f2dc053SSage Weil 	if (num == 0)
4312f2dc053SSage Weil 		goto done;
4322f2dc053SSage Weil 
4332a5beea3SYan, Zheng 	BUG_ON(!info->dir_entries);
4342a5beea3SYan, Zheng 	if ((unsigned long)(info->dir_entries + num) >
4352a5beea3SYan, Zheng 	    (unsigned long)info->dir_entries + info->dir_buf_size) {
43654008399SYan, Zheng 		pr_err("dir contents are larger than expected\n");
43754008399SYan, Zheng 		WARN_ON(1);
43854008399SYan, Zheng 		goto bad;
43954008399SYan, Zheng 	}
4402f2dc053SSage Weil 
44154008399SYan, Zheng 	info->dir_nr = num;
4422f2dc053SSage Weil 	while (num) {
443af9ffa6dSXiubo Li 		struct inode *inode = d_inode(req->r_dentry);
444af9ffa6dSXiubo Li 		struct ceph_inode_info *ci = ceph_inode(inode);
4452a5beea3SYan, Zheng 		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
446af9ffa6dSXiubo Li 		struct fscrypt_str tname = FSTR_INIT(NULL, 0);
447af9ffa6dSXiubo Li 		struct fscrypt_str oname = FSTR_INIT(NULL, 0);
448af9ffa6dSXiubo Li 		struct ceph_fname fname;
449af9ffa6dSXiubo Li 		u32 altname_len, _name_len;
450af9ffa6dSXiubo Li 		u8 *altname, *_name;
451af9ffa6dSXiubo Li 
4522f2dc053SSage Weil 		/* dentry */
453af9ffa6dSXiubo Li 		ceph_decode_32_safe(p, end, _name_len, bad);
454af9ffa6dSXiubo Li 		ceph_decode_need(p, end, _name_len, bad);
455af9ffa6dSXiubo Li 		_name = *p;
456af9ffa6dSXiubo Li 		*p += _name_len;
457af9ffa6dSXiubo Li 		dout("parsed dir dname '%.*s'\n", _name_len, _name);
458af9ffa6dSXiubo Li 
459af9ffa6dSXiubo Li 		if (info->hash_order)
460af9ffa6dSXiubo Li 			rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
461af9ffa6dSXiubo Li 						      _name, _name_len);
4622f2dc053SSage Weil 
463b37fe1f9SYan, Zheng 		/* dentry lease */
4644ac4c23eSJeff Layton 		err = parse_reply_info_lease(p, end, &rde->lease, features,
465af9ffa6dSXiubo Li 					     &altname_len, &altname);
466b37fe1f9SYan, Zheng 		if (err)
467b37fe1f9SYan, Zheng 			goto out_bad;
4684ac4c23eSJeff Layton 
469af9ffa6dSXiubo Li 		/*
470af9ffa6dSXiubo Li 		 * Try to dencrypt the dentry names and update them
471af9ffa6dSXiubo Li 		 * in the ceph_mds_reply_dir_entry struct.
472af9ffa6dSXiubo Li 		 */
473af9ffa6dSXiubo Li 		fname.dir = inode;
474af9ffa6dSXiubo Li 		fname.name = _name;
475af9ffa6dSXiubo Li 		fname.name_len = _name_len;
476af9ffa6dSXiubo Li 		fname.ctext = altname;
477af9ffa6dSXiubo Li 		fname.ctext_len = altname_len;
478af9ffa6dSXiubo Li 		/*
479af9ffa6dSXiubo Li 		 * The _name_len maybe larger than altname_len, such as
480af9ffa6dSXiubo Li 		 * when the human readable name length is in range of
481af9ffa6dSXiubo Li 		 * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),
482af9ffa6dSXiubo Li 		 * then the copy in ceph_fname_to_usr will corrupt the
483af9ffa6dSXiubo Li 		 * data if there has no encryption key.
484af9ffa6dSXiubo Li 		 *
485af9ffa6dSXiubo Li 		 * Just set the no_copy flag and then if there has no
486af9ffa6dSXiubo Li 		 * encryption key the oname.name will be assigned to
487af9ffa6dSXiubo Li 		 * _name always.
488af9ffa6dSXiubo Li 		 */
489af9ffa6dSXiubo Li 		fname.no_copy = true;
490af9ffa6dSXiubo Li 		if (altname_len == 0) {
491af9ffa6dSXiubo Li 			/*
492af9ffa6dSXiubo Li 			 * Set tname to _name, and this will be used
493af9ffa6dSXiubo Li 			 * to do the base64_decode in-place. It's
494af9ffa6dSXiubo Li 			 * safe because the decoded string should
495af9ffa6dSXiubo Li 			 * always be shorter, which is 3/4 of origin
496af9ffa6dSXiubo Li 			 * string.
497af9ffa6dSXiubo Li 			 */
498af9ffa6dSXiubo Li 			tname.name = _name;
499af9ffa6dSXiubo Li 
500af9ffa6dSXiubo Li 			/*
501af9ffa6dSXiubo Li 			 * Set oname to _name too, and this will be
502af9ffa6dSXiubo Li 			 * used to do the dencryption in-place.
503af9ffa6dSXiubo Li 			 */
504af9ffa6dSXiubo Li 			oname.name = _name;
505af9ffa6dSXiubo Li 			oname.len = _name_len;
506af9ffa6dSXiubo Li 		} else {
507af9ffa6dSXiubo Li 			/*
508af9ffa6dSXiubo Li 			 * This will do the decryption only in-place
509af9ffa6dSXiubo Li 			 * from altname cryptext directly.
510af9ffa6dSXiubo Li 			 */
511af9ffa6dSXiubo Li 			oname.name = altname;
512af9ffa6dSXiubo Li 			oname.len = altname_len;
513af9ffa6dSXiubo Li 		}
514af9ffa6dSXiubo Li 		rde->is_nokey = false;
515af9ffa6dSXiubo Li 		err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
516af9ffa6dSXiubo Li 		if (err) {
517af9ffa6dSXiubo Li 			pr_err("%s unable to decode %.*s, got %d\n", __func__,
518af9ffa6dSXiubo Li 			       _name_len, _name, err);
519af9ffa6dSXiubo Li 			goto out_bad;
520af9ffa6dSXiubo Li 		}
521af9ffa6dSXiubo Li 		rde->name = oname.name;
522af9ffa6dSXiubo Li 		rde->name_len = oname.len;
523af9ffa6dSXiubo Li 
5242f2dc053SSage Weil 		/* inode */
5252a5beea3SYan, Zheng 		err = parse_reply_info_in(p, end, &rde->inode, features);
5262f2dc053SSage Weil 		if (err < 0)
5272f2dc053SSage Weil 			goto out_bad;
5288974eebdSYan, Zheng 		/* ceph_readdir_prepopulate() will update it */
5298974eebdSYan, Zheng 		rde->offset = 0;
5302f2dc053SSage Weil 		i++;
5312f2dc053SSage Weil 		num--;
5322f2dc053SSage Weil 	}
5332f2dc053SSage Weil 
5342f2dc053SSage Weil done:
5351d3f8723SJeff Layton 	/* Skip over any unrecognized fields */
5361d3f8723SJeff Layton 	*p = end;
5372f2dc053SSage Weil 	return 0;
5382f2dc053SSage Weil 
5392f2dc053SSage Weil bad:
5402f2dc053SSage Weil 	err = -EIO;
5412f2dc053SSage Weil out_bad:
5422f2dc053SSage Weil 	pr_err("problem parsing dir contents %d\n", err);
5432f2dc053SSage Weil 	return err;
5442f2dc053SSage Weil }
5452f2dc053SSage Weil 
5462f2dc053SSage Weil /*
54725933abdSHerb Shiu  * parse fcntl F_GETLK results
54825933abdSHerb Shiu  */
parse_reply_info_filelock(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features)54925933abdSHerb Shiu static int parse_reply_info_filelock(void **p, void *end,
55014303d20SSage Weil 				     struct ceph_mds_reply_info_parsed *info,
55112b4629aSIlya Dryomov 				     u64 features)
55225933abdSHerb Shiu {
55325933abdSHerb Shiu 	if (*p + sizeof(*info->filelock_reply) > end)
55425933abdSHerb Shiu 		goto bad;
55525933abdSHerb Shiu 
55625933abdSHerb Shiu 	info->filelock_reply = *p;
55725933abdSHerb Shiu 
5581d3f8723SJeff Layton 	/* Skip over any unrecognized fields */
5591d3f8723SJeff Layton 	*p = end;
56025933abdSHerb Shiu 	return 0;
56125933abdSHerb Shiu bad:
56225933abdSHerb Shiu 	return -EIO;
56325933abdSHerb Shiu }
56425933abdSHerb Shiu 
565d4846487SJeff Layton 
566d4846487SJeff Layton #if BITS_PER_LONG == 64
567d4846487SJeff Layton 
568d4846487SJeff Layton #define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
569d4846487SJeff Layton 
ceph_parse_deleg_inos(void ** p,void * end,struct ceph_mds_session * s)570d4846487SJeff Layton static int ceph_parse_deleg_inos(void **p, void *end,
571d4846487SJeff Layton 				 struct ceph_mds_session *s)
572d4846487SJeff Layton {
573d4846487SJeff Layton 	u32 sets;
574d4846487SJeff Layton 
575d4846487SJeff Layton 	ceph_decode_32_safe(p, end, sets, bad);
576d4846487SJeff Layton 	dout("got %u sets of delegated inodes\n", sets);
577d4846487SJeff Layton 	while (sets--) {
5782ecd0eddSColin Ian King 		u64 start, len;
579d4846487SJeff Layton 
580d4846487SJeff Layton 		ceph_decode_64_safe(p, end, start, bad);
581d4846487SJeff Layton 		ceph_decode_64_safe(p, end, len, bad);
582d4f6b31dSJeff Layton 
583d4f6b31dSJeff Layton 		/* Don't accept a delegation of system inodes */
584d4f6b31dSJeff Layton 		if (start < CEPH_INO_SYSTEM_BASE) {
585d4f6b31dSJeff Layton 			pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
586d4f6b31dSJeff Layton 					start, len);
587d4f6b31dSJeff Layton 			continue;
588d4f6b31dSJeff Layton 		}
589d4846487SJeff Layton 		while (len--) {
5902ecd0eddSColin Ian King 			int err = xa_insert(&s->s_delegated_inos, start++,
591d4846487SJeff Layton 					    DELEGATED_INO_AVAILABLE,
592d4846487SJeff Layton 					    GFP_KERNEL);
593d4846487SJeff Layton 			if (!err) {
594d4846487SJeff Layton 				dout("added delegated inode 0x%llx\n",
595d4846487SJeff Layton 				     start - 1);
596d4846487SJeff Layton 			} else if (err == -EBUSY) {
5974868e537SXiubo Li 				pr_warn("MDS delegated inode 0x%llx more than once.\n",
598d4846487SJeff Layton 					start - 1);
599d4846487SJeff Layton 			} else {
600d4846487SJeff Layton 				return err;
601d4846487SJeff Layton 			}
602d4846487SJeff Layton 		}
603d4846487SJeff Layton 	}
604d4846487SJeff Layton 	return 0;
605d4846487SJeff Layton bad:
606d4846487SJeff Layton 	return -EIO;
607d4846487SJeff Layton }
608d4846487SJeff Layton 
ceph_get_deleg_ino(struct ceph_mds_session * s)609d4846487SJeff Layton u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
610d4846487SJeff Layton {
611d4846487SJeff Layton 	unsigned long ino;
612d4846487SJeff Layton 	void *val;
613d4846487SJeff Layton 
614d4846487SJeff Layton 	xa_for_each(&s->s_delegated_inos, ino, val) {
615d4846487SJeff Layton 		val = xa_erase(&s->s_delegated_inos, ino);
616d4846487SJeff Layton 		if (val == DELEGATED_INO_AVAILABLE)
617d4846487SJeff Layton 			return ino;
618d4846487SJeff Layton 	}
619d4846487SJeff Layton 	return 0;
620d4846487SJeff Layton }
621d4846487SJeff Layton 
/*
 * Put inode number @ino back into s->s_delegated_inos, marked as
 * DELEGATED_INO_AVAILABLE.  Returns whatever xa_insert() returns.
 */
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
	int ret;

	ret = xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
			GFP_KERNEL);
	return ret;
}
627d4846487SJeff Layton #else /* BITS_PER_LONG == 64 */
628d4846487SJeff Layton /*
629d4846487SJeff Layton  * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
630d4846487SJeff Layton  * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
631d4846487SJeff Layton  * and bottom words?
632d4846487SJeff Layton  */
ceph_parse_deleg_inos(void ** p,void * end,struct ceph_mds_session * s)633d4846487SJeff Layton static int ceph_parse_deleg_inos(void **p, void *end,
634d4846487SJeff Layton 				 struct ceph_mds_session *s)
635d4846487SJeff Layton {
636d4846487SJeff Layton 	u32 sets;
637d4846487SJeff Layton 
638d4846487SJeff Layton 	ceph_decode_32_safe(p, end, sets, bad);
639d4846487SJeff Layton 	if (sets)
640d4846487SJeff Layton 		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
641d4846487SJeff Layton 	return 0;
642d4846487SJeff Layton bad:
643d4846487SJeff Layton 	return -EIO;
644d4846487SJeff Layton }
645d4846487SJeff Layton 
ceph_get_deleg_ino(struct ceph_mds_session * s)646d4846487SJeff Layton u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
647d4846487SJeff Layton {
648d4846487SJeff Layton 	return 0;
649d4846487SJeff Layton }
650d4846487SJeff Layton 
/*
 * Stub for the build where delegated inodes are ignored: there is no pool
 * to return @ino to, so report success without doing anything.
 */
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
	/* nothing to restore */
	return 0;
}
655d4846487SJeff Layton #endif /* BITS_PER_LONG == 64 */
656d4846487SJeff Layton 
65725933abdSHerb Shiu /*
6586e8575faSSam Lang  * parse create results
6596e8575faSSam Lang  */
parse_reply_info_create(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features,struct ceph_mds_session * s)6606e8575faSSam Lang static int parse_reply_info_create(void **p, void *end,
6616e8575faSSam Lang 				  struct ceph_mds_reply_info_parsed *info,
662d4846487SJeff Layton 				  u64 features, struct ceph_mds_session *s)
6636e8575faSSam Lang {
664d4846487SJeff Layton 	int ret;
665d4846487SJeff Layton 
666b37fe1f9SYan, Zheng 	if (features == (u64)-1 ||
667b37fe1f9SYan, Zheng 	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
6686e8575faSSam Lang 		if (*p == end) {
669d4846487SJeff Layton 			/* Malformed reply? */
6706e8575faSSam Lang 			info->has_create_ino = false;
671d4846487SJeff Layton 		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
6726e8575faSSam Lang 			info->has_create_ino = true;
67306a1ad43SJeff Layton 			/* struct_v, struct_compat, and len */
67406a1ad43SJeff Layton 			ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
6751d3f8723SJeff Layton 			ceph_decode_64_safe(p, end, info->ino, bad);
676d4846487SJeff Layton 			ret = ceph_parse_deleg_inos(p, end, s);
677d4846487SJeff Layton 			if (ret)
678d4846487SJeff Layton 				return ret;
679d4846487SJeff Layton 		} else {
680d4846487SJeff Layton 			/* legacy */
681d4846487SJeff Layton 			ceph_decode_64_safe(p, end, info->ino, bad);
682d4846487SJeff Layton 			info->has_create_ino = true;
6836e8575faSSam Lang 		}
6841d3f8723SJeff Layton 	} else {
6851d3f8723SJeff Layton 		if (*p != end)
6866e8575faSSam Lang 			goto bad;
6871d3f8723SJeff Layton 	}
6886e8575faSSam Lang 
6891d3f8723SJeff Layton 	/* Skip over any unrecognized fields */
6901d3f8723SJeff Layton 	*p = end;
6911d3f8723SJeff Layton 	return 0;
6926e8575faSSam Lang bad:
6936e8575faSSam Lang 	return -EIO;
6946e8575faSSam Lang }
6956e8575faSSam Lang 
/*
 * parse getvxattr results
 *
 * Skips the two version bytes and the payload length, then reads the value
 * length.  The value must fill the rest of the buffer exactly; it is not
 * copied, info->xattr_info just points into the message.
 *
 * Returns the value length on success, -EIO on a malformed payload.
 */
static int parse_reply_info_getvxattr(void **p, void *end,
				      struct ceph_mds_reply_info_parsed *info,
				      u64 features)
{
	u32 value_len;

	ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
	ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
	ceph_decode_skip_32(p, end, bad); /* skip payload length */

	ceph_decode_32_safe(p, end, value_len, bad);

	/* the value has to account for every remaining byte */
	if (value_len != end - *p)
		goto bad;

	info->xattr_info.xattr_value = *p;
	info->xattr_info.xattr_value_len = value_len;
	*p = end;
	return value_len;
bad:
	return -EIO;
}
7176ddf5f16SMilind Changire 
7186e8575faSSam Lang /*
71925933abdSHerb Shiu  * parse extra results
72025933abdSHerb Shiu  */
parse_reply_info_extra(void ** p,void * end,struct ceph_mds_request * req,u64 features,struct ceph_mds_session * s)72125933abdSHerb Shiu static int parse_reply_info_extra(void **p, void *end,
7223859af9eSXiubo Li 				  struct ceph_mds_request *req,
723d4846487SJeff Layton 				  u64 features, struct ceph_mds_session *s)
72425933abdSHerb Shiu {
7253859af9eSXiubo Li 	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
7266df8c9d8SJeff Layton 	u32 op = le32_to_cpu(info->head->op);
7276df8c9d8SJeff Layton 
7286df8c9d8SJeff Layton 	if (op == CEPH_MDS_OP_GETFILELOCK)
72914303d20SSage Weil 		return parse_reply_info_filelock(p, end, info, features);
7306df8c9d8SJeff Layton 	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
7313859af9eSXiubo Li 		return parse_reply_info_readdir(p, end, req, features);
7326df8c9d8SJeff Layton 	else if (op == CEPH_MDS_OP_CREATE)
733d4846487SJeff Layton 		return parse_reply_info_create(p, end, info, features, s);
7346ddf5f16SMilind Changire 	else if (op == CEPH_MDS_OP_GETVXATTR)
7356ddf5f16SMilind Changire 		return parse_reply_info_getvxattr(p, end, info, features);
7366e8575faSSam Lang 	else
7376e8575faSSam Lang 		return -EIO;
73825933abdSHerb Shiu }
73925933abdSHerb Shiu 
74025933abdSHerb Shiu /*
7412f2dc053SSage Weil  * parse entire mds reply
7422f2dc053SSage Weil  */
parse_reply_info(struct ceph_mds_session * s,struct ceph_msg * msg,struct ceph_mds_request * req,u64 features)743d4846487SJeff Layton static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
7443859af9eSXiubo Li 			    struct ceph_mds_request *req, u64 features)
7452f2dc053SSage Weil {
7463859af9eSXiubo Li 	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
7472f2dc053SSage Weil 	void *p, *end;
7482f2dc053SSage Weil 	u32 len;
7492f2dc053SSage Weil 	int err;
7502f2dc053SSage Weil 
7512f2dc053SSage Weil 	info->head = msg->front.iov_base;
7522f2dc053SSage Weil 	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
7532f2dc053SSage Weil 	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
7542f2dc053SSage Weil 
7552f2dc053SSage Weil 	/* trace */
7562f2dc053SSage Weil 	ceph_decode_32_safe(&p, end, len, bad);
7572f2dc053SSage Weil 	if (len > 0) {
75832852a81SXi Wang 		ceph_decode_need(&p, end, len, bad);
75914303d20SSage Weil 		err = parse_reply_info_trace(&p, p+len, info, features);
7602f2dc053SSage Weil 		if (err < 0)
7612f2dc053SSage Weil 			goto out_bad;
7622f2dc053SSage Weil 	}
7632f2dc053SSage Weil 
76425933abdSHerb Shiu 	/* extra */
7652f2dc053SSage Weil 	ceph_decode_32_safe(&p, end, len, bad);
7662f2dc053SSage Weil 	if (len > 0) {
76732852a81SXi Wang 		ceph_decode_need(&p, end, len, bad);
7683859af9eSXiubo Li 		err = parse_reply_info_extra(&p, p+len, req, features, s);
7692f2dc053SSage Weil 		if (err < 0)
7702f2dc053SSage Weil 			goto out_bad;
7712f2dc053SSage Weil 	}
7722f2dc053SSage Weil 
7732f2dc053SSage Weil 	/* snap blob */
7742f2dc053SSage Weil 	ceph_decode_32_safe(&p, end, len, bad);
7752f2dc053SSage Weil 	info->snapblob_len = len;
7762f2dc053SSage Weil 	info->snapblob = p;
7772f2dc053SSage Weil 	p += len;
7782f2dc053SSage Weil 
7792f2dc053SSage Weil 	if (p != end)
7802f2dc053SSage Weil 		goto bad;
7812f2dc053SSage Weil 	return 0;
7822f2dc053SSage Weil 
7832f2dc053SSage Weil bad:
7842f2dc053SSage Weil 	err = -EIO;
7852f2dc053SSage Weil out_bad:
7862f2dc053SSage Weil 	pr_err("mds parse_reply err %d\n", err);
7878b0da5c5SXiubo Li 	ceph_msg_dump(msg);
7882f2dc053SSage Weil 	return err;
7892f2dc053SSage Weil }
7902f2dc053SSage Weil 
destroy_reply_info(struct ceph_mds_reply_info_parsed * info)7912f2dc053SSage Weil static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
7922f2dc053SSage Weil {
7932d332d5bSJeff Layton 	int i;
7942d332d5bSJeff Layton 
7952d332d5bSJeff Layton 	kfree(info->diri.fscrypt_auth);
7962d332d5bSJeff Layton 	kfree(info->diri.fscrypt_file);
7972d332d5bSJeff Layton 	kfree(info->targeti.fscrypt_auth);
7982d332d5bSJeff Layton 	kfree(info->targeti.fscrypt_file);
7992a5beea3SYan, Zheng 	if (!info->dir_entries)
80054008399SYan, Zheng 		return;
8012d332d5bSJeff Layton 
8022d332d5bSJeff Layton 	for (i = 0; i < info->dir_nr; i++) {
8032d332d5bSJeff Layton 		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
8042d332d5bSJeff Layton 
8052d332d5bSJeff Layton 		kfree(rde->inode.fscrypt_auth);
8062d332d5bSJeff Layton 		kfree(rde->inode.fscrypt_file);
8072d332d5bSJeff Layton 	}
8082a5beea3SYan, Zheng 	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
8092f2dc053SSage Weil }
8102f2dc053SSage Weil 
8114868e537SXiubo Li /*
8124868e537SXiubo Li  * In async unlink case the kclient won't wait for the first reply
8134868e537SXiubo Li  * from MDS and just drop all the links and unhash the dentry and then
8144868e537SXiubo Li  * succeeds immediately.
8154868e537SXiubo Li  *
8164868e537SXiubo Li  * For any new create/link/rename,etc requests followed by using the
8174868e537SXiubo Li  * same file names we must wait for the first reply of the inflight
8184868e537SXiubo Li  * unlink request, or the MDS possibly will fail these following
8194868e537SXiubo Li  * requests with -EEXIST if the inflight async unlink request was
8204868e537SXiubo Li  * delayed for some reasons.
8214868e537SXiubo Li  *
8224868e537SXiubo Li  * And the worst case is that for the none async openc request it will
8234868e537SXiubo Li  * successfully open the file if the CDentry hasn't been unlinked yet,
8244868e537SXiubo Li  * but later the previous delayed async unlink request will remove the
8254868e537SXiubo Li  * CDenty. That means the just created file is possiblly deleted later
8264868e537SXiubo Li  * by accident.
8274868e537SXiubo Li  *
8284868e537SXiubo Li  * We need to wait for the inflight async unlink requests to finish
8294868e537SXiubo Li  * when creating new files/directories by using the same file names.
8304868e537SXiubo Li  */
ceph_wait_on_conflict_unlink(struct dentry * dentry)8314868e537SXiubo Li int ceph_wait_on_conflict_unlink(struct dentry *dentry)
8324868e537SXiubo Li {
833985b9ee8SXiubo Li 	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
8344868e537SXiubo Li 	struct dentry *pdentry = dentry->d_parent;
8354868e537SXiubo Li 	struct dentry *udentry, *found = NULL;
8364868e537SXiubo Li 	struct ceph_dentry_info *di;
8374868e537SXiubo Li 	struct qstr dname;
8384868e537SXiubo Li 	u32 hash = dentry->d_name.hash;
8394868e537SXiubo Li 	int err;
8404868e537SXiubo Li 
8414868e537SXiubo Li 	dname.name = dentry->d_name.name;
8424868e537SXiubo Li 	dname.len = dentry->d_name.len;
8434868e537SXiubo Li 
8444868e537SXiubo Li 	rcu_read_lock();
8454868e537SXiubo Li 	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
8464868e537SXiubo Li 				   hnode, hash) {
8474868e537SXiubo Li 		udentry = di->dentry;
8484868e537SXiubo Li 
8494868e537SXiubo Li 		spin_lock(&udentry->d_lock);
8504868e537SXiubo Li 		if (udentry->d_name.hash != hash)
8514868e537SXiubo Li 			goto next;
8524868e537SXiubo Li 		if (unlikely(udentry->d_parent != pdentry))
8534868e537SXiubo Li 			goto next;
8544868e537SXiubo Li 		if (!hash_hashed(&di->hnode))
8554868e537SXiubo Li 			goto next;
8564868e537SXiubo Li 
8574868e537SXiubo Li 		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
8584868e537SXiubo Li 			pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
8594868e537SXiubo Li 				__func__, dentry, dentry);
8604868e537SXiubo Li 
8614868e537SXiubo Li 		if (!d_same_name(udentry, pdentry, &dname))
8624868e537SXiubo Li 			goto next;
8634868e537SXiubo Li 
864dc32464aSAl Viro 		found = dget_dlock(udentry);
8654868e537SXiubo Li 		spin_unlock(&udentry->d_lock);
8664868e537SXiubo Li 		break;
8674868e537SXiubo Li next:
8684868e537SXiubo Li 		spin_unlock(&udentry->d_lock);
8694868e537SXiubo Li 	}
8704868e537SXiubo Li 	rcu_read_unlock();
8714868e537SXiubo Li 
8724868e537SXiubo Li 	if (likely(!found))
8734868e537SXiubo Li 		return 0;
8744868e537SXiubo Li 
8754868e537SXiubo Li 	dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
8764868e537SXiubo Li 	     dentry, dentry, found, found);
8774868e537SXiubo Li 
8784868e537SXiubo Li 	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
8794868e537SXiubo Li 			  TASK_KILLABLE);
8804868e537SXiubo Li 	dput(found);
8814868e537SXiubo Li 	return err;
8824868e537SXiubo Li }
8834868e537SXiubo Li 
8842f2dc053SSage Weil 
8852f2dc053SSage Weil /*
8862f2dc053SSage Weil  * sessions
8872f2dc053SSage Weil  */
ceph_session_state_name(int s)888a687ecafSJohn Spray const char *ceph_session_state_name(int s)
8892f2dc053SSage Weil {
8902f2dc053SSage Weil 	switch (s) {
8912f2dc053SSage Weil 	case CEPH_MDS_SESSION_NEW: return "new";
8922f2dc053SSage Weil 	case CEPH_MDS_SESSION_OPENING: return "opening";
8932f2dc053SSage Weil 	case CEPH_MDS_SESSION_OPEN: return "open";
8942f2dc053SSage Weil 	case CEPH_MDS_SESSION_HUNG: return "hung";
8952f2dc053SSage Weil 	case CEPH_MDS_SESSION_CLOSING: return "closing";
8964d681c2fSXiubo Li 	case CEPH_MDS_SESSION_CLOSED: return "closed";
89744ca18f2SSage Weil 	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
8982f2dc053SSage Weil 	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
899fcff415cSYan, Zheng 	case CEPH_MDS_SESSION_REJECTED: return "rejected";
9002f2dc053SSage Weil 	default: return "???";
9012f2dc053SSage Weil 	}
9022f2dc053SSage Weil }
9032f2dc053SSage Weil 
ceph_get_mds_session(struct ceph_mds_session * s)9045b3248c6SXiubo Li struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
9052f2dc053SSage Weil {
9069f358999SJeff Layton 	if (refcount_inc_not_zero(&s->s_ref))
9072f2dc053SSage Weil 		return s;
9082f2dc053SSage Weil 	return NULL;
9092f2dc053SSage Weil }
9102f2dc053SSage Weil 
ceph_put_mds_session(struct ceph_mds_session * s)9112f2dc053SSage Weil void ceph_put_mds_session(struct ceph_mds_session *s)
9122f2dc053SSage Weil {
9137e65624dSJeff Layton 	if (IS_ERR_OR_NULL(s))
9147e65624dSJeff Layton 		return;
9157e65624dSJeff Layton 
9163997c01dSElena Reshetova 	if (refcount_dec_and_test(&s->s_ref)) {
9176c4a1915SAlex Elder 		if (s->s_auth.authorizer)
9186c1ea260SIlya Dryomov 			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
91988828190SJeff Layton 		WARN_ON(mutex_is_locked(&s->s_mutex));
920d4846487SJeff Layton 		xa_destroy(&s->s_delegated_inos);
9212f2dc053SSage Weil 		kfree(s);
9222f2dc053SSage Weil 	}
9234e7a5dcdSSage Weil }
9242f2dc053SSage Weil 
9252f2dc053SSage Weil /*
9262f2dc053SSage Weil  * called under mdsc->mutex
9272f2dc053SSage Weil  */
__ceph_lookup_mds_session(struct ceph_mds_client * mdsc,int mds)9282f2dc053SSage Weil struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
9292f2dc053SSage Weil 						   int mds)
9302f2dc053SSage Weil {
931d37b1d99SMarkus Elfring 	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
9322f2dc053SSage Weil 		return NULL;
9335b3248c6SXiubo Li 	return ceph_get_mds_session(mdsc->sessions[mds]);
9342f2dc053SSage Weil }
9352f2dc053SSage Weil 
__have_session(struct ceph_mds_client * mdsc,int mds)9362f2dc053SSage Weil static bool __have_session(struct ceph_mds_client *mdsc, int mds)
9372f2dc053SSage Weil {
93898cfda81SChengguang Xu 	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
9392f2dc053SSage Weil 		return false;
94098cfda81SChengguang Xu 	else
94198cfda81SChengguang Xu 		return true;
9422f2dc053SSage Weil }
9432f2dc053SSage Weil 
__verify_registered_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * s)9442600d2ddSSage Weil static int __verify_registered_session(struct ceph_mds_client *mdsc,
9452600d2ddSSage Weil 				       struct ceph_mds_session *s)
9462600d2ddSSage Weil {
9472600d2ddSSage Weil 	if (s->s_mds >= mdsc->max_sessions ||
9482600d2ddSSage Weil 	    mdsc->sessions[s->s_mds] != s)
9492600d2ddSSage Weil 		return -ENOENT;
9502600d2ddSSage Weil 	return 0;
9512600d2ddSSage Weil }
9522600d2ddSSage Weil 
9532f2dc053SSage Weil /*
9542f2dc053SSage Weil  * create+register a new session for given mds.
9552f2dc053SSage Weil  * called under mdsc->mutex.
9562f2dc053SSage Weil  */
register_session(struct ceph_mds_client * mdsc,int mds)9572f2dc053SSage Weil static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
9582f2dc053SSage Weil 						 int mds)
9592f2dc053SSage Weil {
9602f2dc053SSage Weil 	struct ceph_mds_session *s;
9612f2dc053SSage Weil 
962a68e564aSXiubo Li 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
963a68e564aSXiubo Li 		return ERR_PTR(-EIO);
964a68e564aSXiubo Li 
965b38c9eb4SXiubo Li 	if (mds >= mdsc->mdsmap->possible_max_rank)
966c338c07cSNathaniel Yazdani 		return ERR_PTR(-EINVAL);
967c338c07cSNathaniel Yazdani 
9682f2dc053SSage Weil 	s = kzalloc(sizeof(*s), GFP_NOFS);
9694736b009SDan Carpenter 	if (!s)
9704736b009SDan Carpenter 		return ERR_PTR(-ENOMEM);
97147474d0bSChengguang Xu 
97247474d0bSChengguang Xu 	if (mds >= mdsc->max_sessions) {
97347474d0bSChengguang Xu 		int newmax = 1 << get_count_order(mds + 1);
97447474d0bSChengguang Xu 		struct ceph_mds_session **sa;
97547474d0bSChengguang Xu 
97647474d0bSChengguang Xu 		dout("%s: realloc to %d\n", __func__, newmax);
97747474d0bSChengguang Xu 		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
97847474d0bSChengguang Xu 		if (!sa)
97947474d0bSChengguang Xu 			goto fail_realloc;
98047474d0bSChengguang Xu 		if (mdsc->sessions) {
98147474d0bSChengguang Xu 			memcpy(sa, mdsc->sessions,
98247474d0bSChengguang Xu 			       mdsc->max_sessions * sizeof(void *));
98347474d0bSChengguang Xu 			kfree(mdsc->sessions);
98447474d0bSChengguang Xu 		}
98547474d0bSChengguang Xu 		mdsc->sessions = sa;
98647474d0bSChengguang Xu 		mdsc->max_sessions = newmax;
98747474d0bSChengguang Xu 	}
98847474d0bSChengguang Xu 
98947474d0bSChengguang Xu 	dout("%s: mds%d\n", __func__, mds);
9902f2dc053SSage Weil 	s->s_mdsc = mdsc;
9912f2dc053SSage Weil 	s->s_mds = mds;
9922f2dc053SSage Weil 	s->s_state = CEPH_MDS_SESSION_NEW;
9932f2dc053SSage Weil 	mutex_init(&s->s_mutex);
9942f2dc053SSage Weil 
995b7a9e5ddSSage Weil 	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
9962f2dc053SSage Weil 
99752d60f8eSJeff Layton 	atomic_set(&s->s_cap_gen, 1);
9981ce208a6SAlex Elder 	s->s_cap_ttl = jiffies - 1;
999d8fb02abSAlex Elder 
1000d8fb02abSAlex Elder 	spin_lock_init(&s->s_cap_lock);
10012f2dc053SSage Weil 	INIT_LIST_HEAD(&s->s_caps);
10023997c01dSElena Reshetova 	refcount_set(&s->s_ref, 1);
10032f2dc053SSage Weil 	INIT_LIST_HEAD(&s->s_waiting);
10042f2dc053SSage Weil 	INIT_LIST_HEAD(&s->s_unsafe);
1005d4846487SJeff Layton 	xa_init(&s->s_delegated_inos);
10062f2dc053SSage Weil 	INIT_LIST_HEAD(&s->s_cap_releases);
1007e3ec8d68SYan, Zheng 	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
1008e3ec8d68SYan, Zheng 
10091cf03a68SJeff Layton 	INIT_LIST_HEAD(&s->s_cap_dirty);
10102f2dc053SSage Weil 	INIT_LIST_HEAD(&s->s_cap_flushing);
10112f2dc053SSage Weil 
10122f2dc053SSage Weil 	mdsc->sessions[mds] = s;
101386d8f67bSYan, Zheng 	atomic_inc(&mdsc->num_sessions);
10143997c01dSElena Reshetova 	refcount_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
101542ce56e5SSage Weil 
1016b7a9e5ddSSage Weil 	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
1017b7a9e5ddSSage Weil 		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
101842ce56e5SSage Weil 
10192f2dc053SSage Weil 	return s;
102042ce56e5SSage Weil 
102142ce56e5SSage Weil fail_realloc:
102242ce56e5SSage Weil 	kfree(s);
102342ce56e5SSage Weil 	return ERR_PTR(-ENOMEM);
10242f2dc053SSage Weil }
10252f2dc053SSage Weil 
10262f2dc053SSage Weil /*
10272f2dc053SSage Weil  * called under mdsc->mutex
10282f2dc053SSage Weil  */
__unregister_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * s)10292600d2ddSSage Weil static void __unregister_session(struct ceph_mds_client *mdsc,
103042ce56e5SSage Weil 			       struct ceph_mds_session *s)
10312f2dc053SSage Weil {
10322600d2ddSSage Weil 	dout("__unregister_session mds%d %p\n", s->s_mds, s);
10332600d2ddSSage Weil 	BUG_ON(mdsc->sessions[s->s_mds] != s);
103442ce56e5SSage Weil 	mdsc->sessions[s->s_mds] = NULL;
103542ce56e5SSage Weil 	ceph_con_close(&s->s_con);
103642ce56e5SSage Weil 	ceph_put_mds_session(s);
103786d8f67bSYan, Zheng 	atomic_dec(&mdsc->num_sessions);
10382f2dc053SSage Weil }
10392f2dc053SSage Weil 
10402f2dc053SSage Weil /*
10412f2dc053SSage Weil  * drop session refs in request.
10422f2dc053SSage Weil  *
10432f2dc053SSage Weil  * should be last request ref, or hold mdsc->mutex
10442f2dc053SSage Weil  */
put_request_session(struct ceph_mds_request * req)10452f2dc053SSage Weil static void put_request_session(struct ceph_mds_request *req)
10462f2dc053SSage Weil {
10472f2dc053SSage Weil 	if (req->r_session) {
10482f2dc053SSage Weil 		ceph_put_mds_session(req->r_session);
10492f2dc053SSage Weil 		req->r_session = NULL;
10502f2dc053SSage Weil 	}
10512f2dc053SSage Weil }
10522f2dc053SSage Weil 
/*
 * Invoke @cb on every registered session.
 *
 * Each session is looked up (and referenced) under mdsc->mutex; the mutex
 * is dropped around the callback and the reference is released before the
 * mutex is re-taken.  When @check_state is true, sessions for which
 * check_session_state() returns false are skipped.
 */
void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
				void (*cb)(struct ceph_mds_session *),
				bool check_state)
{
	struct ceph_mds_session *session;
	int rank;

	mutex_lock(&mdsc->mutex);
	for (rank = 0; rank < mdsc->max_sessions; ++rank) {
		bool skip;

		session = __ceph_lookup_mds_session(mdsc, rank);
		if (!session)
			continue;

		skip = check_state && !check_session_state(session);
		if (skip) {
			ceph_put_mds_session(session);
			continue;
		}

		/* run the callback without mdsc->mutex held */
		mutex_unlock(&mdsc->mutex);
		cb(session);
		ceph_put_mds_session(session);
		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);
}
107959b312f3SXiubo Li 
ceph_mdsc_release_request(struct kref * kref)1080153c8e6bSSage Weil void ceph_mdsc_release_request(struct kref *kref)
10812f2dc053SSage Weil {
1082153c8e6bSSage Weil 	struct ceph_mds_request *req = container_of(kref,
1083153c8e6bSSage Weil 						    struct ceph_mds_request,
1084153c8e6bSSage Weil 						    r_kref);
1085e64f44a8SXiubo Li 	ceph_mdsc_release_dir_caps_no_check(req);
108654008399SYan, Zheng 	destroy_reply_info(&req->r_reply_info);
10872f2dc053SSage Weil 	if (req->r_request)
10882f2dc053SSage Weil 		ceph_msg_put(req->r_request);
108954008399SYan, Zheng 	if (req->r_reply)
10902f2dc053SSage Weil 		ceph_msg_put(req->r_reply);
10912f2dc053SSage Weil 	if (req->r_inode) {
109241b02e1fSSage Weil 		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
109323c2c76eSJeff Layton 		iput(req->r_inode);
10942f2dc053SSage Weil 	}
10959c1c2b35SJeff Layton 	if (req->r_parent) {
10963dd69aabSJeff Layton 		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
109723c2c76eSJeff Layton 		iput(req->r_parent);
10989c1c2b35SJeff Layton 	}
109923c2c76eSJeff Layton 	iput(req->r_target_inode);
1100ec9595c0SJeff Layton 	iput(req->r_new_inode);
11012f2dc053SSage Weil 	if (req->r_dentry)
11022f2dc053SSage Weil 		dput(req->r_dentry);
1103844d87c3SSage Weil 	if (req->r_old_dentry)
1104844d87c3SSage Weil 		dput(req->r_old_dentry);
1105844d87c3SSage Weil 	if (req->r_old_dentry_dir) {
110641b02e1fSSage Weil 		/*
110741b02e1fSSage Weil 		 * track (and drop pins for) r_old_dentry_dir
110841b02e1fSSage Weil 		 * separately, since r_old_dentry's d_parent may have
110941b02e1fSSage Weil 		 * changed between the dir mutex being dropped and
111041b02e1fSSage Weil 		 * this request being freed.
111141b02e1fSSage Weil 		 */
111241b02e1fSSage Weil 		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
11132f2dc053SSage Weil 				  CEPH_CAP_PIN);
111423c2c76eSJeff Layton 		iput(req->r_old_dentry_dir);
11152f2dc053SSage Weil 	}
11162f2dc053SSage Weil 	kfree(req->r_path1);
11172f2dc053SSage Weil 	kfree(req->r_path2);
11187fe0cdebSJeff Layton 	put_cred(req->r_cred);
111925e6bae3SYan, Zheng 	if (req->r_pagelist)
112025e6bae3SYan, Zheng 		ceph_pagelist_release(req->r_pagelist);
11212d332d5bSJeff Layton 	kfree(req->r_fscrypt_auth);
112224865e75SJeff Layton 	kfree(req->r_altname);
11232f2dc053SSage Weil 	put_request_session(req);
112437151668SYehuda Sadeh 	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
1125428138c9SYan, Zheng 	WARN_ON_ONCE(!list_empty(&req->r_wait));
1126058daab7SJeff Layton 	kmem_cache_free(ceph_mds_request_cachep, req);
11272f2dc053SSage Weil }
11282f2dc053SSage Weil 
/* rbtree helpers for requests, keyed by r_tid and linked through r_node */
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)

/*
 * lookup request by tid, bump ref if found.
 *
 * Returns the request, or NULL if the tree has no entry for @tid.
 *
 * called under mdsc->mutex.
 */
static struct ceph_mds_request *
lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{
	struct ceph_mds_request *req = lookup_request(&mdsc->request_tree, tid);

	if (!req)
		return NULL;

	ceph_mdsc_get_request(req);
	return req;
}
11472f2dc053SSage Weil 
11482f2dc053SSage Weil /*
11492f2dc053SSage Weil  * Register an in-flight request, and assign a tid.  Link to directory
11502f2dc053SSage Weil  * are modifying (if any).
11512f2dc053SSage Weil  *
11522f2dc053SSage Weil  * Called under mdsc->mutex.
11532f2dc053SSage Weil  */
__register_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req,struct inode * dir)11542f2dc053SSage Weil static void __register_request(struct ceph_mds_client *mdsc,
11552f2dc053SSage Weil 			       struct ceph_mds_request *req,
11562f2dc053SSage Weil 			       struct inode *dir)
11572f2dc053SSage Weil {
1158e30ee581SZhi Zhang 	int ret = 0;
1159e30ee581SZhi Zhang 
11602f2dc053SSage Weil 	req->r_tid = ++mdsc->last_tid;
1161e30ee581SZhi Zhang 	if (req->r_num_caps) {
1162e30ee581SZhi Zhang 		ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
116337151668SYehuda Sadeh 					req->r_num_caps);
1164e30ee581SZhi Zhang 		if (ret < 0) {
1165e30ee581SZhi Zhang 			pr_err("__register_request %p "
1166e30ee581SZhi Zhang 			       "failed to reserve caps: %d\n", req, ret);
1167e30ee581SZhi Zhang 			/* set req->r_err to fail early from __do_request */
1168e30ee581SZhi Zhang 			req->r_err = ret;
1169e30ee581SZhi Zhang 			return;
1170e30ee581SZhi Zhang 		}
1171e30ee581SZhi Zhang 	}
11722f2dc053SSage Weil 	dout("__register_request %p tid %lld\n", req, req->r_tid);
11732f2dc053SSage Weil 	ceph_mdsc_get_request(req);
1174fcd00b68SIlya Dryomov 	insert_request(&mdsc->request_tree, req);
11752f2dc053SSage Weil 
11767fe0cdebSJeff Layton 	req->r_cred = get_current_cred();
1177cb4276ccSSage Weil 
1178e8a7b8b1SYan, Zheng 	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
1179e8a7b8b1SYan, Zheng 		mdsc->oldest_tid = req->r_tid;
1180e8a7b8b1SYan, Zheng 
11812f2dc053SSage Weil 	if (dir) {
11823db0a2fcSJeff Layton 		struct ceph_inode_info *ci = ceph_inode(dir);
11833db0a2fcSJeff Layton 
11843b663780SSage Weil 		ihold(dir);
11852f2dc053SSage Weil 		req->r_unsafe_dir = dir;
11863db0a2fcSJeff Layton 		spin_lock(&ci->i_unsafe_lock);
11873db0a2fcSJeff Layton 		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
11883db0a2fcSJeff Layton 		spin_unlock(&ci->i_unsafe_lock);
11892f2dc053SSage Weil 	}
11902f2dc053SSage Weil }
11912f2dc053SSage Weil 
__unregister_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)11922f2dc053SSage Weil static void __unregister_request(struct ceph_mds_client *mdsc,
11932f2dc053SSage Weil 				 struct ceph_mds_request *req)
11942f2dc053SSage Weil {
11952f2dc053SSage Weil 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
1196e8a7b8b1SYan, Zheng 
1197df963ea8SJeff Layton 	/* Never leave an unregistered request on an unsafe list! */
1198df963ea8SJeff Layton 	list_del_init(&req->r_unsafe_item);
1199df963ea8SJeff Layton 
1200e8a7b8b1SYan, Zheng 	if (req->r_tid == mdsc->oldest_tid) {
1201e8a7b8b1SYan, Zheng 		struct rb_node *p = rb_next(&req->r_node);
1202e8a7b8b1SYan, Zheng 		mdsc->oldest_tid = 0;
1203e8a7b8b1SYan, Zheng 		while (p) {
1204e8a7b8b1SYan, Zheng 			struct ceph_mds_request *next_req =
1205e8a7b8b1SYan, Zheng 				rb_entry(p, struct ceph_mds_request, r_node);
1206e8a7b8b1SYan, Zheng 			if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
1207e8a7b8b1SYan, Zheng 				mdsc->oldest_tid = next_req->r_tid;
1208e8a7b8b1SYan, Zheng 				break;
1209e8a7b8b1SYan, Zheng 			}
1210e8a7b8b1SYan, Zheng 			p = rb_next(p);
1211e8a7b8b1SYan, Zheng 		}
1212e8a7b8b1SYan, Zheng 	}
1213e8a7b8b1SYan, Zheng 
1214fcd00b68SIlya Dryomov 	erase_request(&mdsc->request_tree, req);
12152f2dc053SSage Weil 
12163db0a2fcSJeff Layton 	if (req->r_unsafe_dir) {
12172f2dc053SSage Weil 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
12182f2dc053SSage Weil 		spin_lock(&ci->i_unsafe_lock);
12192f2dc053SSage Weil 		list_del_init(&req->r_unsafe_dir_item);
12202f2dc053SSage Weil 		spin_unlock(&ci->i_unsafe_lock);
12214c06ace8SYan, Zheng 	}
1222bc2de10dSJeff Layton 	if (req->r_target_inode &&
1223bc2de10dSJeff Layton 	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
122468cd5b4bSYan, Zheng 		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
122568cd5b4bSYan, Zheng 		spin_lock(&ci->i_unsafe_lock);
122668cd5b4bSYan, Zheng 		list_del_init(&req->r_unsafe_target_item);
122768cd5b4bSYan, Zheng 		spin_unlock(&ci->i_unsafe_lock);
122868cd5b4bSYan, Zheng 	}
12293b663780SSage Weil 
12304c06ace8SYan, Zheng 	if (req->r_unsafe_dir) {
123123c2c76eSJeff Layton 		iput(req->r_unsafe_dir);
12323b663780SSage Weil 		req->r_unsafe_dir = NULL;
12332f2dc053SSage Weil 	}
123494aa8ae1SSage Weil 
1235fc55d2c9SYan, Zheng 	complete_all(&req->r_safe_completion);
1236fc55d2c9SYan, Zheng 
123794aa8ae1SSage Weil 	ceph_mdsc_put_request(req);
12382f2dc053SSage Weil }
12392f2dc053SSage Weil 
12402f2dc053SSage Weil /*
124130c71233SJeff Layton  * Walk back up the dentry tree until we hit a dentry representing a
124230c71233SJeff Layton  * non-snapshot inode. We do this using the rcu_read_lock (which must be held
124330c71233SJeff Layton  * when calling this) to ensure that the objects won't disappear while we're
124430c71233SJeff Layton  * working with them. Once we hit a candidate dentry, we attempt to take a
124530c71233SJeff Layton  * reference to it, and return that as the result.
124630c71233SJeff Layton  */
get_nonsnap_parent(struct dentry * dentry)1247f1075480SDan Carpenter static struct inode *get_nonsnap_parent(struct dentry *dentry)
1248f1075480SDan Carpenter {
1249f1075480SDan Carpenter 	struct inode *inode = NULL;
125030c71233SJeff Layton 
125130c71233SJeff Layton 	while (dentry && !IS_ROOT(dentry)) {
125230c71233SJeff Layton 		inode = d_inode_rcu(dentry);
125330c71233SJeff Layton 		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
125430c71233SJeff Layton 			break;
125530c71233SJeff Layton 		dentry = dentry->d_parent;
125630c71233SJeff Layton 	}
125730c71233SJeff Layton 	if (inode)
125830c71233SJeff Layton 		inode = igrab(inode);
125930c71233SJeff Layton 	return inode;
126030c71233SJeff Layton }
126130c71233SJeff Layton 
126230c71233SJeff Layton /*
12632f2dc053SSage Weil  * Choose mds to send request to next.  If there is a hint set in the
12642f2dc053SSage Weil  * request (e.g., due to a prior forward hint from the mds), use that.
12652f2dc053SSage Weil  * Otherwise, consult frag tree and/or caps to identify the
12662f2dc053SSage Weil  * appropriate mds.  If all else fails, choose randomly.
12672f2dc053SSage Weil  *
12682f2dc053SSage Weil  * Called under mdsc->mutex.
12692f2dc053SSage Weil  */
__choose_mds(struct ceph_mds_client * mdsc,struct ceph_mds_request * req,bool * random)12702f2dc053SSage Weil static int __choose_mds(struct ceph_mds_client *mdsc,
1271c4853e97SXiubo Li 			struct ceph_mds_request *req,
1272c4853e97SXiubo Li 			bool *random)
12732f2dc053SSage Weil {
12742f2dc053SSage Weil 	struct inode *inode;
12752f2dc053SSage Weil 	struct ceph_inode_info *ci;
12762f2dc053SSage Weil 	struct ceph_cap *cap;
12772f2dc053SSage Weil 	int mode = req->r_direct_mode;
12782f2dc053SSage Weil 	int mds = -1;
12792f2dc053SSage Weil 	u32 hash = req->r_direct_hash;
1280bc2de10dSJeff Layton 	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
12812f2dc053SSage Weil 
1282c4853e97SXiubo Li 	if (random)
1283c4853e97SXiubo Li 		*random = false;
1284c4853e97SXiubo Li 
12852f2dc053SSage Weil 	/*
12862f2dc053SSage Weil 	 * is there a specific mds we should try?  ignore hint if we have
12872f2dc053SSage Weil 	 * no session and the mds is not up (active or recovering).
12882f2dc053SSage Weil 	 */
12892f2dc053SSage Weil 	if (req->r_resend_mds >= 0 &&
12902f2dc053SSage Weil 	    (__have_session(mdsc, req->r_resend_mds) ||
12912f2dc053SSage Weil 	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
12923c802092SXiubo Li 		dout("%s using resend_mds mds%d\n", __func__,
12932f2dc053SSage Weil 		     req->r_resend_mds);
12942f2dc053SSage Weil 		return req->r_resend_mds;
12952f2dc053SSage Weil 	}
12962f2dc053SSage Weil 
12972f2dc053SSage Weil 	if (mode == USE_RANDOM_MDS)
12982f2dc053SSage Weil 		goto random;
12992f2dc053SSage Weil 
13002f2dc053SSage Weil 	inode = NULL;
13012f2dc053SSage Weil 	if (req->r_inode) {
13025d37ca14SYan, Zheng 		if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
13032f2dc053SSage Weil 			inode = req->r_inode;
130430c71233SJeff Layton 			ihold(inode);
13055d37ca14SYan, Zheng 		} else {
130638f340ccSYan, Zheng 			/* req->r_dentry is non-null for LSSNAP request */
130738f340ccSYan, Zheng 			rcu_read_lock();
130838f340ccSYan, Zheng 			inode = get_nonsnap_parent(req->r_dentry);
130938f340ccSYan, Zheng 			rcu_read_unlock();
13103c802092SXiubo Li 			dout("%s using snapdir's parent %p\n", __func__, inode);
13115d37ca14SYan, Zheng 		}
131238f340ccSYan, Zheng 	} else if (req->r_dentry) {
1313d79698daSSage Weil 		/* ignore race with rename; old or new d_parent is okay */
131430c71233SJeff Layton 		struct dentry *parent;
131530c71233SJeff Layton 		struct inode *dir;
1316eb6bb1c5SSage Weil 
131730c71233SJeff Layton 		rcu_read_lock();
131841883ba8SYan, Zheng 		parent = READ_ONCE(req->r_dentry->d_parent);
13193dd69aabSJeff Layton 		dir = req->r_parent ? : d_inode_rcu(parent);
132030c71233SJeff Layton 
132130c71233SJeff Layton 		if (!dir || dir->i_sb != mdsc->fsc->sb) {
132230c71233SJeff Layton 			/*  not this fs or parent went negative */
13232b0143b5SDavid Howells 			inode = d_inode(req->r_dentry);
132430c71233SJeff Layton 			if (inode)
132530c71233SJeff Layton 				ihold(inode);
1326eb6bb1c5SSage Weil 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
1327eb6bb1c5SSage Weil 			/* direct snapped/virtual snapdir requests
1328eb6bb1c5SSage Weil 			 * based on parent dir inode */
132930c71233SJeff Layton 			inode = get_nonsnap_parent(parent);
13303c802092SXiubo Li 			dout("%s using nonsnap parent %p\n", __func__, inode);
1331ca18bedeSYan, Zheng 		} else {
1332eb6bb1c5SSage Weil 			/* dentry target */
13332b0143b5SDavid Howells 			inode = d_inode(req->r_dentry);
1334ca18bedeSYan, Zheng 			if (!inode || mode == USE_AUTH_MDS) {
1335eb6bb1c5SSage Weil 				/* dir + name */
133630c71233SJeff Layton 				inode = igrab(dir);
1337e5f86dc3SSage Weil 				hash = ceph_dentry_hash(dir, req->r_dentry);
13382f2dc053SSage Weil 				is_hash = true;
133930c71233SJeff Layton 			} else {
134030c71233SJeff Layton 				ihold(inode);
13412f2dc053SSage Weil 			}
13422f2dc053SSage Weil 		}
134330c71233SJeff Layton 		rcu_read_unlock();
1344ca18bedeSYan, Zheng 	}
1345eb6bb1c5SSage Weil 
13463c802092SXiubo Li 	dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
13473c802092SXiubo Li 	     hash, mode);
13482f2dc053SSage Weil 	if (!inode)
13492f2dc053SSage Weil 		goto random;
13502f2dc053SSage Weil 	ci = ceph_inode(inode);
13512f2dc053SSage Weil 
13522f2dc053SSage Weil 	if (is_hash && S_ISDIR(inode->i_mode)) {
13532f2dc053SSage Weil 		struct ceph_inode_frag frag;
13542f2dc053SSage Weil 		int found;
13552f2dc053SSage Weil 
13562f2dc053SSage Weil 		ceph_choose_frag(ci, hash, &frag, &found);
13572f2dc053SSage Weil 		if (found) {
13582f2dc053SSage Weil 			if (mode == USE_ANY_MDS && frag.ndist > 0) {
13592f2dc053SSage Weil 				u8 r;
13602f2dc053SSage Weil 
13612f2dc053SSage Weil 				/* choose a random replica */
13622f2dc053SSage Weil 				get_random_bytes(&r, 1);
13632f2dc053SSage Weil 				r %= frag.ndist;
13642f2dc053SSage Weil 				mds = frag.dist[r];
13653c802092SXiubo Li 				dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
13663c802092SXiubo Li 				     __func__, inode, ceph_vinop(inode),
13673c802092SXiubo Li 				     frag.frag, mds, (int)r, frag.ndist);
1368d66bbd44SSage Weil 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
13695d47648fSXiubo Li 				    CEPH_MDS_STATE_ACTIVE &&
13705d47648fSXiubo Li 				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
137130c71233SJeff Layton 					goto out;
13722f2dc053SSage Weil 			}
13732f2dc053SSage Weil 
13742f2dc053SSage Weil 			/* since this file/dir wasn't known to be
13752f2dc053SSage Weil 			 * replicated, then we want to look for the
13762f2dc053SSage Weil 			 * authoritative mds. */
13772f2dc053SSage Weil 			if (frag.mds >= 0) {
13782f2dc053SSage Weil 				/* choose auth mds */
13792f2dc053SSage Weil 				mds = frag.mds;
13803c802092SXiubo Li 				dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
13813c802092SXiubo Li 				     __func__, inode, ceph_vinop(inode),
13823c802092SXiubo Li 				     frag.frag, mds);
1383d66bbd44SSage Weil 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
13845d47648fSXiubo Li 				    CEPH_MDS_STATE_ACTIVE) {
1385224c7b67SYanhu Cao 					if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
13865d47648fSXiubo Li 								  mds))
138730c71233SJeff Layton 						goto out;
13882f2dc053SSage Weil 				}
13892f2dc053SSage Weil 			}
13905d47648fSXiubo Li 			mode = USE_AUTH_MDS;
13915d47648fSXiubo Li 		}
13922f2dc053SSage Weil 	}
13932f2dc053SSage Weil 
1394be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
13952f2dc053SSage Weil 	cap = NULL;
13962f2dc053SSage Weil 	if (mode == USE_AUTH_MDS)
13972f2dc053SSage Weil 		cap = ci->i_auth_cap;
13982f2dc053SSage Weil 	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
13992f2dc053SSage Weil 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
14002f2dc053SSage Weil 	if (!cap) {
1401be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
140223c2c76eSJeff Layton 		iput(inode);
14032f2dc053SSage Weil 		goto random;
14042f2dc053SSage Weil 	}
14052f2dc053SSage Weil 	mds = cap->session->s_mds;
14063c802092SXiubo Li 	dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
14072f2dc053SSage Weil 	     inode, ceph_vinop(inode), mds,
14082f2dc053SSage Weil 	     cap == ci->i_auth_cap ? "auth " : "", cap);
1409be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
141030c71233SJeff Layton out:
141123c2c76eSJeff Layton 	iput(inode);
14122f2dc053SSage Weil 	return mds;
14132f2dc053SSage Weil 
14142f2dc053SSage Weil random:
1415c4853e97SXiubo Li 	if (random)
1416c4853e97SXiubo Li 		*random = true;
1417c4853e97SXiubo Li 
14182f2dc053SSage Weil 	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
14193c802092SXiubo Li 	dout("%s chose random mds%d\n", __func__, mds);
14202f2dc053SSage Weil 	return mds;
14212f2dc053SSage Weil }
14222f2dc053SSage Weil 
14232f2dc053SSage Weil 
14242f2dc053SSage Weil /*
14252f2dc053SSage Weil  * session messages
14262f2dc053SSage Weil  */
ceph_create_session_msg(u32 op,u64 seq)1427fba97e80SXiubo Li struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
14282f2dc053SSage Weil {
14292f2dc053SSage Weil 	struct ceph_msg *msg;
14302f2dc053SSage Weil 	struct ceph_mds_session_head *h;
14312f2dc053SSage Weil 
1432b61c2763SSage Weil 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
1433b61c2763SSage Weil 			   false);
1434a79832f2SSage Weil 	if (!msg) {
1435fba97e80SXiubo Li 		pr_err("ENOMEM creating session %s msg\n",
1436fba97e80SXiubo Li 		       ceph_session_op_name(op));
1437a79832f2SSage Weil 		return NULL;
14382f2dc053SSage Weil 	}
14392f2dc053SSage Weil 	h = msg->front.iov_base;
14402f2dc053SSage Weil 	h->op = cpu_to_le32(op);
14412f2dc053SSage Weil 	h->seq = cpu_to_le64(seq);
1442dbd0c8bfSJohn Spray 
1443dbd0c8bfSJohn Spray 	return msg;
1444dbd0c8bfSJohn Spray }
1445dbd0c8bfSJohn Spray 
14469ba1e224SXiubo Li static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
14479ba1e224SXiubo Li #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
encode_supported_features(void ** p,void * end)1448b682c6d4SXiubo Li static int encode_supported_features(void **p, void *end)
1449342ce182SYan, Zheng {
14509ba1e224SXiubo Li 	static const size_t count = ARRAY_SIZE(feature_bits);
1451342ce182SYan, Zheng 
1452342ce182SYan, Zheng 	if (count > 0) {
1453342ce182SYan, Zheng 		size_t i;
14549ba1e224SXiubo Li 		size_t size = FEATURE_BYTES(count);
1455fea013e0SLuís Henriques 		unsigned long bit;
1456342ce182SYan, Zheng 
1457b682c6d4SXiubo Li 		if (WARN_ON_ONCE(*p + 4 + size > end))
1458b682c6d4SXiubo Li 			return -ERANGE;
1459b682c6d4SXiubo Li 
1460342ce182SYan, Zheng 		ceph_encode_32(p, size);
1461342ce182SYan, Zheng 		memset(*p, 0, size);
1462fea013e0SLuís Henriques 		for (i = 0; i < count; i++) {
1463fea013e0SLuís Henriques 			bit = feature_bits[i];
1464fea013e0SLuís Henriques 			((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
1465fea013e0SLuís Henriques 		}
1466342ce182SYan, Zheng 		*p += size;
1467342ce182SYan, Zheng 	} else {
1468b682c6d4SXiubo Li 		if (WARN_ON_ONCE(*p + 4 > end))
1469b682c6d4SXiubo Li 			return -ERANGE;
1470b682c6d4SXiubo Li 
1471342ce182SYan, Zheng 		ceph_encode_32(p, 0);
1472342ce182SYan, Zheng 	}
1473b682c6d4SXiubo Li 
1474b682c6d4SXiubo Li 	return 0;
1475342ce182SYan, Zheng }
1476342ce182SYan, Zheng 
14773b4168ddSXiubo Li static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
14783b4168ddSXiubo Li #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
encode_metric_spec(void ** p,void * end)14793b4168ddSXiubo Li static int encode_metric_spec(void **p, void *end)
14803b4168ddSXiubo Li {
14813b4168ddSXiubo Li 	static const size_t count = ARRAY_SIZE(metric_bits);
14823b4168ddSXiubo Li 
14833b4168ddSXiubo Li 	/* header */
14843b4168ddSXiubo Li 	if (WARN_ON_ONCE(*p + 2 > end))
14853b4168ddSXiubo Li 		return -ERANGE;
14863b4168ddSXiubo Li 
14873b4168ddSXiubo Li 	ceph_encode_8(p, 1); /* version */
14883b4168ddSXiubo Li 	ceph_encode_8(p, 1); /* compat */
14893b4168ddSXiubo Li 
14903b4168ddSXiubo Li 	if (count > 0) {
14913b4168ddSXiubo Li 		size_t i;
14923b4168ddSXiubo Li 		size_t size = METRIC_BYTES(count);
14933b4168ddSXiubo Li 
14943b4168ddSXiubo Li 		if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
14953b4168ddSXiubo Li 			return -ERANGE;
14963b4168ddSXiubo Li 
14973b4168ddSXiubo Li 		/* metric spec info length */
14983b4168ddSXiubo Li 		ceph_encode_32(p, 4 + size);
14993b4168ddSXiubo Li 
15003b4168ddSXiubo Li 		/* metric spec */
15013b4168ddSXiubo Li 		ceph_encode_32(p, size);
15023b4168ddSXiubo Li 		memset(*p, 0, size);
15033b4168ddSXiubo Li 		for (i = 0; i < count; i++)
15043b4168ddSXiubo Li 			((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
15053b4168ddSXiubo Li 		*p += size;
15063b4168ddSXiubo Li 	} else {
15073b4168ddSXiubo Li 		if (WARN_ON_ONCE(*p + 4 + 4 > end))
15083b4168ddSXiubo Li 			return -ERANGE;
15093b4168ddSXiubo Li 
15103b4168ddSXiubo Li 		/* metric spec info length */
15113b4168ddSXiubo Li 		ceph_encode_32(p, 4);
15123b4168ddSXiubo Li 		/* metric spec */
15133b4168ddSXiubo Li 		ceph_encode_32(p, 0);
15143b4168ddSXiubo Li 	}
15153b4168ddSXiubo Li 
15163b4168ddSXiubo Li 	return 0;
15173b4168ddSXiubo Li }
15183b4168ddSXiubo Li 
1519dbd0c8bfSJohn Spray /*
1520dbd0c8bfSJohn Spray  * session message, specialization for CEPH_SESSION_REQUEST_OPEN
1521dbd0c8bfSJohn Spray  * to include additional client metadata fields.
1522dbd0c8bfSJohn Spray  */
create_session_open_msg(struct ceph_mds_client * mdsc,u64 seq)1523dbd0c8bfSJohn Spray static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
1524dbd0c8bfSJohn Spray {
1525dbd0c8bfSJohn Spray 	struct ceph_msg *msg;
1526dbd0c8bfSJohn Spray 	struct ceph_mds_session_head *h;
15274a756db2SColin Ian King 	int i;
1528342ce182SYan, Zheng 	int extra_bytes = 0;
1529dbd0c8bfSJohn Spray 	int metadata_key_count = 0;
1530dbd0c8bfSJohn Spray 	struct ceph_options *opt = mdsc->fsc->client->options;
15313f384954SYan, Zheng 	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
15329ba1e224SXiubo Li 	size_t size, count;
1533342ce182SYan, Zheng 	void *p, *end;
1534b682c6d4SXiubo Li 	int ret;
1535dbd0c8bfSJohn Spray 
1536a6a5ce4fSYan, Zheng 	const char* metadata[][2] = {
1537717e6f28SYan, Zheng 		{"hostname", mdsc->nodename},
1538717e6f28SYan, Zheng 		{"kernel_version", init_utsname()->release},
15393f384954SYan, Zheng 		{"entity_id", opt->name ? : ""},
15403f384954SYan, Zheng 		{"root", fsopt->server_path ? : "/"},
1541dbd0c8bfSJohn Spray 		{NULL, NULL}
1542dbd0c8bfSJohn Spray 	};
1543dbd0c8bfSJohn Spray 
1544dbd0c8bfSJohn Spray 	/* Calculate serialized length of metadata */
1545342ce182SYan, Zheng 	extra_bytes = 4;  /* map length */
1546d37b1d99SMarkus Elfring 	for (i = 0; metadata[i][0]; ++i) {
1547342ce182SYan, Zheng 		extra_bytes += 8 + strlen(metadata[i][0]) +
1548dbd0c8bfSJohn Spray 			strlen(metadata[i][1]);
1549dbd0c8bfSJohn Spray 		metadata_key_count++;
1550dbd0c8bfSJohn Spray 	}
15519ba1e224SXiubo Li 
1552342ce182SYan, Zheng 	/* supported feature */
15539ba1e224SXiubo Li 	size = 0;
15549ba1e224SXiubo Li 	count = ARRAY_SIZE(feature_bits);
15559ba1e224SXiubo Li 	if (count > 0)
15569ba1e224SXiubo Li 		size = FEATURE_BYTES(count);
15579ba1e224SXiubo Li 	extra_bytes += 4 + size;
1558dbd0c8bfSJohn Spray 
15593b4168ddSXiubo Li 	/* metric spec */
15603b4168ddSXiubo Li 	size = 0;
15613b4168ddSXiubo Li 	count = ARRAY_SIZE(metric_bits);
15623b4168ddSXiubo Li 	if (count > 0)
15633b4168ddSXiubo Li 		size = METRIC_BYTES(count);
15643b4168ddSXiubo Li 	extra_bytes += 2 + 4 + 4 + size;
15653b4168ddSXiubo Li 
1566dbd0c8bfSJohn Spray 	/* Allocate the message */
1567342ce182SYan, Zheng 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
1568dbd0c8bfSJohn Spray 			   GFP_NOFS, false);
1569dbd0c8bfSJohn Spray 	if (!msg) {
1570fba97e80SXiubo Li 		pr_err("ENOMEM creating session open msg\n");
1571b682c6d4SXiubo Li 		return ERR_PTR(-ENOMEM);
1572dbd0c8bfSJohn Spray 	}
1573342ce182SYan, Zheng 	p = msg->front.iov_base;
1574342ce182SYan, Zheng 	end = p + msg->front.iov_len;
1575342ce182SYan, Zheng 
1576342ce182SYan, Zheng 	h = p;
1577dbd0c8bfSJohn Spray 	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
1578dbd0c8bfSJohn Spray 	h->seq = cpu_to_le64(seq);
1579dbd0c8bfSJohn Spray 
1580dbd0c8bfSJohn Spray 	/*
1581dbd0c8bfSJohn Spray 	 * Serialize client metadata into waiting buffer space, using
1582dbd0c8bfSJohn Spray 	 * the format that userspace expects for map<string, string>
15837cfa0313SJohn Spray 	 *
15843b4168ddSXiubo Li 	 * ClientSession messages with metadata are v4
1585dbd0c8bfSJohn Spray 	 */
15863b4168ddSXiubo Li 	msg->hdr.version = cpu_to_le16(4);
15877cfa0313SJohn Spray 	msg->hdr.compat_version = cpu_to_le16(1);
1588dbd0c8bfSJohn Spray 
1589dbd0c8bfSJohn Spray 	/* The write pointer, following the session_head structure */
1590342ce182SYan, Zheng 	p += sizeof(*h);
1591dbd0c8bfSJohn Spray 
1592dbd0c8bfSJohn Spray 	/* Number of entries in the map */
1593dbd0c8bfSJohn Spray 	ceph_encode_32(&p, metadata_key_count);
1594dbd0c8bfSJohn Spray 
1595dbd0c8bfSJohn Spray 	/* Two length-prefixed strings for each entry in the map */
1596d37b1d99SMarkus Elfring 	for (i = 0; metadata[i][0]; ++i) {
1597dbd0c8bfSJohn Spray 		size_t const key_len = strlen(metadata[i][0]);
1598dbd0c8bfSJohn Spray 		size_t const val_len = strlen(metadata[i][1]);
1599dbd0c8bfSJohn Spray 
1600dbd0c8bfSJohn Spray 		ceph_encode_32(&p, key_len);
1601dbd0c8bfSJohn Spray 		memcpy(p, metadata[i][0], key_len);
1602dbd0c8bfSJohn Spray 		p += key_len;
1603dbd0c8bfSJohn Spray 		ceph_encode_32(&p, val_len);
1604dbd0c8bfSJohn Spray 		memcpy(p, metadata[i][1], val_len);
1605dbd0c8bfSJohn Spray 		p += val_len;
1606dbd0c8bfSJohn Spray 	}
1607dbd0c8bfSJohn Spray 
1608b682c6d4SXiubo Li 	ret = encode_supported_features(&p, end);
1609b682c6d4SXiubo Li 	if (ret) {
1610b682c6d4SXiubo Li 		pr_err("encode_supported_features failed!\n");
1611b682c6d4SXiubo Li 		ceph_msg_put(msg);
1612b682c6d4SXiubo Li 		return ERR_PTR(ret);
1613b682c6d4SXiubo Li 	}
1614b682c6d4SXiubo Li 
16153b4168ddSXiubo Li 	ret = encode_metric_spec(&p, end);
16163b4168ddSXiubo Li 	if (ret) {
16173b4168ddSXiubo Li 		pr_err("encode_metric_spec failed!\n");
16183b4168ddSXiubo Li 		ceph_msg_put(msg);
16193b4168ddSXiubo Li 		return ERR_PTR(ret);
16203b4168ddSXiubo Li 	}
16213b4168ddSXiubo Li 
1622342ce182SYan, Zheng 	msg->front.iov_len = p - msg->front.iov_base;
1623342ce182SYan, Zheng 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1624342ce182SYan, Zheng 
16252f2dc053SSage Weil 	return msg;
16262f2dc053SSage Weil }
16272f2dc053SSage Weil 
16282f2dc053SSage Weil /*
16292f2dc053SSage Weil  * send session open request.
16302f2dc053SSage Weil  *
16312f2dc053SSage Weil  * called under mdsc->mutex
16322f2dc053SSage Weil  */
__open_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)16332f2dc053SSage Weil static int __open_session(struct ceph_mds_client *mdsc,
16342f2dc053SSage Weil 			  struct ceph_mds_session *session)
16352f2dc053SSage Weil {
16362f2dc053SSage Weil 	struct ceph_msg *msg;
16372f2dc053SSage Weil 	int mstate;
16382f2dc053SSage Weil 	int mds = session->s_mds;
16392f2dc053SSage Weil 
1640a68e564aSXiubo Li 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
1641a68e564aSXiubo Li 		return -EIO;
1642a68e564aSXiubo Li 
16432f2dc053SSage Weil 	/* wait for mds to go active? */
16442f2dc053SSage Weil 	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
16452f2dc053SSage Weil 	dout("open_session to mds%d (%s)\n", mds,
16462f2dc053SSage Weil 	     ceph_mds_state_name(mstate));
16472f2dc053SSage Weil 	session->s_state = CEPH_MDS_SESSION_OPENING;
16482f2dc053SSage Weil 	session->s_renew_requested = jiffies;
16492f2dc053SSage Weil 
16502f2dc053SSage Weil 	/* send connect message */
1651dbd0c8bfSJohn Spray 	msg = create_session_open_msg(mdsc, session->s_seq);
1652b682c6d4SXiubo Li 	if (IS_ERR(msg))
1653b682c6d4SXiubo Li 		return PTR_ERR(msg);
16542f2dc053SSage Weil 	ceph_con_send(&session->s_con, msg);
16552f2dc053SSage Weil 	return 0;
16562f2dc053SSage Weil }
16572f2dc053SSage Weil 
16582f2dc053SSage Weil /*
1659ed0552a1SSage Weil  * open sessions for any export targets for the given mds
1660ed0552a1SSage Weil  *
1661ed0552a1SSage Weil  * called under mdsc->mutex
1662ed0552a1SSage Weil  */
16635d72d13cSYan, Zheng static struct ceph_mds_session *
__open_export_target_session(struct ceph_mds_client * mdsc,int target)16645d72d13cSYan, Zheng __open_export_target_session(struct ceph_mds_client *mdsc, int target)
16655d72d13cSYan, Zheng {
16665d72d13cSYan, Zheng 	struct ceph_mds_session *session;
1667b682c6d4SXiubo Li 	int ret;
16685d72d13cSYan, Zheng 
16695d72d13cSYan, Zheng 	session = __ceph_lookup_mds_session(mdsc, target);
16705d72d13cSYan, Zheng 	if (!session) {
16715d72d13cSYan, Zheng 		session = register_session(mdsc, target);
16725d72d13cSYan, Zheng 		if (IS_ERR(session))
16735d72d13cSYan, Zheng 			return session;
16745d72d13cSYan, Zheng 	}
16755d72d13cSYan, Zheng 	if (session->s_state == CEPH_MDS_SESSION_NEW ||
1676b682c6d4SXiubo Li 	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
1677b682c6d4SXiubo Li 		ret = __open_session(mdsc, session);
1678b682c6d4SXiubo Li 		if (ret)
1679b682c6d4SXiubo Li 			return ERR_PTR(ret);
1680b682c6d4SXiubo Li 	}
16815d72d13cSYan, Zheng 
16825d72d13cSYan, Zheng 	return session;
16835d72d13cSYan, Zheng }
16845d72d13cSYan, Zheng 
16855d72d13cSYan, Zheng struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client * mdsc,int target)16865d72d13cSYan, Zheng ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
16875d72d13cSYan, Zheng {
16885d72d13cSYan, Zheng 	struct ceph_mds_session *session;
16895d72d13cSYan, Zheng 
16905d72d13cSYan, Zheng 	dout("open_export_target_session to mds%d\n", target);
16915d72d13cSYan, Zheng 
16925d72d13cSYan, Zheng 	mutex_lock(&mdsc->mutex);
16935d72d13cSYan, Zheng 	session = __open_export_target_session(mdsc, target);
16945d72d13cSYan, Zheng 	mutex_unlock(&mdsc->mutex);
16955d72d13cSYan, Zheng 
16965d72d13cSYan, Zheng 	return session;
16975d72d13cSYan, Zheng }
16985d72d13cSYan, Zheng 
__open_export_target_sessions(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)1699ed0552a1SSage Weil static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
1700ed0552a1SSage Weil 					  struct ceph_mds_session *session)
1701ed0552a1SSage Weil {
1702ed0552a1SSage Weil 	struct ceph_mds_info *mi;
1703ed0552a1SSage Weil 	struct ceph_mds_session *ts;
1704ed0552a1SSage Weil 	int i, mds = session->s_mds;
1705ed0552a1SSage Weil 
1706b38c9eb4SXiubo Li 	if (mds >= mdsc->mdsmap->possible_max_rank)
1707ed0552a1SSage Weil 		return;
17085d72d13cSYan, Zheng 
1709ed0552a1SSage Weil 	mi = &mdsc->mdsmap->m_info[mds];
1710ed0552a1SSage Weil 	dout("open_export_target_sessions for mds%d (%d targets)\n",
1711ed0552a1SSage Weil 	     session->s_mds, mi->num_export_targets);
1712ed0552a1SSage Weil 
1713ed0552a1SSage Weil 	for (i = 0; i < mi->num_export_targets; i++) {
17145d72d13cSYan, Zheng 		ts = __open_export_target_session(mdsc, mi->export_targets[i]);
1715ed0552a1SSage Weil 		ceph_put_mds_session(ts);
1716ed0552a1SSage Weil 	}
1717ed0552a1SSage Weil }
1718ed0552a1SSage Weil 
ceph_mdsc_open_export_target_sessions(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)1719154f42c2SSage Weil void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
1720154f42c2SSage Weil 					   struct ceph_mds_session *session)
1721154f42c2SSage Weil {
1722154f42c2SSage Weil 	mutex_lock(&mdsc->mutex);
1723154f42c2SSage Weil 	__open_export_target_sessions(mdsc, session);
1724154f42c2SSage Weil 	mutex_unlock(&mdsc->mutex);
1725154f42c2SSage Weil }
1726154f42c2SSage Weil 
1727ed0552a1SSage Weil /*
17282f2dc053SSage Weil  * session caps
17292f2dc053SSage Weil  */
17302f2dc053SSage Weil 
detach_cap_releases(struct ceph_mds_session * session,struct list_head * target)1731c8a96a31SJeff Layton static void detach_cap_releases(struct ceph_mds_session *session,
1732c8a96a31SJeff Layton 				struct list_head *target)
17332f2dc053SSage Weil {
1734c8a96a31SJeff Layton 	lockdep_assert_held(&session->s_cap_lock);
1735745a8e3bSYan, Zheng 
1736c8a96a31SJeff Layton 	list_splice_init(&session->s_cap_releases, target);
1737c8a96a31SJeff Layton 	session->s_num_cap_releases = 0;
1738c8a96a31SJeff Layton 	dout("dispose_cap_releases mds%d\n", session->s_mds);
1739c8a96a31SJeff Layton }
1740c8a96a31SJeff Layton 
dispose_cap_releases(struct ceph_mds_client * mdsc,struct list_head * dispose)1741c8a96a31SJeff Layton static void dispose_cap_releases(struct ceph_mds_client *mdsc,
1742c8a96a31SJeff Layton 				 struct list_head *dispose)
1743c8a96a31SJeff Layton {
1744c8a96a31SJeff Layton 	while (!list_empty(dispose)) {
1745745a8e3bSYan, Zheng 		struct ceph_cap *cap;
1746745a8e3bSYan, Zheng 		/* zero out the in-progress message */
1747c8a96a31SJeff Layton 		cap = list_first_entry(dispose, struct ceph_cap, session_caps);
1748745a8e3bSYan, Zheng 		list_del(&cap->session_caps);
1749745a8e3bSYan, Zheng 		ceph_put_cap(mdsc, cap);
1750745a8e3bSYan, Zheng 	}
17512f2dc053SSage Weil }
17522f2dc053SSage Weil 
cleanup_session_requests(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)17531c841a96SYan, Zheng static void cleanup_session_requests(struct ceph_mds_client *mdsc,
17541c841a96SYan, Zheng 				     struct ceph_mds_session *session)
17551c841a96SYan, Zheng {
17561c841a96SYan, Zheng 	struct ceph_mds_request *req;
17571c841a96SYan, Zheng 	struct rb_node *p;
17581c841a96SYan, Zheng 
17591c841a96SYan, Zheng 	dout("cleanup_session_requests mds%d\n", session->s_mds);
17601c841a96SYan, Zheng 	mutex_lock(&mdsc->mutex);
17611c841a96SYan, Zheng 	while (!list_empty(&session->s_unsafe)) {
17621c841a96SYan, Zheng 		req = list_first_entry(&session->s_unsafe,
17631c841a96SYan, Zheng 				       struct ceph_mds_request, r_unsafe_item);
17643e0708b9SYan, Zheng 		pr_warn_ratelimited(" dropping unsafe request %llu\n",
17653e0708b9SYan, Zheng 				    req->r_tid);
17661bd85aa6SJeff Layton 		if (req->r_target_inode)
17671bd85aa6SJeff Layton 			mapping_set_error(req->r_target_inode->i_mapping, -EIO);
17681bd85aa6SJeff Layton 		if (req->r_unsafe_dir)
17691bd85aa6SJeff Layton 			mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
17701c841a96SYan, Zheng 		__unregister_request(mdsc, req);
17711c841a96SYan, Zheng 	}
17721c841a96SYan, Zheng 	/* zero r_attempts, so kick_requests() will re-send requests */
17731c841a96SYan, Zheng 	p = rb_first(&mdsc->request_tree);
17741c841a96SYan, Zheng 	while (p) {
17751c841a96SYan, Zheng 		req = rb_entry(p, struct ceph_mds_request, r_node);
17761c841a96SYan, Zheng 		p = rb_next(p);
17771c841a96SYan, Zheng 		if (req->r_session &&
17781c841a96SYan, Zheng 		    req->r_session->s_mds == session->s_mds)
17791c841a96SYan, Zheng 			req->r_attempts = 0;
17801c841a96SYan, Zheng 	}
17811c841a96SYan, Zheng 	mutex_unlock(&mdsc->mutex);
17821c841a96SYan, Zheng }
17831c841a96SYan, Zheng 
17842f2dc053SSage Weil /*
1785f818a736SSage Weil  * Helper to safely iterate over all caps associated with a session, with
1786f818a736SSage Weil  * special care taken to handle a racing __ceph_remove_cap().
17872f2dc053SSage Weil  *
1788f818a736SSage Weil  * Caller must hold session s_mutex.
17892f2dc053SSage Weil  */
ceph_iterate_session_caps(struct ceph_mds_session * session,int (* cb)(struct inode *,int mds,void *),void * arg)1790f5d77269SJeff Layton int ceph_iterate_session_caps(struct ceph_mds_session *session,
1791aaf67de7SXiubo Li 			      int (*cb)(struct inode *, int mds, void *),
1792aaf67de7SXiubo Li 			      void *arg)
17932f2dc053SSage Weil {
17947c1332b8SSage Weil 	struct list_head *p;
17957c1332b8SSage Weil 	struct ceph_cap *cap;
17967c1332b8SSage Weil 	struct inode *inode, *last_inode = NULL;
17977c1332b8SSage Weil 	struct ceph_cap *old_cap = NULL;
17982f2dc053SSage Weil 	int ret;
17992f2dc053SSage Weil 
18002f2dc053SSage Weil 	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
18012f2dc053SSage Weil 	spin_lock(&session->s_cap_lock);
18027c1332b8SSage Weil 	p = session->s_caps.next;
18037c1332b8SSage Weil 	while (p != &session->s_caps) {
1804aaf67de7SXiubo Li 		int mds;
1805aaf67de7SXiubo Li 
18067c1332b8SSage Weil 		cap = list_entry(p, struct ceph_cap, session_caps);
1807874c8ca1SDavid Howells 		inode = igrab(&cap->ci->netfs.inode);
18087c1332b8SSage Weil 		if (!inode) {
18097c1332b8SSage Weil 			p = p->next;
18102f2dc053SSage Weil 			continue;
18117c1332b8SSage Weil 		}
18127c1332b8SSage Weil 		session->s_cap_iterator = cap;
1813aaf67de7SXiubo Li 		mds = cap->mds;
18142f2dc053SSage Weil 		spin_unlock(&session->s_cap_lock);
18157c1332b8SSage Weil 
18167c1332b8SSage Weil 		if (last_inode) {
181723c2c76eSJeff Layton 			iput(last_inode);
18187c1332b8SSage Weil 			last_inode = NULL;
18197c1332b8SSage Weil 		}
18207c1332b8SSage Weil 		if (old_cap) {
182137151668SYehuda Sadeh 			ceph_put_cap(session->s_mdsc, old_cap);
18227c1332b8SSage Weil 			old_cap = NULL;
18237c1332b8SSage Weil 		}
18247c1332b8SSage Weil 
1825aaf67de7SXiubo Li 		ret = cb(inode, mds, arg);
18267c1332b8SSage Weil 		last_inode = inode;
18277c1332b8SSage Weil 
18282f2dc053SSage Weil 		spin_lock(&session->s_cap_lock);
18297c1332b8SSage Weil 		p = p->next;
1830d37b1d99SMarkus Elfring 		if (!cap->ci) {
18317c1332b8SSage Weil 			dout("iterate_session_caps  finishing cap %p removal\n",
18327c1332b8SSage Weil 			     cap);
18337c1332b8SSage Weil 			BUG_ON(cap->session != session);
1834745a8e3bSYan, Zheng 			cap->session = NULL;
18357c1332b8SSage Weil 			list_del_init(&cap->session_caps);
18367c1332b8SSage Weil 			session->s_nr_caps--;
18374f1d756dSXiubo Li 			atomic64_dec(&session->s_mdsc->metric.total_caps);
1838e3ec8d68SYan, Zheng 			if (cap->queue_release)
1839e3ec8d68SYan, Zheng 				__ceph_queue_cap_release(session, cap);
1840e3ec8d68SYan, Zheng 			else
18417c1332b8SSage Weil 				old_cap = cap;  /* put_cap it w/o locks held */
18427c1332b8SSage Weil 		}
18435dacf091SSage Weil 		if (ret < 0)
18445dacf091SSage Weil 			goto out;
18452f2dc053SSage Weil 	}
18465dacf091SSage Weil 	ret = 0;
18475dacf091SSage Weil out:
18487c1332b8SSage Weil 	session->s_cap_iterator = NULL;
18492f2dc053SSage Weil 	spin_unlock(&session->s_cap_lock);
18507c1332b8SSage Weil 
185123c2c76eSJeff Layton 	iput(last_inode);
18527c1332b8SSage Weil 	if (old_cap)
185337151668SYehuda Sadeh 		ceph_put_cap(session->s_mdsc, old_cap);
18547c1332b8SSage Weil 
18555dacf091SSage Weil 	return ret;
18562f2dc053SSage Weil }
18572f2dc053SSage Weil 
remove_session_caps_cb(struct inode * inode,int mds,void * arg)1858aaf67de7SXiubo Li static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
18592f2dc053SSage Weil {
18602f2dc053SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
18616c93df5dSYan, Zheng 	bool invalidate = false;
1862aaf67de7SXiubo Li 	struct ceph_cap *cap;
1863aaf67de7SXiubo Li 	int iputs = 0;
18646c99f254SSage Weil 
1865aaf67de7SXiubo Li 	spin_lock(&ci->i_ceph_lock);
1866aaf67de7SXiubo Li 	cap = __get_cap_for_mds(ci, mds);
1867aaf67de7SXiubo Li 	if (cap) {
18682f2dc053SSage Weil 		dout(" removing cap %p, ci is %p, inode is %p\n",
1869874c8ca1SDavid Howells 		     cap, ci, &ci->netfs.inode);
1870aaf67de7SXiubo Li 
187136e6da98SJeff Layton 		iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
1872aaf67de7SXiubo Li 	}
1873be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
187477310320SYan, Zheng 
1875aaf67de7SXiubo Li 	if (cap)
187677310320SYan, Zheng 		wake_up_all(&ci->i_cap_wq);
18776c93df5dSYan, Zheng 	if (invalidate)
18786c93df5dSYan, Zheng 		ceph_queue_invalidate(inode);
187936e6da98SJeff Layton 	while (iputs--)
1880a6d37ccdSXiubo Li 		iput(inode);
18812f2dc053SSage Weil 	return 0;
18822f2dc053SSage Weil }
18832f2dc053SSage Weil 
18842f2dc053SSage Weil /*
18852f2dc053SSage Weil  * caller must hold session s_mutex
18862f2dc053SSage Weil  */
remove_session_caps(struct ceph_mds_session * session)18872f2dc053SSage Weil static void remove_session_caps(struct ceph_mds_session *session)
18882f2dc053SSage Weil {
18896c93df5dSYan, Zheng 	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
18906c93df5dSYan, Zheng 	struct super_block *sb = fsc->sb;
1891c8a96a31SJeff Layton 	LIST_HEAD(dispose);
1892c8a96a31SJeff Layton 
18932f2dc053SSage Weil 	dout("remove_session_caps on %p\n", session);
1894f5d77269SJeff Layton 	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
18956f60f889SYan, Zheng 
1896c8799fc4SYan, Zheng 	wake_up_all(&fsc->mdsc->cap_flushing_wq);
1897c8799fc4SYan, Zheng 
18986f60f889SYan, Zheng 	spin_lock(&session->s_cap_lock);
18996f60f889SYan, Zheng 	if (session->s_nr_caps > 0) {
19006f60f889SYan, Zheng 		struct inode *inode;
19016f60f889SYan, Zheng 		struct ceph_cap *cap, *prev = NULL;
19026f60f889SYan, Zheng 		struct ceph_vino vino;
19036f60f889SYan, Zheng 		/*
19046f60f889SYan, Zheng 		 * iterate_session_caps() skips inodes that are being
19056f60f889SYan, Zheng 		 * deleted, we need to wait until deletions are complete.
19066f60f889SYan, Zheng 		 * __wait_on_freeing_inode() is designed for the job,
19076f60f889SYan, Zheng 		 * but it is not exported, so use lookup inode function
19086f60f889SYan, Zheng 		 * to access it.
19096f60f889SYan, Zheng 		 */
19106f60f889SYan, Zheng 		while (!list_empty(&session->s_caps)) {
19116f60f889SYan, Zheng 			cap = list_entry(session->s_caps.next,
19126f60f889SYan, Zheng 					 struct ceph_cap, session_caps);
19136f60f889SYan, Zheng 			if (cap == prev)
19146f60f889SYan, Zheng 				break;
19156f60f889SYan, Zheng 			prev = cap;
19166f60f889SYan, Zheng 			vino = cap->ci->i_vino;
19176f60f889SYan, Zheng 			spin_unlock(&session->s_cap_lock);
19186f60f889SYan, Zheng 
1919ed284c49SYan, Zheng 			inode = ceph_find_inode(sb, vino);
192023c2c76eSJeff Layton 			iput(inode);
19216f60f889SYan, Zheng 
19226f60f889SYan, Zheng 			spin_lock(&session->s_cap_lock);
19236f60f889SYan, Zheng 		}
19246f60f889SYan, Zheng 	}
1925745a8e3bSYan, Zheng 
1926745a8e3bSYan, Zheng 	// drop cap expires and unlock s_cap_lock
1927c8a96a31SJeff Layton 	detach_cap_releases(session, &dispose);
19286f60f889SYan, Zheng 
19292f2dc053SSage Weil 	BUG_ON(session->s_nr_caps > 0);
19306c99f254SSage Weil 	BUG_ON(!list_empty(&session->s_cap_flushing));
1931c8a96a31SJeff Layton 	spin_unlock(&session->s_cap_lock);
1932c8a96a31SJeff Layton 	dispose_cap_releases(session->s_mdsc, &dispose);
19332f2dc053SSage Weil }
19342f2dc053SSage Weil 
/* Events handed to wake_up_session_caps() / wake_up_session_cb(). */
enum {
	RECONNECT,	/* reset the inode's wanted/requested max_size */
	RENEWCAPS,	/* reduce caps the MDS did not re-issue to CEPH_CAP_PIN */
	FORCE_RO,	/* no per-inode work, waiters are just woken */
};
1940d2f8bb27SYan, Zheng 
19412f2dc053SSage Weil /*
19422f2dc053SSage Weil  * wake up any threads waiting on this session's caps.  if the cap is
19432f2dc053SSage Weil  * old (didn't get renewed on the client reconnect), remove it now.
19442f2dc053SSage Weil  *
19452f2dc053SSage Weil  * caller must hold s_mutex.
19462f2dc053SSage Weil  */
wake_up_session_cb(struct inode * inode,int mds,void * arg)1947aaf67de7SXiubo Li static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
19482f2dc053SSage Weil {
19490dc2570fSSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
1950d2f8bb27SYan, Zheng 	unsigned long ev = (unsigned long)arg;
19510dc2570fSSage Weil 
1952d2f8bb27SYan, Zheng 	if (ev == RECONNECT) {
1953be655596SSage Weil 		spin_lock(&ci->i_ceph_lock);
19540dc2570fSSage Weil 		ci->i_wanted_max_size = 0;
19550dc2570fSSage Weil 		ci->i_requested_max_size = 0;
1956be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
1957d2f8bb27SYan, Zheng 	} else if (ev == RENEWCAPS) {
1958aaf67de7SXiubo Li 		struct ceph_cap *cap;
1959aaf67de7SXiubo Li 
1960d2f8bb27SYan, Zheng 		spin_lock(&ci->i_ceph_lock);
1961aaf67de7SXiubo Li 		cap = __get_cap_for_mds(ci, mds);
1962aaf67de7SXiubo Li 		/* mds did not re-issue stale cap */
1963aaf67de7SXiubo Li 		if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
1964d2f8bb27SYan, Zheng 			cap->issued = cap->implemented = CEPH_CAP_PIN;
1965d2f8bb27SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
1966d2f8bb27SYan, Zheng 	} else if (ev == FORCE_RO) {
19670dc2570fSSage Weil 	}
1968e5360309SYan, Zheng 	wake_up_all(&ci->i_cap_wq);
19692f2dc053SSage Weil 	return 0;
19702f2dc053SSage Weil }
19712f2dc053SSage Weil 
wake_up_session_caps(struct ceph_mds_session * session,int ev)1972d2f8bb27SYan, Zheng static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
19732f2dc053SSage Weil {
19742f2dc053SSage Weil 	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1975f5d77269SJeff Layton 	ceph_iterate_session_caps(session, wake_up_session_cb,
1976d2f8bb27SYan, Zheng 				  (void *)(unsigned long)ev);
19772f2dc053SSage Weil }
19782f2dc053SSage Weil 
19792f2dc053SSage Weil /*
19802f2dc053SSage Weil  * Send periodic message to MDS renewing all currently held caps.  The
19812f2dc053SSage Weil  * ack will reset the expiration for all caps from this session.
19822f2dc053SSage Weil  *
19832f2dc053SSage Weil  * caller holds s_mutex
19842f2dc053SSage Weil  */
send_renew_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)19852f2dc053SSage Weil static int send_renew_caps(struct ceph_mds_client *mdsc,
19862f2dc053SSage Weil 			   struct ceph_mds_session *session)
19872f2dc053SSage Weil {
19882f2dc053SSage Weil 	struct ceph_msg *msg;
19892f2dc053SSage Weil 	int state;
19902f2dc053SSage Weil 
19912f2dc053SSage Weil 	if (time_after_eq(jiffies, session->s_cap_ttl) &&
19922f2dc053SSage Weil 	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
19932f2dc053SSage Weil 		pr_info("mds%d caps stale\n", session->s_mds);
1994e4cb4cb8SSage Weil 	session->s_renew_requested = jiffies;
19952f2dc053SSage Weil 
19962f2dc053SSage Weil 	/* do not try to renew caps until a recovering mds has reconnected
19972f2dc053SSage Weil 	 * with its clients. */
19982f2dc053SSage Weil 	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
19992f2dc053SSage Weil 	if (state < CEPH_MDS_STATE_RECONNECT) {
20002f2dc053SSage Weil 		dout("send_renew_caps ignoring mds%d (%s)\n",
20012f2dc053SSage Weil 		     session->s_mds, ceph_mds_state_name(state));
20022f2dc053SSage Weil 		return 0;
20032f2dc053SSage Weil 	}
20042f2dc053SSage Weil 
20052f2dc053SSage Weil 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
20062f2dc053SSage Weil 		ceph_mds_state_name(state));
2007fba97e80SXiubo Li 	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
20082f2dc053SSage Weil 				      ++session->s_renew_seq);
2009a79832f2SSage Weil 	if (!msg)
2010a79832f2SSage Weil 		return -ENOMEM;
20112f2dc053SSage Weil 	ceph_con_send(&session->s_con, msg);
20122f2dc053SSage Weil 	return 0;
20132f2dc053SSage Weil }
20142f2dc053SSage Weil 
/*
 * Send a CEPH_SESSION_FLUSHMSG_ACK carrying @seq to the MDS behind
 * @session.
 *
 * @mdsc is not used by this function.
 *
 * Returns 0 on success, -ENOMEM if the message could not be allocated.
 */
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session, u64 seq)
{
	struct ceph_msg *msg;

	/* seq is a u64, so it is printed with the unsigned specifier */
	dout("send_flushmsg_ack to mds%d (%s) seq %llu\n",
	     session->s_mds, ceph_session_state_name(session->s_state), seq);
	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}
2028186e4f7aSYan, Zheng 
2029186e4f7aSYan, Zheng 
20302f2dc053SSage Weil /*
20312f2dc053SSage Weil  * Note new cap ttl, and any transition from stale -> not stale (fresh?).
20320dc2570fSSage Weil  *
20330dc2570fSSage Weil  * Called under session->s_mutex
20342f2dc053SSage Weil  */
renewed_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,int is_renew)20352f2dc053SSage Weil static void renewed_caps(struct ceph_mds_client *mdsc,
20362f2dc053SSage Weil 			 struct ceph_mds_session *session, int is_renew)
20372f2dc053SSage Weil {
20382f2dc053SSage Weil 	int was_stale;
20392f2dc053SSage Weil 	int wake = 0;
20402f2dc053SSage Weil 
20412f2dc053SSage Weil 	spin_lock(&session->s_cap_lock);
20421ce208a6SAlex Elder 	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
20432f2dc053SSage Weil 
20442f2dc053SSage Weil 	session->s_cap_ttl = session->s_renew_requested +
20452f2dc053SSage Weil 		mdsc->mdsmap->m_session_timeout*HZ;
20462f2dc053SSage Weil 
20472f2dc053SSage Weil 	if (was_stale) {
20482f2dc053SSage Weil 		if (time_before(jiffies, session->s_cap_ttl)) {
20492f2dc053SSage Weil 			pr_info("mds%d caps renewed\n", session->s_mds);
20502f2dc053SSage Weil 			wake = 1;
20512f2dc053SSage Weil 		} else {
20522f2dc053SSage Weil 			pr_info("mds%d caps still stale\n", session->s_mds);
20532f2dc053SSage Weil 		}
20542f2dc053SSage Weil 	}
20552f2dc053SSage Weil 	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
20562f2dc053SSage Weil 	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
20572f2dc053SSage Weil 	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
20582f2dc053SSage Weil 	spin_unlock(&session->s_cap_lock);
20592f2dc053SSage Weil 
20602f2dc053SSage Weil 	if (wake)
2061d2f8bb27SYan, Zheng 		wake_up_session_caps(session, RENEWCAPS);
20622f2dc053SSage Weil }
20632f2dc053SSage Weil 
20642f2dc053SSage Weil /*
20652f2dc053SSage Weil  * send a session close request
20662f2dc053SSage Weil  */
request_close_session(struct ceph_mds_session * session)20673e699bd8SXiubo Li static int request_close_session(struct ceph_mds_session *session)
20682f2dc053SSage Weil {
20692f2dc053SSage Weil 	struct ceph_msg *msg;
20702f2dc053SSage Weil 
20712f2dc053SSage Weil 	dout("request_close_session mds%d state %s seq %lld\n",
2072a687ecafSJohn Spray 	     session->s_mds, ceph_session_state_name(session->s_state),
20732f2dc053SSage Weil 	     session->s_seq);
2074fba97e80SXiubo Li 	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
2075fba97e80SXiubo Li 				      session->s_seq);
2076a79832f2SSage Weil 	if (!msg)
2077a79832f2SSage Weil 		return -ENOMEM;
20782f2dc053SSage Weil 	ceph_con_send(&session->s_con, msg);
2079fcff415cSYan, Zheng 	return 1;
20802f2dc053SSage Weil }
20812f2dc053SSage Weil 
20822f2dc053SSage Weil /*
20832f2dc053SSage Weil  * Called with s_mutex held.
20842f2dc053SSage Weil  */
__close_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)20852f2dc053SSage Weil static int __close_session(struct ceph_mds_client *mdsc,
20862f2dc053SSage Weil 			 struct ceph_mds_session *session)
20872f2dc053SSage Weil {
20882f2dc053SSage Weil 	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
20892f2dc053SSage Weil 		return 0;
20902f2dc053SSage Weil 	session->s_state = CEPH_MDS_SESSION_CLOSING;
20913e699bd8SXiubo Li 	return request_close_session(session);
20922f2dc053SSage Weil }
20932f2dc053SSage Weil 
drop_negative_children(struct dentry * dentry)2094040d7860SYan, Zheng static bool drop_negative_children(struct dentry *dentry)
2095040d7860SYan, Zheng {
2096040d7860SYan, Zheng 	struct dentry *child;
2097040d7860SYan, Zheng 	bool all_negative = true;
2098040d7860SYan, Zheng 
2099040d7860SYan, Zheng 	if (!d_is_dir(dentry))
2100040d7860SYan, Zheng 		goto out;
2101040d7860SYan, Zheng 
2102040d7860SYan, Zheng 	spin_lock(&dentry->d_lock);
2103040d7860SYan, Zheng 	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
2104040d7860SYan, Zheng 		if (d_really_is_positive(child)) {
2105040d7860SYan, Zheng 			all_negative = false;
2106040d7860SYan, Zheng 			break;
2107040d7860SYan, Zheng 		}
2108040d7860SYan, Zheng 	}
2109040d7860SYan, Zheng 	spin_unlock(&dentry->d_lock);
2110040d7860SYan, Zheng 
2111040d7860SYan, Zheng 	if (all_negative)
2112040d7860SYan, Zheng 		shrink_dcache_parent(dentry);
2113040d7860SYan, Zheng out:
2114040d7860SYan, Zheng 	return all_negative;
2115040d7860SYan, Zheng }
2116040d7860SYan, Zheng 
21172f2dc053SSage Weil /*
21182f2dc053SSage Weil  * Trim old(er) caps.
21192f2dc053SSage Weil  *
21202f2dc053SSage Weil  * Because we can't cache an inode without one or more caps, we do
21212f2dc053SSage Weil  * this indirectly: if a cap is unused, we prune its aliases, at which
21222f2dc053SSage Weil  * point the inode will hopefully get dropped to.
21232f2dc053SSage Weil  *
21242f2dc053SSage Weil  * Yes, this is a bit sloppy.  Our only real goal here is to respond to
21252f2dc053SSage Weil  * memory pressure from the MDS, though, so it needn't be perfect.
21262f2dc053SSage Weil  */
trim_caps_cb(struct inode * inode,int mds,void * arg)2127aaf67de7SXiubo Li static int trim_caps_cb(struct inode *inode, int mds, void *arg)
21282f2dc053SSage Weil {
21292e2023e9SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
2130533a2818SJeff Layton 	int *remaining = arg;
21312f2dc053SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
2132979abfddSYan, Zheng 	int used, wanted, oissued, mine;
2133aaf67de7SXiubo Li 	struct ceph_cap *cap;
21342f2dc053SSage Weil 
2135533a2818SJeff Layton 	if (*remaining <= 0)
21362f2dc053SSage Weil 		return -1;
21372f2dc053SSage Weil 
2138be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
2139aaf67de7SXiubo Li 	cap = __get_cap_for_mds(ci, mds);
2140aaf67de7SXiubo Li 	if (!cap) {
2141aaf67de7SXiubo Li 		spin_unlock(&ci->i_ceph_lock);
2142aaf67de7SXiubo Li 		return 0;
2143aaf67de7SXiubo Li 	}
21442f2dc053SSage Weil 	mine = cap->issued | cap->implemented;
21452f2dc053SSage Weil 	used = __ceph_caps_used(ci);
2146979abfddSYan, Zheng 	wanted = __ceph_caps_file_wanted(ci);
21472f2dc053SSage Weil 	oissued = __ceph_caps_issued_other(ci, cap);
21482f2dc053SSage Weil 
2149979abfddSYan, Zheng 	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
21502f2dc053SSage Weil 	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
2151979abfddSYan, Zheng 	     ceph_cap_string(used), ceph_cap_string(wanted));
2152979abfddSYan, Zheng 	if (cap == ci->i_auth_cap) {
2153622f3e25SYan, Zheng 		if (ci->i_dirty_caps || ci->i_flushing_caps ||
2154622f3e25SYan, Zheng 		    !list_empty(&ci->i_cap_snaps))
2155979abfddSYan, Zheng 			goto out;
2156979abfddSYan, Zheng 		if ((used | wanted) & CEPH_CAP_ANY_WR)
2157979abfddSYan, Zheng 			goto out;
215889aa5930SYan, Zheng 		/* Note: it's possible that i_filelock_ref becomes non-zero
215989aa5930SYan, Zheng 		 * after dropping auth caps. It doesn't hurt because reply
216089aa5930SYan, Zheng 		 * of lock mds request will re-add auth caps. */
216189aa5930SYan, Zheng 		if (atomic_read(&ci->i_filelock_ref) > 0)
216289aa5930SYan, Zheng 			goto out;
2163979abfddSYan, Zheng 	}
21645e804ac4SYan, Zheng 	/* The inode has cached pages, but it's no longer used.
21655e804ac4SYan, Zheng 	 * we can safely drop it */
2166525d15e8SYan, Zheng 	if (S_ISREG(inode->i_mode) &&
2167525d15e8SYan, Zheng 	    wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
21685e804ac4SYan, Zheng 	    !(oissued & CEPH_CAP_FILE_CACHE)) {
21695e804ac4SYan, Zheng 	  used = 0;
21705e804ac4SYan, Zheng 	  oissued = 0;
21715e804ac4SYan, Zheng 	}
2172979abfddSYan, Zheng 	if ((used | wanted) & ~oissued & mine)
21732f2dc053SSage Weil 		goto out;   /* we need these caps */
21742f2dc053SSage Weil 
21752f2dc053SSage Weil 	if (oissued) {
21762f2dc053SSage Weil 		/* we aren't the only cap.. just remove us */
21772e2023e9SXiubo Li 		ceph_remove_cap(mdsc, cap, true);
2178533a2818SJeff Layton 		(*remaining)--;
21792f2dc053SSage Weil 	} else {
2180040d7860SYan, Zheng 		struct dentry *dentry;
21815e804ac4SYan, Zheng 		/* try dropping referring dentries */
2182be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
2183040d7860SYan, Zheng 		dentry = d_find_any_alias(inode);
2184040d7860SYan, Zheng 		if (dentry && drop_negative_children(dentry)) {
2185040d7860SYan, Zheng 			int count;
2186040d7860SYan, Zheng 			dput(dentry);
21872f2dc053SSage Weil 			d_prune_aliases(inode);
2188040d7860SYan, Zheng 			count = atomic_read(&inode->i_count);
2189040d7860SYan, Zheng 			if (count == 1)
2190533a2818SJeff Layton 				(*remaining)--;
21912f2dc053SSage Weil 			dout("trim_caps_cb %p cap %p pruned, count now %d\n",
2192040d7860SYan, Zheng 			     inode, cap, count);
2193040d7860SYan, Zheng 		} else {
2194040d7860SYan, Zheng 			dput(dentry);
2195040d7860SYan, Zheng 		}
21962f2dc053SSage Weil 		return 0;
21972f2dc053SSage Weil 	}
21982f2dc053SSage Weil 
21992f2dc053SSage Weil out:
2200be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
22012f2dc053SSage Weil 	return 0;
22022f2dc053SSage Weil }
22032f2dc053SSage Weil 
22042f2dc053SSage Weil /*
22052f2dc053SSage Weil  * Trim session cap count down to some max number.
22062f2dc053SSage Weil  */
ceph_trim_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,int max_caps)2207e30ee581SZhi Zhang int ceph_trim_caps(struct ceph_mds_client *mdsc,
22082f2dc053SSage Weil 		   struct ceph_mds_session *session,
22092f2dc053SSage Weil 		   int max_caps)
22102f2dc053SSage Weil {
22112f2dc053SSage Weil 	int trim_caps = session->s_nr_caps - max_caps;
22122f2dc053SSage Weil 
22132f2dc053SSage Weil 	dout("trim_caps mds%d start: %d / %d, trim %d\n",
22142f2dc053SSage Weil 	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
22152f2dc053SSage Weil 	if (trim_caps > 0) {
2216533a2818SJeff Layton 		int remaining = trim_caps;
2217533a2818SJeff Layton 
2218533a2818SJeff Layton 		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
22192f2dc053SSage Weil 		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
22202f2dc053SSage Weil 		     session->s_mds, session->s_nr_caps, max_caps,
2221533a2818SJeff Layton 			trim_caps - remaining);
22222f2dc053SSage Weil 	}
2223a56371d9SYan, Zheng 
2224e3ec8d68SYan, Zheng 	ceph_flush_cap_releases(mdsc, session);
22252f2dc053SSage Weil 	return 0;
22262f2dc053SSage Weil }
22272f2dc053SSage Weil 
/*
 * Check whether cap flushes up to and including @want_flush_tid are done.
 *
 * Only the first entry of mdsc->cap_flush_list is examined (presumably the
 * list is kept in tid order -- confirm where entries are added).
 *
 * Returns 1 if the list is empty or its first entry has a tid above
 * @want_flush_tid, 0 if that entry is still at or below it.
 */
static int check_caps_flush(struct ceph_mds_client *mdsc,
			    u64 want_flush_tid)
{
	struct ceph_cap_flush *oldest;
	int flushed = 1;

	spin_lock(&mdsc->cap_dirty_lock);
	if (list_empty(&mdsc->cap_flush_list))
		goto unlock;

	oldest = list_first_entry(&mdsc->cap_flush_list,
				  struct ceph_cap_flush, g_list);
	if (oldest->tid <= want_flush_tid) {
		dout("check_caps_flush still flushing tid %llu <= %llu\n",
		     oldest->tid, want_flush_tid);
		flushed = 0;
	}
unlock:
	spin_unlock(&mdsc->cap_dirty_lock);
	return flushed;
}
2247d3383a8eSYan, Zheng 
22482f2dc053SSage Weil /*
22492f2dc053SSage Weil  * flush all dirty inode data to disk.
22502f2dc053SSage Weil  *
22518310b089SYan, Zheng  * returns true if we've flushed through want_flush_tid
22522f2dc053SSage Weil  */
wait_caps_flush(struct ceph_mds_client * mdsc,u64 want_flush_tid)2253affbc19aSYan, Zheng static void wait_caps_flush(struct ceph_mds_client *mdsc,
22540e294387SYan, Zheng 			    u64 want_flush_tid)
22552f2dc053SSage Weil {
22560e294387SYan, Zheng 	dout("check_caps_flush want %llu\n", want_flush_tid);
22578310b089SYan, Zheng 
22588310b089SYan, Zheng 	wait_event(mdsc->cap_flushing_wq,
22598310b089SYan, Zheng 		   check_caps_flush(mdsc, want_flush_tid));
22608310b089SYan, Zheng 
22618310b089SYan, Zheng 	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
22622f2dc053SSage Weil }
22632f2dc053SSage Weil 
22642f2dc053SSage Weil /*
22652f2dc053SSage Weil  * called under s_mutex
22662f2dc053SSage Weil  */
ceph_send_cap_releases(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2267e3ec8d68SYan, Zheng static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
22682f2dc053SSage Weil 				   struct ceph_mds_session *session)
22692f2dc053SSage Weil {
2270745a8e3bSYan, Zheng 	struct ceph_msg *msg = NULL;
2271745a8e3bSYan, Zheng 	struct ceph_mds_cap_release *head;
2272745a8e3bSYan, Zheng 	struct ceph_mds_cap_item *item;
227392475f05SJeff Layton 	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
2274745a8e3bSYan, Zheng 	struct ceph_cap *cap;
2275745a8e3bSYan, Zheng 	LIST_HEAD(tmp_list);
2276745a8e3bSYan, Zheng 	int num_cap_releases;
227792475f05SJeff Layton 	__le32	barrier, *cap_barrier;
227892475f05SJeff Layton 
227992475f05SJeff Layton 	down_read(&osdc->lock);
228092475f05SJeff Layton 	barrier = cpu_to_le32(osdc->epoch_barrier);
228192475f05SJeff Layton 	up_read(&osdc->lock);
22822f2dc053SSage Weil 
22832f2dc053SSage Weil 	spin_lock(&session->s_cap_lock);
2284745a8e3bSYan, Zheng again:
2285745a8e3bSYan, Zheng 	list_splice_init(&session->s_cap_releases, &tmp_list);
2286745a8e3bSYan, Zheng 	num_cap_releases = session->s_num_cap_releases;
2287745a8e3bSYan, Zheng 	session->s_num_cap_releases = 0;
22882f2dc053SSage Weil 	spin_unlock(&session->s_cap_lock);
2289745a8e3bSYan, Zheng 
2290745a8e3bSYan, Zheng 	while (!list_empty(&tmp_list)) {
2291745a8e3bSYan, Zheng 		if (!msg) {
2292745a8e3bSYan, Zheng 			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
229309cbfeafSKirill A. Shutemov 					PAGE_SIZE, GFP_NOFS, false);
2294745a8e3bSYan, Zheng 			if (!msg)
2295745a8e3bSYan, Zheng 				goto out_err;
2296745a8e3bSYan, Zheng 			head = msg->front.iov_base;
2297745a8e3bSYan, Zheng 			head->num = cpu_to_le32(0);
2298745a8e3bSYan, Zheng 			msg->front.iov_len = sizeof(*head);
229992475f05SJeff Layton 
230092475f05SJeff Layton 			msg->hdr.version = cpu_to_le16(2);
230192475f05SJeff Layton 			msg->hdr.compat_version = cpu_to_le16(1);
2302745a8e3bSYan, Zheng 		}
230392475f05SJeff Layton 
2304745a8e3bSYan, Zheng 		cap = list_first_entry(&tmp_list, struct ceph_cap,
2305745a8e3bSYan, Zheng 					session_caps);
2306745a8e3bSYan, Zheng 		list_del(&cap->session_caps);
2307745a8e3bSYan, Zheng 		num_cap_releases--;
2308745a8e3bSYan, Zheng 
2309745a8e3bSYan, Zheng 		head = msg->front.iov_base;
23104198aba4SJeff Layton 		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
23114198aba4SJeff Layton 				   &head->num);
2312745a8e3bSYan, Zheng 		item = msg->front.iov_base + msg->front.iov_len;
2313745a8e3bSYan, Zheng 		item->ino = cpu_to_le64(cap->cap_ino);
2314745a8e3bSYan, Zheng 		item->cap_id = cpu_to_le64(cap->cap_id);
2315745a8e3bSYan, Zheng 		item->migrate_seq = cpu_to_le32(cap->mseq);
2316745a8e3bSYan, Zheng 		item->seq = cpu_to_le32(cap->issue_seq);
2317745a8e3bSYan, Zheng 		msg->front.iov_len += sizeof(*item);
2318745a8e3bSYan, Zheng 
2319745a8e3bSYan, Zheng 		ceph_put_cap(mdsc, cap);
2320745a8e3bSYan, Zheng 
2321745a8e3bSYan, Zheng 		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
232292475f05SJeff Layton 			// Append cap_barrier field
232392475f05SJeff Layton 			cap_barrier = msg->front.iov_base + msg->front.iov_len;
232492475f05SJeff Layton 			*cap_barrier = barrier;
232592475f05SJeff Layton 			msg->front.iov_len += sizeof(*cap_barrier);
232692475f05SJeff Layton 
23272f2dc053SSage Weil 			msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
23282f2dc053SSage Weil 			dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
23292f2dc053SSage Weil 			ceph_con_send(&session->s_con, msg);
2330745a8e3bSYan, Zheng 			msg = NULL;
2331745a8e3bSYan, Zheng 		}
2332745a8e3bSYan, Zheng 	}
2333745a8e3bSYan, Zheng 
2334745a8e3bSYan, Zheng 	BUG_ON(num_cap_releases != 0);
2335745a8e3bSYan, Zheng 
23360f8605f2SSage Weil 	spin_lock(&session->s_cap_lock);
2337745a8e3bSYan, Zheng 	if (!list_empty(&session->s_cap_releases))
2338745a8e3bSYan, Zheng 		goto again;
23392f2dc053SSage Weil 	spin_unlock(&session->s_cap_lock);
2340745a8e3bSYan, Zheng 
2341745a8e3bSYan, Zheng 	if (msg) {
234292475f05SJeff Layton 		// Append cap_barrier field
234392475f05SJeff Layton 		cap_barrier = msg->front.iov_base + msg->front.iov_len;
234492475f05SJeff Layton 		*cap_barrier = barrier;
234592475f05SJeff Layton 		msg->front.iov_len += sizeof(*cap_barrier);
234692475f05SJeff Layton 
2347745a8e3bSYan, Zheng 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
2348745a8e3bSYan, Zheng 		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
2349745a8e3bSYan, Zheng 		ceph_con_send(&session->s_con, msg);
23502f2dc053SSage Weil 	}
2351745a8e3bSYan, Zheng 	return;
2352745a8e3bSYan, Zheng out_err:
2353745a8e3bSYan, Zheng 	pr_err("send_cap_releases mds%d, failed to allocate message\n",
2354745a8e3bSYan, Zheng 		session->s_mds);
2355745a8e3bSYan, Zheng 	spin_lock(&session->s_cap_lock);
2356745a8e3bSYan, Zheng 	list_splice(&tmp_list, &session->s_cap_releases);
2357745a8e3bSYan, Zheng 	session->s_num_cap_releases += num_cap_releases;
2358745a8e3bSYan, Zheng 	spin_unlock(&session->s_cap_lock);
2359e01a5946SSage Weil }
2360e01a5946SSage Weil 
ceph_cap_release_work(struct work_struct * work)2361e3ec8d68SYan, Zheng static void ceph_cap_release_work(struct work_struct *work)
2362e3ec8d68SYan, Zheng {
2363e3ec8d68SYan, Zheng 	struct ceph_mds_session *session =
2364e3ec8d68SYan, Zheng 		container_of(work, struct ceph_mds_session, s_cap_release_work);
2365e3ec8d68SYan, Zheng 
2366e3ec8d68SYan, Zheng 	mutex_lock(&session->s_mutex);
2367e3ec8d68SYan, Zheng 	if (session->s_state == CEPH_MDS_SESSION_OPEN ||
2368e3ec8d68SYan, Zheng 	    session->s_state == CEPH_MDS_SESSION_HUNG)
2369e3ec8d68SYan, Zheng 		ceph_send_cap_releases(session->s_mdsc, session);
2370e3ec8d68SYan, Zheng 	mutex_unlock(&session->s_mutex);
2371e3ec8d68SYan, Zheng 	ceph_put_mds_session(session);
2372e3ec8d68SYan, Zheng }
2373e3ec8d68SYan, Zheng 
ceph_flush_cap_releases(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2374e3ec8d68SYan, Zheng void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
2375e3ec8d68SYan, Zheng 		             struct ceph_mds_session *session)
2376e3ec8d68SYan, Zheng {
2377e3ec8d68SYan, Zheng 	if (mdsc->stopping)
2378e3ec8d68SYan, Zheng 		return;
2379e3ec8d68SYan, Zheng 
23805b3248c6SXiubo Li 	ceph_get_mds_session(session);
2381e3ec8d68SYan, Zheng 	if (queue_work(mdsc->fsc->cap_wq,
2382e3ec8d68SYan, Zheng 		       &session->s_cap_release_work)) {
2383e3ec8d68SYan, Zheng 		dout("cap release work queued\n");
2384e3ec8d68SYan, Zheng 	} else {
2385e3ec8d68SYan, Zheng 		ceph_put_mds_session(session);
2386e3ec8d68SYan, Zheng 		dout("failed to queue cap release work\n");
2387e3ec8d68SYan, Zheng 	}
2388e3ec8d68SYan, Zheng }
2389e3ec8d68SYan, Zheng 
2390e3ec8d68SYan, Zheng /*
2391e3ec8d68SYan, Zheng  * caller holds session->s_cap_lock
2392e3ec8d68SYan, Zheng  */
__ceph_queue_cap_release(struct ceph_mds_session * session,struct ceph_cap * cap)2393e3ec8d68SYan, Zheng void __ceph_queue_cap_release(struct ceph_mds_session *session,
2394e3ec8d68SYan, Zheng 			      struct ceph_cap *cap)
2395e3ec8d68SYan, Zheng {
2396e3ec8d68SYan, Zheng 	list_add_tail(&cap->session_caps, &session->s_cap_releases);
2397e3ec8d68SYan, Zheng 	session->s_num_cap_releases++;
2398e3ec8d68SYan, Zheng 
2399e3ec8d68SYan, Zheng 	if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
2400e3ec8d68SYan, Zheng 		ceph_flush_cap_releases(session->s_mdsc, session);
2401e3ec8d68SYan, Zheng }
2402e3ec8d68SYan, Zheng 
ceph_cap_reclaim_work(struct work_struct * work)240337c4efc1SYan, Zheng static void ceph_cap_reclaim_work(struct work_struct *work)
240437c4efc1SYan, Zheng {
240537c4efc1SYan, Zheng 	struct ceph_mds_client *mdsc =
240637c4efc1SYan, Zheng 		container_of(work, struct ceph_mds_client, cap_reclaim_work);
240737c4efc1SYan, Zheng 	int ret = ceph_trim_dentries(mdsc);
240837c4efc1SYan, Zheng 	if (ret == -EAGAIN)
240937c4efc1SYan, Zheng 		ceph_queue_cap_reclaim_work(mdsc);
241037c4efc1SYan, Zheng }
241137c4efc1SYan, Zheng 
ceph_queue_cap_reclaim_work(struct ceph_mds_client * mdsc)241237c4efc1SYan, Zheng void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
241337c4efc1SYan, Zheng {
241437c4efc1SYan, Zheng 	if (mdsc->stopping)
241537c4efc1SYan, Zheng 		return;
241637c4efc1SYan, Zheng 
241737c4efc1SYan, Zheng         if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
241837c4efc1SYan, Zheng                 dout("caps reclaim work queued\n");
241937c4efc1SYan, Zheng         } else {
242037c4efc1SYan, Zheng                 dout("failed to queue caps release work\n");
242137c4efc1SYan, Zheng         }
242237c4efc1SYan, Zheng }
242337c4efc1SYan, Zheng 
ceph_reclaim_caps_nr(struct ceph_mds_client * mdsc,int nr)2424fe33032dSYan, Zheng void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
2425fe33032dSYan, Zheng {
2426fe33032dSYan, Zheng 	int val;
2427fe33032dSYan, Zheng 	if (!nr)
2428fe33032dSYan, Zheng 		return;
2429fe33032dSYan, Zheng 	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
2430bba1560bSXiubo Li 	if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
2431fe33032dSYan, Zheng 		atomic_set(&mdsc->cap_reclaim_pending, 0);
2432fe33032dSYan, Zheng 		ceph_queue_cap_reclaim_work(mdsc);
2433fe33032dSYan, Zheng 	}
2434fe33032dSYan, Zheng }
2435fe33032dSYan, Zheng 
24362f2dc053SSage Weil /*
24372f2dc053SSage Weil  * requests
24382f2dc053SSage Weil  */
24392f2dc053SSage Weil 
ceph_alloc_readdir_reply_buffer(struct ceph_mds_request * req,struct inode * dir)244054008399SYan, Zheng int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
244154008399SYan, Zheng 				    struct inode *dir)
244254008399SYan, Zheng {
244354008399SYan, Zheng 	struct ceph_inode_info *ci = ceph_inode(dir);
244454008399SYan, Zheng 	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
244554008399SYan, Zheng 	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
24462a5beea3SYan, Zheng 	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
2447ad8c28a9SJeff Layton 	unsigned int num_entries;
2448ad8c28a9SJeff Layton 	int order;
244954008399SYan, Zheng 
245054008399SYan, Zheng 	spin_lock(&ci->i_ceph_lock);
245154008399SYan, Zheng 	num_entries = ci->i_files + ci->i_subdirs;
245254008399SYan, Zheng 	spin_unlock(&ci->i_ceph_lock);
2453ad8c28a9SJeff Layton 	num_entries = max(num_entries, 1U);
245454008399SYan, Zheng 	num_entries = min(num_entries, opt->max_readdir);
245554008399SYan, Zheng 
245654008399SYan, Zheng 	order = get_order(size * num_entries);
245754008399SYan, Zheng 	while (order >= 0) {
24582a5beea3SYan, Zheng 		rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
24592941bf53SXiubo Li 							     __GFP_NOWARN |
24602941bf53SXiubo Li 							     __GFP_ZERO,
246154008399SYan, Zheng 							     order);
24622a5beea3SYan, Zheng 		if (rinfo->dir_entries)
246354008399SYan, Zheng 			break;
246454008399SYan, Zheng 		order--;
246554008399SYan, Zheng 	}
24662a5beea3SYan, Zheng 	if (!rinfo->dir_entries)
246754008399SYan, Zheng 		return -ENOMEM;
246854008399SYan, Zheng 
246954008399SYan, Zheng 	num_entries = (PAGE_SIZE << order) / size;
247054008399SYan, Zheng 	num_entries = min(num_entries, opt->max_readdir);
247154008399SYan, Zheng 
247254008399SYan, Zheng 	rinfo->dir_buf_size = PAGE_SIZE << order;
247354008399SYan, Zheng 	req->r_num_caps = num_entries + 1;
247454008399SYan, Zheng 	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
247554008399SYan, Zheng 	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
247654008399SYan, Zheng 	return 0;
247754008399SYan, Zheng }
247854008399SYan, Zheng 
24792f2dc053SSage Weil /*
24802f2dc053SSage Weil  * Create an mds request.
24812f2dc053SSage Weil  */
24822f2dc053SSage Weil struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client * mdsc,int op,int mode)24832f2dc053SSage Weil ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
24842f2dc053SSage Weil {
2485058daab7SJeff Layton 	struct ceph_mds_request *req;
24862f2dc053SSage Weil 
2487058daab7SJeff Layton 	req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
24882f2dc053SSage Weil 	if (!req)
24892f2dc053SSage Weil 		return ERR_PTR(-ENOMEM);
24902f2dc053SSage Weil 
2491b4556396SSage Weil 	mutex_init(&req->r_fill_mutex);
249237151668SYehuda Sadeh 	req->r_mdsc = mdsc;
24932f2dc053SSage Weil 	req->r_started = jiffies;
249470c94820SXiubo Li 	req->r_start_latency = ktime_get();
24952f2dc053SSage Weil 	req->r_resend_mds = -1;
24962f2dc053SSage Weil 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
249768cd5b4bSYan, Zheng 	INIT_LIST_HEAD(&req->r_unsafe_target_item);
24982f2dc053SSage Weil 	req->r_fmode = -1;
24996eb06c46SXiubo Li 	req->r_feature_needed = -1;
2500153c8e6bSSage Weil 	kref_init(&req->r_kref);
2501fcd00b68SIlya Dryomov 	RB_CLEAR_NODE(&req->r_node);
25022f2dc053SSage Weil 	INIT_LIST_HEAD(&req->r_wait);
25032f2dc053SSage Weil 	init_completion(&req->r_completion);
25042f2dc053SSage Weil 	init_completion(&req->r_safe_completion);
25052f2dc053SSage Weil 	INIT_LIST_HEAD(&req->r_unsafe_item);
25062f2dc053SSage Weil 
2507668c9a61SDeepa Dinamani 	ktime_get_coarse_real_ts64(&req->r_stamp);
2508b8e69066SSage Weil 
25092f2dc053SSage Weil 	req->r_op = op;
25102f2dc053SSage Weil 	req->r_direct_mode = mode;
25112f2dc053SSage Weil 	return req;
25122f2dc053SSage Weil }
25132f2dc053SSage Weil 
25142f2dc053SSage Weil /*
251544ca18f2SSage Weil  * return oldest (lowest) request, tid in request tree, 0 if none.
25162f2dc053SSage Weil  *
25172f2dc053SSage Weil  * called under mdsc->mutex.
25182f2dc053SSage Weil  */
__get_oldest_req(struct ceph_mds_client * mdsc)251944ca18f2SSage Weil static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
252044ca18f2SSage Weil {
252144ca18f2SSage Weil 	if (RB_EMPTY_ROOT(&mdsc->request_tree))
252244ca18f2SSage Weil 		return NULL;
252344ca18f2SSage Weil 	return rb_entry(rb_first(&mdsc->request_tree),
252444ca18f2SSage Weil 			struct ceph_mds_request, r_node);
252544ca18f2SSage Weil }
252644ca18f2SSage Weil 
__get_oldest_tid(struct ceph_mds_client * mdsc)2527e8a7b8b1SYan, Zheng static inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
25282f2dc053SSage Weil {
2529e8a7b8b1SYan, Zheng 	return mdsc->oldest_tid;
25302f2dc053SSage Weil }
25312f2dc053SSage Weil 
253224865e75SJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
get_fscrypt_altname(const struct ceph_mds_request * req,u32 * plen)253324865e75SJeff Layton static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
253424865e75SJeff Layton {
253524865e75SJeff Layton 	struct inode *dir = req->r_parent;
253624865e75SJeff Layton 	struct dentry *dentry = req->r_dentry;
253724865e75SJeff Layton 	u8 *cryptbuf = NULL;
253824865e75SJeff Layton 	u32 len = 0;
253924865e75SJeff Layton 	int ret = 0;
254024865e75SJeff Layton 
254124865e75SJeff Layton 	/* only encode if we have parent and dentry */
254224865e75SJeff Layton 	if (!dir || !dentry)
254324865e75SJeff Layton 		goto success;
254424865e75SJeff Layton 
254524865e75SJeff Layton 	/* No-op unless this is encrypted */
254624865e75SJeff Layton 	if (!IS_ENCRYPTED(dir))
254724865e75SJeff Layton 		goto success;
254824865e75SJeff Layton 
254914e034a6SLuís Henriques 	ret = ceph_fscrypt_prepare_readdir(dir);
255014e034a6SLuís Henriques 	if (ret < 0)
255124865e75SJeff Layton 		return ERR_PTR(ret);
255224865e75SJeff Layton 
255324865e75SJeff Layton 	/* No key? Just ignore it. */
255424865e75SJeff Layton 	if (!fscrypt_has_encryption_key(dir))
255524865e75SJeff Layton 		goto success;
255624865e75SJeff Layton 
255724865e75SJeff Layton 	if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX,
255824865e75SJeff Layton 					  &len)) {
255924865e75SJeff Layton 		WARN_ON_ONCE(1);
256024865e75SJeff Layton 		return ERR_PTR(-ENAMETOOLONG);
256124865e75SJeff Layton 	}
256224865e75SJeff Layton 
256324865e75SJeff Layton 	/* No need to append altname if name is short enough */
256424865e75SJeff Layton 	if (len <= CEPH_NOHASH_NAME_MAX) {
256524865e75SJeff Layton 		len = 0;
256624865e75SJeff Layton 		goto success;
256724865e75SJeff Layton 	}
256824865e75SJeff Layton 
256924865e75SJeff Layton 	cryptbuf = kmalloc(len, GFP_KERNEL);
257024865e75SJeff Layton 	if (!cryptbuf)
257124865e75SJeff Layton 		return ERR_PTR(-ENOMEM);
257224865e75SJeff Layton 
257324865e75SJeff Layton 	ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len);
257424865e75SJeff Layton 	if (ret) {
257524865e75SJeff Layton 		kfree(cryptbuf);
257624865e75SJeff Layton 		return ERR_PTR(ret);
257724865e75SJeff Layton 	}
257824865e75SJeff Layton success:
257924865e75SJeff Layton 	*plen = len;
258024865e75SJeff Layton 	return cryptbuf;
258124865e75SJeff Layton }
258224865e75SJeff Layton #else
get_fscrypt_altname(const struct ceph_mds_request * req,u32 * plen)258324865e75SJeff Layton static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
258424865e75SJeff Layton {
258524865e75SJeff Layton 	*plen = 0;
258624865e75SJeff Layton 	return NULL;
258724865e75SJeff Layton }
258824865e75SJeff Layton #endif
258924865e75SJeff Layton 
25903fd945a7SJeff Layton /**
25913fd945a7SJeff Layton  * ceph_mdsc_build_path - build a path string to a given dentry
25922e2023e9SXiubo Li  * @mdsc: mds client
25933fd945a7SJeff Layton  * @dentry: dentry to which path should be built
25943fd945a7SJeff Layton  * @plen: returned length of string
25953fd945a7SJeff Layton  * @pbase: returned base inode number
25963fd945a7SJeff Layton  * @for_wire: is this path going to be sent to the MDS?
25972f2dc053SSage Weil  *
25983fd945a7SJeff Layton  * Build a string that represents the path to the dentry. This is mostly called
25993fd945a7SJeff Layton  * for two different purposes:
26003fd945a7SJeff Layton  *
26013fd945a7SJeff Layton  * 1) we need to build a path string to send to the MDS (for_wire == true)
26023fd945a7SJeff Layton  * 2) we need a path string for local presentation (e.g. debugfs)
26033fd945a7SJeff Layton  *    (for_wire == false)
26043fd945a7SJeff Layton  *
26053fd945a7SJeff Layton  * The path is built in reverse, starting with the dentry. Walk back up toward
26063fd945a7SJeff Layton  * the root, building the path until the first non-snapped inode is reached
26073fd945a7SJeff Layton  * (for_wire) or the root inode is reached (!for_wire).
26082f2dc053SSage Weil  *
26092f2dc053SSage Weil  * Encode hidden .snap dirs as a double /, i.e.
26102f2dc053SSage Weil  *   foo/.snap/bar -> foo//bar
26112f2dc053SSage Weil  */
ceph_mdsc_build_path(struct ceph_mds_client * mdsc,struct dentry * dentry,int * plen,u64 * pbase,int for_wire)26122e2023e9SXiubo Li char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
26132e2023e9SXiubo Li 			   int *plen, u64 *pbase, int for_wire)
26142f2dc053SSage Weil {
26154c793d4cSJeff Layton 	struct dentry *cur;
26164c793d4cSJeff Layton 	struct inode *inode;
26172f2dc053SSage Weil 	char *path;
2618f77f21bbSJeff Layton 	int pos;
26191b71fe2eSAl Viro 	unsigned seq;
262069a10fb3SJeff Layton 	u64 base;
26212f2dc053SSage Weil 
2622d37b1d99SMarkus Elfring 	if (!dentry)
26232f2dc053SSage Weil 		return ERR_PTR(-EINVAL);
26242f2dc053SSage Weil 
2625f77f21bbSJeff Layton 	path = __getname();
2626d37b1d99SMarkus Elfring 	if (!path)
26272f2dc053SSage Weil 		return ERR_PTR(-ENOMEM);
2628f77f21bbSJeff Layton retry:
2629f77f21bbSJeff Layton 	pos = PATH_MAX - 1;
2630f77f21bbSJeff Layton 	path[pos] = '\0';
2631f77f21bbSJeff Layton 
2632f77f21bbSJeff Layton 	seq = read_seqbegin(&rename_lock);
26334c793d4cSJeff Layton 	cur = dget(dentry);
2634f77f21bbSJeff Layton 	for (;;) {
26353fd945a7SJeff Layton 		struct dentry *parent;
26362f2dc053SSage Weil 
26374c793d4cSJeff Layton 		spin_lock(&cur->d_lock);
26384c793d4cSJeff Layton 		inode = d_inode(cur);
26392f2dc053SSage Weil 		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
2640104648adSSage Weil 			dout("build_path path+%d: %p SNAPDIR\n",
26414c793d4cSJeff Layton 			     pos, cur);
26423fd945a7SJeff Layton 			spin_unlock(&cur->d_lock);
26433fd945a7SJeff Layton 			parent = dget_parent(cur);
26443fd945a7SJeff Layton 		} else if (for_wire && inode && dentry != cur &&
26452f2dc053SSage Weil 			   ceph_snap(inode) == CEPH_NOSNAP) {
26464c793d4cSJeff Layton 			spin_unlock(&cur->d_lock);
2647d6b8bd67SJeff Layton 			pos++; /* get rid of any prepended '/' */
26482f2dc053SSage Weil 			break;
26493fd945a7SJeff Layton 		} else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {
26504c793d4cSJeff Layton 			pos -= cur->d_name.len;
26511b71fe2eSAl Viro 			if (pos < 0) {
26524c793d4cSJeff Layton 				spin_unlock(&cur->d_lock);
26532f2dc053SSage Weil 				break;
26541b71fe2eSAl Viro 			}
26554c793d4cSJeff Layton 			memcpy(path + pos, cur->d_name.name, cur->d_name.len);
26563fd945a7SJeff Layton 			spin_unlock(&cur->d_lock);
26573fd945a7SJeff Layton 			parent = dget_parent(cur);
26583fd945a7SJeff Layton 		} else {
26593fd945a7SJeff Layton 			int len, ret;
26603fd945a7SJeff Layton 			char buf[NAME_MAX];
26613fd945a7SJeff Layton 
26623fd945a7SJeff Layton 			/*
26633fd945a7SJeff Layton 			 * Proactively copy name into buf, in case we need to
26643fd945a7SJeff Layton 			 * present it as-is.
26653fd945a7SJeff Layton 			 */
26663fd945a7SJeff Layton 			memcpy(buf, cur->d_name.name, cur->d_name.len);
26673fd945a7SJeff Layton 			len = cur->d_name.len;
26683fd945a7SJeff Layton 			spin_unlock(&cur->d_lock);
26693fd945a7SJeff Layton 			parent = dget_parent(cur);
26703fd945a7SJeff Layton 
267114e034a6SLuís Henriques 			ret = ceph_fscrypt_prepare_readdir(d_inode(parent));
26723fd945a7SJeff Layton 			if (ret < 0) {
26733fd945a7SJeff Layton 				dput(parent);
26743fd945a7SJeff Layton 				dput(cur);
26753fd945a7SJeff Layton 				return ERR_PTR(ret);
26762f2dc053SSage Weil 			}
26773fd945a7SJeff Layton 
26783fd945a7SJeff Layton 			if (fscrypt_has_encryption_key(d_inode(parent))) {
26793fd945a7SJeff Layton 				len = ceph_encode_encrypted_fname(d_inode(parent),
26803fd945a7SJeff Layton 								  cur, buf);
26813fd945a7SJeff Layton 				if (len < 0) {
26823fd945a7SJeff Layton 					dput(parent);
26833fd945a7SJeff Layton 					dput(cur);
26843fd945a7SJeff Layton 					return ERR_PTR(len);
26853fd945a7SJeff Layton 				}
26863fd945a7SJeff Layton 			}
26873fd945a7SJeff Layton 			pos -= len;
26883fd945a7SJeff Layton 			if (pos < 0) {
26893fd945a7SJeff Layton 				dput(parent);
26903fd945a7SJeff Layton 				break;
26913fd945a7SJeff Layton 			}
26923fd945a7SJeff Layton 			memcpy(path + pos, buf, len);
26933fd945a7SJeff Layton 		}
26943fd945a7SJeff Layton 		dput(cur);
26953fd945a7SJeff Layton 		cur = parent;
2696f77f21bbSJeff Layton 
2697f77f21bbSJeff Layton 		/* Are we at the root? */
26984c793d4cSJeff Layton 		if (IS_ROOT(cur))
2699f77f21bbSJeff Layton 			break;
2700f77f21bbSJeff Layton 
2701f77f21bbSJeff Layton 		/* Are we out of buffer? */
2702f77f21bbSJeff Layton 		if (--pos < 0)
2703f77f21bbSJeff Layton 			break;
2704f77f21bbSJeff Layton 
2705f77f21bbSJeff Layton 		path[pos] = '/';
27062f2dc053SSage Weil 	}
27074c793d4cSJeff Layton 	inode = d_inode(cur);
27084c793d4cSJeff Layton 	base = inode ? ceph_ino(inode) : 0;
27094c793d4cSJeff Layton 	dput(cur);
2710f5946bccSJeff Layton 
2711f5946bccSJeff Layton 	if (read_seqretry(&rename_lock, seq))
2712f5946bccSJeff Layton 		goto retry;
2713f5946bccSJeff Layton 
2714f5946bccSJeff Layton 	if (pos < 0) {
2715f5946bccSJeff Layton 		/*
2716f5946bccSJeff Layton 		 * A rename didn't occur, but somehow we didn't end up where
2717f5946bccSJeff Layton 		 * we thought we would. Throw a warning and try again.
2718f5946bccSJeff Layton 		 */
27193fd945a7SJeff Layton 		pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
27203fd945a7SJeff Layton 			pos);
27212f2dc053SSage Weil 		goto retry;
27222f2dc053SSage Weil 	}
27232f2dc053SSage Weil 
272469a10fb3SJeff Layton 	*pbase = base;
2725f77f21bbSJeff Layton 	*plen = PATH_MAX - 1 - pos;
2726104648adSSage Weil 	dout("build_path on %p %d built %llx '%.*s'\n",
2727f77f21bbSJeff Layton 	     dentry, d_count(dentry), base, *plen, path + pos);
2728f77f21bbSJeff Layton 	return path + pos;
27292f2dc053SSage Weil }
27302f2dc053SSage Weil 
/*
 * Work out the (ino, path) pair that names @dentry for a request.
 *
 * When the parent directory is known to be locked, is not a snapshot and is
 * not encrypted, the dentry's own name relative to the parent's ino is used
 * directly and nothing needs freeing.  Otherwise a full path is built with
 * ceph_mdsc_build_path() and *pfreepath is set so the caller releases it.
 *
 * @dir may be NULL, in which case the dentry's current parent inode is used.
 *
 * Returns 0 on success or a negative errno from ceph_mdsc_build_path().
 */
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
			     struct inode *dir, const char **ppath, int *ppathlen,
			     u64 *pino, bool *pfreepath, bool parent_locked)
{
	bool name_only;
	char *full;

	rcu_read_lock();
	if (!dir)
		dir = d_inode_rcu(dentry->d_parent);
	name_only = dir && parent_locked &&
		    ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir);
	if (name_only)
		*pino = ceph_ino(dir);
	rcu_read_unlock();

	if (name_only) {
		*ppath = dentry->d_name.name;
		*ppathlen = dentry->d_name.len;
		return 0;
	}

	full = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
	if (IS_ERR(full))
		return PTR_ERR(full);

	*ppath = full;
	*pfreepath = true;
	return 0;
}
27562f2dc053SSage Weil 
/*
 * Work out the (ino, path) pair that names @inode for a request.
 *
 * A non-snapshot inode is addressed by its ino alone (*ppathlen = 0, *ppath
 * untouched).  Otherwise a path is built from one of the inode's dentry
 * aliases and *pfreepath is set so the caller releases it.
 *
 * Returns 0 on success or a negative errno from ceph_mdsc_build_path().
 */
static int build_inode_path(struct inode *inode,
			    const char **ppath, int *ppathlen, u64 *pino,
			    bool *pfreepath)
{
	struct dentry *alias;
	char *built;

	if (ceph_snap(inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(inode);
		*ppathlen = 0;
		return 0;
	}

	alias = d_find_alias(inode);
	built = ceph_mdsc_build_path(ceph_sb_to_mdsc(inode->i_sb), alias,
				     ppathlen, pino, 1);
	dput(alias);
	if (IS_ERR(built))
		return PTR_ERR(built);

	*ppath = built;
	*pfreepath = true;
	return 0;
}
27792f2dc053SSage Weil 
27802f2dc053SSage Weil /*
27812f2dc053SSage Weil  * request arguments may be specified via an inode *, a dentry *, or
27822f2dc053SSage Weil  * an explicit ino+path.
27832f2dc053SSage Weil  */
set_request_path_attr(struct ceph_mds_client * mdsc,struct inode * rinode,struct dentry * rdentry,struct inode * rdiri,const char * rpath,u64 rino,const char ** ppath,int * pathlen,u64 * ino,bool * freepath,bool parent_locked)27842e2023e9SXiubo Li static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
27852e2023e9SXiubo Li 				 struct dentry *rdentry, struct inode *rdiri,
27862e2023e9SXiubo Li 				 const char *rpath, u64 rino, const char **ppath,
27872e2023e9SXiubo Li 				 int *pathlen, u64 *ino, bool *freepath,
27882e2023e9SXiubo Li 				 bool parent_locked)
27892f2dc053SSage Weil {
27902f2dc053SSage Weil 	int r = 0;
27912f2dc053SSage Weil 
27922f2dc053SSage Weil 	if (rinode) {
27932f2dc053SSage Weil 		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
27942f2dc053SSage Weil 		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
27952f2dc053SSage Weil 		     ceph_snap(rinode));
27962f2dc053SSage Weil 	} else if (rdentry) {
27972e2023e9SXiubo Li 		r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
27981bcb3440SJeff Layton 					freepath, parent_locked);
27992f2dc053SSage Weil 		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
28002f2dc053SSage Weil 		     *ppath);
2801795858dbSSage Weil 	} else if (rpath || rino) {
28022f2dc053SSage Weil 		*ino = rino;
28032f2dc053SSage Weil 		*ppath = rpath;
2804b000056aSDavid Zafman 		*pathlen = rpath ? strlen(rpath) : 0;
28052f2dc053SSage Weil 		dout(" path %.*s\n", *pathlen, rpath);
28062f2dc053SSage Weil 	}
28072f2dc053SSage Weil 
28082f2dc053SSage Weil 	return r;
28092f2dc053SSage Weil }
28102f2dc053SSage Weil 
encode_mclientrequest_tail(void ** p,const struct ceph_mds_request * req)28112d332d5bSJeff Layton static void encode_mclientrequest_tail(void **p,
281260267ba3SIlya Dryomov 				       const struct ceph_mds_request *req)
281360267ba3SIlya Dryomov {
281460267ba3SIlya Dryomov 	struct ceph_timespec ts;
281560267ba3SIlya Dryomov 	int i;
281660267ba3SIlya Dryomov 
281760267ba3SIlya Dryomov 	ceph_encode_timespec64(&ts, &req->r_stamp);
281860267ba3SIlya Dryomov 	ceph_encode_copy(p, &ts, sizeof(ts));
281960267ba3SIlya Dryomov 
282024865e75SJeff Layton 	/* v4: gid_list */
282160267ba3SIlya Dryomov 	ceph_encode_32(p, req->r_cred->group_info->ngroups);
282260267ba3SIlya Dryomov 	for (i = 0; i < req->r_cred->group_info->ngroups; i++)
282360267ba3SIlya Dryomov 		ceph_encode_64(p, from_kgid(&init_user_ns,
282460267ba3SIlya Dryomov 					    req->r_cred->group_info->gid[i]));
28252d332d5bSJeff Layton 
282624865e75SJeff Layton 	/* v5: altname */
282724865e75SJeff Layton 	ceph_encode_32(p, req->r_altname_len);
282824865e75SJeff Layton 	ceph_encode_copy(p, req->r_altname, req->r_altname_len);
28292d332d5bSJeff Layton 
28302d332d5bSJeff Layton 	/* v6: fscrypt_auth and fscrypt_file */
28312d332d5bSJeff Layton 	if (req->r_fscrypt_auth) {
28322d332d5bSJeff Layton 		u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
28332d332d5bSJeff Layton 
28342d332d5bSJeff Layton 		ceph_encode_32(p, authlen);
28352d332d5bSJeff Layton 		ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
28362d332d5bSJeff Layton 	} else {
28372d332d5bSJeff Layton 		ceph_encode_32(p, 0);
28382d332d5bSJeff Layton 	}
283916be62fcSJeff Layton 	if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {
284016be62fcSJeff Layton 		ceph_encode_32(p, sizeof(__le64));
284116be62fcSJeff Layton 		ceph_encode_64(p, req->r_fscrypt_file);
284216be62fcSJeff Layton 	} else {
284316be62fcSJeff Layton 		ceph_encode_32(p, 0);
284416be62fcSJeff Layton 	}
284560267ba3SIlya Dryomov }
284660267ba3SIlya Dryomov 
2847ce0d5bd3SXiubo Li static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void * p,u64 features)2848ce0d5bd3SXiubo Li find_legacy_request_head(void *p, u64 features)
2849ce0d5bd3SXiubo Li {
2850ce0d5bd3SXiubo Li 	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
2851ce0d5bd3SXiubo Li 	struct ceph_mds_request_head_old *ohead;
2852ce0d5bd3SXiubo Li 
2853ce0d5bd3SXiubo Li 	if (legacy)
2854ce0d5bd3SXiubo Li 		return (struct ceph_mds_request_head_legacy *)p;
2855ce0d5bd3SXiubo Li 	ohead = (struct ceph_mds_request_head_old *)p;
2856ce0d5bd3SXiubo Li 	return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
2857ce0d5bd3SXiubo Li }
2858ce0d5bd3SXiubo Li 
28592f2dc053SSage Weil /*
28602f2dc053SSage Weil  * called under mdsc->mutex
28612f2dc053SSage Weil  */
create_request_message(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)28624f1ddb1eSJeff Layton static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28632f2dc053SSage Weil 					       struct ceph_mds_request *req,
28644f1ddb1eSJeff Layton 					       bool drop_cap_releases)
28652f2dc053SSage Weil {
28664f1ddb1eSJeff Layton 	int mds = session->s_mds;
28674f1ddb1eSJeff Layton 	struct ceph_mds_client *mdsc = session->s_mdsc;
28682f2dc053SSage Weil 	struct ceph_msg *msg;
2869ce0d5bd3SXiubo Li 	struct ceph_mds_request_head_legacy *lhead;
28702f2dc053SSage Weil 	const char *path1 = NULL;
28712f2dc053SSage Weil 	const char *path2 = NULL;
28722f2dc053SSage Weil 	u64 ino1 = 0, ino2 = 0;
28732f2dc053SSage Weil 	int pathlen1 = 0, pathlen2 = 0;
28741bcb3440SJeff Layton 	bool freepath1 = false, freepath2 = false;
2875a5ffd7b6SXiubo Li 	struct dentry *old_dentry = NULL;
287660267ba3SIlya Dryomov 	int len;
28772f2dc053SSage Weil 	u16 releases;
28782f2dc053SSage Weil 	void *p, *end;
28792f2dc053SSage Weil 	int ret;
28804f1ddb1eSJeff Layton 	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
2881ce0d5bd3SXiubo Li 	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
2882ce0d5bd3SXiubo Li 				     &session->s_features);
28832f2dc053SSage Weil 
28842e2023e9SXiubo Li 	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
28853dd69aabSJeff Layton 			      req->r_parent, req->r_path1, req->r_ino1.ino,
28861bcb3440SJeff Layton 			      &path1, &pathlen1, &ino1, &freepath1,
28871bcb3440SJeff Layton 			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
28881bcb3440SJeff Layton 					&req->r_req_flags));
28892f2dc053SSage Weil 	if (ret < 0) {
28902f2dc053SSage Weil 		msg = ERR_PTR(ret);
28912f2dc053SSage Weil 		goto out;
28922f2dc053SSage Weil 	}
28932f2dc053SSage Weil 
28941bcb3440SJeff Layton 	/* If r_old_dentry is set, then assume that its parent is locked */
2895a5ffd7b6SXiubo Li 	if (req->r_old_dentry &&
2896a5ffd7b6SXiubo Li 	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
2897a5ffd7b6SXiubo Li 		old_dentry = req->r_old_dentry;
28982e2023e9SXiubo Li 	ret = set_request_path_attr(mdsc, NULL, old_dentry,
2899fd36a717SJeff Layton 			      req->r_old_dentry_dir,
29002f2dc053SSage Weil 			      req->r_path2, req->r_ino2.ino,
29011bcb3440SJeff Layton 			      &path2, &pathlen2, &ino2, &freepath2, true);
29022f2dc053SSage Weil 	if (ret < 0) {
29032f2dc053SSage Weil 		msg = ERR_PTR(ret);
29042f2dc053SSage Weil 		goto out_free1;
29052f2dc053SSage Weil 	}
29062f2dc053SSage Weil 
290724865e75SJeff Layton 	req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
290824865e75SJeff Layton 	if (IS_ERR(req->r_altname)) {
290924865e75SJeff Layton 		msg = ERR_CAST(req->r_altname);
291024865e75SJeff Layton 		req->r_altname = NULL;
291124865e75SJeff Layton 		goto out_free2;
291224865e75SJeff Layton 	}
291324865e75SJeff Layton 
2914ce0d5bd3SXiubo Li 	/*
2915ce0d5bd3SXiubo Li 	 * For old cephs without supporting the 32bit retry/fwd feature
2916ce0d5bd3SXiubo Li 	 * it will copy the raw memories directly when decoding the
2917ce0d5bd3SXiubo Li 	 * requests. While new cephs will decode the head depending the
2918ce0d5bd3SXiubo Li 	 * version member, so we need to make sure it will be compatible
2919ce0d5bd3SXiubo Li 	 * with them both.
2920ce0d5bd3SXiubo Li 	 */
2921ce0d5bd3SXiubo Li 	if (legacy)
2922ce0d5bd3SXiubo Li 		len = sizeof(struct ceph_mds_request_head_legacy);
2923ce0d5bd3SXiubo Li 	else if (old_version)
2924ce0d5bd3SXiubo Li 		len = sizeof(struct ceph_mds_request_head_old);
2925ce0d5bd3SXiubo Li 	else
2926ce0d5bd3SXiubo Li 		len = sizeof(struct ceph_mds_request_head);
29272f2dc053SSage Weil 
29282d332d5bSJeff Layton 	/* filepaths */
29292d332d5bSJeff Layton 	len += 2 * (1 + sizeof(u32) + sizeof(u64));
29302d332d5bSJeff Layton 	len += pathlen1 + pathlen2;
29312d332d5bSJeff Layton 
29322d332d5bSJeff Layton 	/* cap releases */
29332f2dc053SSage Weil 	len += sizeof(struct ceph_mds_request_release) *
29342f2dc053SSage Weil 		(!!req->r_inode_drop + !!req->r_dentry_drop +
29352f2dc053SSage Weil 		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
29364f1ddb1eSJeff Layton 
29372f2dc053SSage Weil 	if (req->r_dentry_drop)
2938c1dfc277SJeff Layton 		len += pathlen1;
29392f2dc053SSage Weil 	if (req->r_old_dentry_drop)
2940c1dfc277SJeff Layton 		len += pathlen2;
29412f2dc053SSage Weil 
29422d332d5bSJeff Layton 	/* MClientRequest tail */
29432d332d5bSJeff Layton 
29442d332d5bSJeff Layton 	/* req->r_stamp */
29452d332d5bSJeff Layton 	len += sizeof(struct ceph_timespec);
29462d332d5bSJeff Layton 
29472d332d5bSJeff Layton 	/* gid list */
29482d332d5bSJeff Layton 	len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
29492d332d5bSJeff Layton 
29502d332d5bSJeff Layton 	/* alternate name */
295124865e75SJeff Layton 	len += sizeof(u32) + req->r_altname_len;
29522d332d5bSJeff Layton 
29532d332d5bSJeff Layton 	/* fscrypt_auth */
29542d332d5bSJeff Layton 	len += sizeof(u32); // fscrypt_auth
29552d332d5bSJeff Layton 	if (req->r_fscrypt_auth)
29562d332d5bSJeff Layton 		len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
29572d332d5bSJeff Layton 
29582d332d5bSJeff Layton 	/* fscrypt_file */
29592d332d5bSJeff Layton 	len += sizeof(u32);
296016be62fcSJeff Layton 	if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
296116be62fcSJeff Layton 		len += sizeof(__le64);
29622d332d5bSJeff Layton 
29630d9c1ab3SIlya Dryomov 	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
2964a79832f2SSage Weil 	if (!msg) {
2965a79832f2SSage Weil 		msg = ERR_PTR(-ENOMEM);
29662f2dc053SSage Weil 		goto out_free2;
2967a79832f2SSage Weil 	}
29682f2dc053SSage Weil 
29696df058c0SSage Weil 	msg->hdr.tid = cpu_to_le64(req->r_tid);
29706df058c0SSage Weil 
2971ce0d5bd3SXiubo Li 	lhead = find_legacy_request_head(msg->front.iov_base,
2972ce0d5bd3SXiubo Li 					 session->s_con.peer_features);
2973ce0d5bd3SXiubo Li 
29744f1ddb1eSJeff Layton 	/*
2975ce0d5bd3SXiubo Li 	 * The ceph_mds_request_head_legacy didn't contain a version field, and
29764f1ddb1eSJeff Layton 	 * one was added when we moved the message version from 3->4.
29774f1ddb1eSJeff Layton 	 */
29784f1ddb1eSJeff Layton 	if (legacy) {
29794f1ddb1eSJeff Layton 		msg->hdr.version = cpu_to_le16(3);
2980ce0d5bd3SXiubo Li 		p = msg->front.iov_base + sizeof(*lhead);
2981ce0d5bd3SXiubo Li 	} else if (old_version) {
2982ce0d5bd3SXiubo Li 		struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
2983ce0d5bd3SXiubo Li 
2984ce0d5bd3SXiubo Li 		msg->hdr.version = cpu_to_le16(4);
2985ce0d5bd3SXiubo Li 		ohead->version = cpu_to_le16(1);
2986ce0d5bd3SXiubo Li 		p = msg->front.iov_base + sizeof(*ohead);
29874f1ddb1eSJeff Layton 	} else {
2988ce0d5bd3SXiubo Li 		struct ceph_mds_request_head *nhead = msg->front.iov_base;
29894f1ddb1eSJeff Layton 
29902d332d5bSJeff Layton 		msg->hdr.version = cpu_to_le16(6);
2991ce0d5bd3SXiubo Li 		nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
2992ce0d5bd3SXiubo Li 		p = msg->front.iov_base + sizeof(*nhead);
29934f1ddb1eSJeff Layton 	}
29944f1ddb1eSJeff Layton 
29952f2dc053SSage Weil 	end = msg->front.iov_base + msg->front.iov_len;
29962f2dc053SSage Weil 
2997ce0d5bd3SXiubo Li 	lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
2998ce0d5bd3SXiubo Li 	lhead->op = cpu_to_le32(req->r_op);
2999ce0d5bd3SXiubo Li 	lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
30007fe0cdebSJeff Layton 						  req->r_cred->fsuid));
3001ce0d5bd3SXiubo Li 	lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
30027fe0cdebSJeff Layton 						  req->r_cred->fsgid));
3003ce0d5bd3SXiubo Li 	lhead->ino = cpu_to_le64(req->r_deleg_ino);
3004ce0d5bd3SXiubo Li 	lhead->args = req->r_args;
30052f2dc053SSage Weil 
30062f2dc053SSage Weil 	ceph_encode_filepath(&p, end, ino1, path1);
30072f2dc053SSage Weil 	ceph_encode_filepath(&p, end, ino2, path2);
30082f2dc053SSage Weil 
3009e979cf50SSage Weil 	/* make note of release offset, in case we need to replay */
3010e979cf50SSage Weil 	req->r_request_release_offset = p - msg->front.iov_base;
3011e979cf50SSage Weil 
30122f2dc053SSage Weil 	/* cap releases */
30132f2dc053SSage Weil 	releases = 0;
30142f2dc053SSage Weil 	if (req->r_inode_drop)
30152f2dc053SSage Weil 		releases += ceph_encode_inode_release(&p,
30162b0143b5SDavid Howells 		      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
3017719a2514SYan, Zheng 		      mds, req->r_inode_drop, req->r_inode_unless,
3018719a2514SYan, Zheng 		      req->r_op == CEPH_MDS_OP_READDIR);
30193fd945a7SJeff Layton 	if (req->r_dentry_drop) {
30203fd945a7SJeff Layton 		ret = ceph_encode_dentry_release(&p, req->r_dentry,
30213dd69aabSJeff Layton 				req->r_parent, mds, req->r_dentry_drop,
3022ca6c8ae0SJeff Layton 				req->r_dentry_unless);
30233fd945a7SJeff Layton 		if (ret < 0)
30243fd945a7SJeff Layton 			goto out_err;
30253fd945a7SJeff Layton 		releases += ret;
30263fd945a7SJeff Layton 	}
30273fd945a7SJeff Layton 	if (req->r_old_dentry_drop) {
30283fd945a7SJeff Layton 		ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
3029ca6c8ae0SJeff Layton 				req->r_old_dentry_dir, mds,
3030ca6c8ae0SJeff Layton 				req->r_old_dentry_drop,
3031ca6c8ae0SJeff Layton 				req->r_old_dentry_unless);
30323fd945a7SJeff Layton 		if (ret < 0)
30333fd945a7SJeff Layton 			goto out_err;
30343fd945a7SJeff Layton 		releases += ret;
30353fd945a7SJeff Layton 	}
30362f2dc053SSage Weil 	if (req->r_old_inode_drop)
30372f2dc053SSage Weil 		releases += ceph_encode_inode_release(&p,
30382b0143b5SDavid Howells 		      d_inode(req->r_old_dentry),
30392f2dc053SSage Weil 		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
30406e6f0923SYan, Zheng 
30416e6f0923SYan, Zheng 	if (drop_cap_releases) {
30426e6f0923SYan, Zheng 		releases = 0;
30436e6f0923SYan, Zheng 		p = msg->front.iov_base + req->r_request_release_offset;
30446e6f0923SYan, Zheng 	}
30456e6f0923SYan, Zheng 
3046ce0d5bd3SXiubo Li 	lhead->num_releases = cpu_to_le16(releases);
30472f2dc053SSage Weil 
30482d332d5bSJeff Layton 	encode_mclientrequest_tail(&p, req);
30494f1ddb1eSJeff Layton 
3050b682c6d4SXiubo Li 	if (WARN_ON_ONCE(p > end)) {
3051b682c6d4SXiubo Li 		ceph_msg_put(msg);
3052b682c6d4SXiubo Li 		msg = ERR_PTR(-ERANGE);
3053b682c6d4SXiubo Li 		goto out_free2;
3054b682c6d4SXiubo Li 	}
3055b682c6d4SXiubo Li 
30562f2dc053SSage Weil 	msg->front.iov_len = p - msg->front.iov_base;
30572f2dc053SSage Weil 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
30582f2dc053SSage Weil 
305925e6bae3SYan, Zheng 	if (req->r_pagelist) {
306025e6bae3SYan, Zheng 		struct ceph_pagelist *pagelist = req->r_pagelist;
306125e6bae3SYan, Zheng 		ceph_msg_data_add_pagelist(msg, pagelist);
306225e6bae3SYan, Zheng 		msg->hdr.data_len = cpu_to_le32(pagelist->length);
306325e6bae3SYan, Zheng 	} else {
306425e6bae3SYan, Zheng 		msg->hdr.data_len = 0;
3065ebf18f47SAlex Elder 	}
306602afca6cSAlex Elder 
30672f2dc053SSage Weil 	msg->hdr.data_off = cpu_to_le16(0);
30682f2dc053SSage Weil 
30692f2dc053SSage Weil out_free2:
30702f2dc053SSage Weil 	if (freepath2)
3071f77f21bbSJeff Layton 		ceph_mdsc_free_path((char *)path2, pathlen2);
30722f2dc053SSage Weil out_free1:
30732f2dc053SSage Weil 	if (freepath1)
3074f77f21bbSJeff Layton 		ceph_mdsc_free_path((char *)path1, pathlen1);
30752f2dc053SSage Weil out:
30762f2dc053SSage Weil 	return msg;
30773fd945a7SJeff Layton out_err:
30783fd945a7SJeff Layton 	ceph_msg_put(msg);
30793fd945a7SJeff Layton 	msg = ERR_PTR(ret);
30803fd945a7SJeff Layton 	goto out_free2;
30812f2dc053SSage Weil }
30822f2dc053SSage Weil 
30832f2dc053SSage Weil /*
30842f2dc053SSage Weil  * called under mdsc->mutex if error, under no mutex if
30852f2dc053SSage Weil  * success.
30862f2dc053SSage Weil  */
complete_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)30872f2dc053SSage Weil static void complete_request(struct ceph_mds_client *mdsc,
30882f2dc053SSage Weil 			     struct ceph_mds_request *req)
30892f2dc053SSage Weil {
309070c94820SXiubo Li 	req->r_end_latency = ktime_get();
309170c94820SXiubo Li 
30922f2dc053SSage Weil 	if (req->r_callback)
30932f2dc053SSage Weil 		req->r_callback(mdsc, req);
309403066f23SYehuda Sadeh 	complete_all(&req->r_completion);
30952f2dc053SSage Weil }
30962f2dc053SSage Weil 
30972f2dc053SSage Weil /*
30982f2dc053SSage Weil  * called under mdsc->mutex
30992f2dc053SSage Weil  */
__prepare_send_request(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)3100396bd62cSJeff Layton static int __prepare_send_request(struct ceph_mds_session *session,
31012f2dc053SSage Weil 				  struct ceph_mds_request *req,
3102396bd62cSJeff Layton 				  bool drop_cap_releases)
31032f2dc053SSage Weil {
3104396bd62cSJeff Layton 	int mds = session->s_mds;
3105396bd62cSJeff Layton 	struct ceph_mds_client *mdsc = session->s_mdsc;
3106ce0d5bd3SXiubo Li 	struct ceph_mds_request_head_legacy *lhead;
3107ce0d5bd3SXiubo Li 	struct ceph_mds_request_head *nhead;
31082f2dc053SSage Weil 	struct ceph_msg *msg;
3109ce0d5bd3SXiubo Li 	int flags = 0, old_max_retry;
3110ce0d5bd3SXiubo Li 	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
3111ce0d5bd3SXiubo Li 				     &session->s_features);
3112546a5d61SXiubo Li 
3113546a5d61SXiubo Li 	/*
3114ce0d5bd3SXiubo Li 	 * Avoid inifinite retrying after overflow. The client will
3115ce0d5bd3SXiubo Li 	 * increase the retry count and if the MDS is old version,
3116ce0d5bd3SXiubo Li 	 * so we limit to retry at most 256 times.
3117546a5d61SXiubo Li 	 */
3118ce0d5bd3SXiubo Li 	if (req->r_attempts) {
3119ce0d5bd3SXiubo Li 	       old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
3120ce0d5bd3SXiubo Li 					    num_retry);
3121ce0d5bd3SXiubo Li 	       old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
3122ce0d5bd3SXiubo Li 	       if ((old_version && req->r_attempts >= old_max_retry) ||
3123ce0d5bd3SXiubo Li 		   ((uint32_t)req->r_attempts >= U32_MAX)) {
3124546a5d61SXiubo Li 			pr_warn_ratelimited("%s request tid %llu seq overflow\n",
3125546a5d61SXiubo Li 					    __func__, req->r_tid);
3126546a5d61SXiubo Li 			return -EMULTIHOP;
3127546a5d61SXiubo Li 	       }
3128ce0d5bd3SXiubo Li 	}
31292f2dc053SSage Weil 
31302f2dc053SSage Weil 	req->r_attempts++;
3131e55b71f8SGreg Farnum 	if (req->r_inode) {
3132e55b71f8SGreg Farnum 		struct ceph_cap *cap =
3133e55b71f8SGreg Farnum 			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
3134e55b71f8SGreg Farnum 
3135e55b71f8SGreg Farnum 		if (cap)
3136e55b71f8SGreg Farnum 			req->r_sent_on_mseq = cap->mseq;
3137e55b71f8SGreg Farnum 		else
3138e55b71f8SGreg Farnum 			req->r_sent_on_mseq = -1;
3139e55b71f8SGreg Farnum 	}
3140546a5d61SXiubo Li 	dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
31412f2dc053SSage Weil 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
31422f2dc053SSage Weil 
3143bc2de10dSJeff Layton 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
3144c5c9a0bfSYan, Zheng 		void *p;
31454f1ddb1eSJeff Layton 
314601a92f17SSage Weil 		/*
314701a92f17SSage Weil 		 * Replay.  Do not regenerate message (and rebuild
314801a92f17SSage Weil 		 * paths, etc.); just use the original message.
314901a92f17SSage Weil 		 * Rebuilding paths will break for renames because
315001a92f17SSage Weil 		 * d_move mangles the src name.
315101a92f17SSage Weil 		 */
315201a92f17SSage Weil 		msg = req->r_request;
3153ce0d5bd3SXiubo Li 		lhead = find_legacy_request_head(msg->front.iov_base,
31544f1ddb1eSJeff Layton 						 session->s_con.peer_features);
315501a92f17SSage Weil 
3156ce0d5bd3SXiubo Li 		flags = le32_to_cpu(lhead->flags);
315701a92f17SSage Weil 		flags |= CEPH_MDS_FLAG_REPLAY;
3158ce0d5bd3SXiubo Li 		lhead->flags = cpu_to_le32(flags);
315901a92f17SSage Weil 
316001a92f17SSage Weil 		if (req->r_target_inode)
3161ce0d5bd3SXiubo Li 			lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
316201a92f17SSage Weil 
3163ce0d5bd3SXiubo Li 		lhead->num_retry = req->r_attempts - 1;
3164ce0d5bd3SXiubo Li 		if (!old_version) {
3165ce0d5bd3SXiubo Li 			nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3166ce0d5bd3SXiubo Li 			nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3167ce0d5bd3SXiubo Li 		}
3168e979cf50SSage Weil 
3169e979cf50SSage Weil 		/* remove cap/dentry releases from message */
3170ce0d5bd3SXiubo Li 		lhead->num_releases = 0;
3171c5c9a0bfSYan, Zheng 
3172c5c9a0bfSYan, Zheng 		p = msg->front.iov_base + req->r_request_release_offset;
31732d332d5bSJeff Layton 		encode_mclientrequest_tail(&p, req);
3174c5c9a0bfSYan, Zheng 
3175c5c9a0bfSYan, Zheng 		msg->front.iov_len = p - msg->front.iov_base;
3176c5c9a0bfSYan, Zheng 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
317701a92f17SSage Weil 		return 0;
317801a92f17SSage Weil 	}
317901a92f17SSage Weil 
31802f2dc053SSage Weil 	if (req->r_request) {
31812f2dc053SSage Weil 		ceph_msg_put(req->r_request);
31822f2dc053SSage Weil 		req->r_request = NULL;
31832f2dc053SSage Weil 	}
31844f1ddb1eSJeff Layton 	msg = create_request_message(session, req, drop_cap_releases);
31852f2dc053SSage Weil 	if (IS_ERR(msg)) {
3186e1518c7cSSage Weil 		req->r_err = PTR_ERR(msg);
3187a79832f2SSage Weil 		return PTR_ERR(msg);
31882f2dc053SSage Weil 	}
31892f2dc053SSage Weil 	req->r_request = msg;
31902f2dc053SSage Weil 
3191ce0d5bd3SXiubo Li 	lhead = find_legacy_request_head(msg->front.iov_base,
31924f1ddb1eSJeff Layton 					 session->s_con.peer_features);
3193ce0d5bd3SXiubo Li 	lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
3194bc2de10dSJeff Layton 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
31952f2dc053SSage Weil 		flags |= CEPH_MDS_FLAG_REPLAY;
31963bb48b41SJeff Layton 	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
31973bb48b41SJeff Layton 		flags |= CEPH_MDS_FLAG_ASYNC;
31983dd69aabSJeff Layton 	if (req->r_parent)
31992f2dc053SSage Weil 		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
3200ce0d5bd3SXiubo Li 	lhead->flags = cpu_to_le32(flags);
3201ce0d5bd3SXiubo Li 	lhead->num_fwd = req->r_num_fwd;
3202ce0d5bd3SXiubo Li 	lhead->num_retry = req->r_attempts - 1;
3203ce0d5bd3SXiubo Li 	if (!old_version) {
3204ce0d5bd3SXiubo Li 		nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3205ce0d5bd3SXiubo Li 		nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
3206ce0d5bd3SXiubo Li 		nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3207ce0d5bd3SXiubo Li 	}
32082f2dc053SSage Weil 
32093dd69aabSJeff Layton 	dout(" r_parent = %p\n", req->r_parent);
32102f2dc053SSage Weil 	return 0;
32112f2dc053SSage Weil }
32122f2dc053SSage Weil 
32132f2dc053SSage Weil /*
32149cf54563SXiubo Li  * called under mdsc->mutex
32159cf54563SXiubo Li  */
__send_request(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)3216396bd62cSJeff Layton static int __send_request(struct ceph_mds_session *session,
32179cf54563SXiubo Li 			  struct ceph_mds_request *req,
32189cf54563SXiubo Li 			  bool drop_cap_releases)
32199cf54563SXiubo Li {
32209cf54563SXiubo Li 	int err;
32219cf54563SXiubo Li 
3222396bd62cSJeff Layton 	err = __prepare_send_request(session, req, drop_cap_releases);
32239cf54563SXiubo Li 	if (!err) {
32249cf54563SXiubo Li 		ceph_msg_get(req->r_request);
32259cf54563SXiubo Li 		ceph_con_send(&session->s_con, req->r_request);
32269cf54563SXiubo Li 	}
32279cf54563SXiubo Li 
32289cf54563SXiubo Li 	return err;
32299cf54563SXiubo Li }
32309cf54563SXiubo Li 
32319cf54563SXiubo Li /*
32322f2dc053SSage Weil  * send request, or put it on the appropriate wait list.
32332f2dc053SSage Weil  */
__do_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)3234d5548492SChengguang Xu static void __do_request(struct ceph_mds_client *mdsc,
32352f2dc053SSage Weil 			struct ceph_mds_request *req)
32362f2dc053SSage Weil {
32372f2dc053SSage Weil 	struct ceph_mds_session *session = NULL;
32382f2dc053SSage Weil 	int mds = -1;
323948fec5d0SYan, Zheng 	int err = 0;
3240c4853e97SXiubo Li 	bool random;
32412f2dc053SSage Weil 
3242bc2de10dSJeff Layton 	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
3243bc2de10dSJeff Layton 		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
3244eb1b8af3SYan, Zheng 			__unregister_request(mdsc, req);
3245d5548492SChengguang Xu 		return;
3246eb1b8af3SYan, Zheng 	}
32472f2dc053SSage Weil 
3248a68e564aSXiubo Li 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
3249a68e564aSXiubo Li 		dout("do_request metadata corrupted\n");
3250a68e564aSXiubo Li 		err = -EIO;
3251a68e564aSXiubo Li 		goto finish;
3252a68e564aSXiubo Li 	}
32532f2dc053SSage Weil 	if (req->r_timeout &&
32542f2dc053SSage Weil 	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
32552f2dc053SSage Weil 		dout("do_request timed out\n");
32568ccf7fccSXiubo Li 		err = -ETIMEDOUT;
32572f2dc053SSage Weil 		goto finish;
32582f2dc053SSage Weil 	}
325952953d55SSeraphime Kirkovski 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
326048fec5d0SYan, Zheng 		dout("do_request forced umount\n");
326148fec5d0SYan, Zheng 		err = -EIO;
326248fec5d0SYan, Zheng 		goto finish;
326348fec5d0SYan, Zheng 	}
326452953d55SSeraphime Kirkovski 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
3265e9e427f0SYan, Zheng 		if (mdsc->mdsmap_err) {
3266e9e427f0SYan, Zheng 			err = mdsc->mdsmap_err;
3267e9e427f0SYan, Zheng 			dout("do_request mdsmap err %d\n", err);
3268e9e427f0SYan, Zheng 			goto finish;
3269e9e427f0SYan, Zheng 		}
3270cc8e8342SYan, Zheng 		if (mdsc->mdsmap->m_epoch == 0) {
3271cc8e8342SYan, Zheng 			dout("do_request no mdsmap, waiting for map\n");
3272cc8e8342SYan, Zheng 			list_add(&req->r_wait, &mdsc->waiting_for_map);
3273d5548492SChengguang Xu 			return;
3274cc8e8342SYan, Zheng 		}
3275e9e427f0SYan, Zheng 		if (!(mdsc->fsc->mount_options->flags &
3276e9e427f0SYan, Zheng 		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
3277e9e427f0SYan, Zheng 		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
327897820058SXiubo Li 			err = -EHOSTUNREACH;
3279e9e427f0SYan, Zheng 			goto finish;
3280e9e427f0SYan, Zheng 		}
3281e9e427f0SYan, Zheng 	}
32822f2dc053SSage Weil 
3283dc69e2e9SSage Weil 	put_request_session(req);
3284dc69e2e9SSage Weil 
3285c4853e97SXiubo Li 	mds = __choose_mds(mdsc, req, &random);
32862f2dc053SSage Weil 	if (mds < 0 ||
32872f2dc053SSage Weil 	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
32883bb48b41SJeff Layton 		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
32893bb48b41SJeff Layton 			err = -EJUKEBOX;
32903bb48b41SJeff Layton 			goto finish;
32913bb48b41SJeff Layton 		}
32922f2dc053SSage Weil 		dout("do_request no mds or not active, waiting for map\n");
32932f2dc053SSage Weil 		list_add(&req->r_wait, &mdsc->waiting_for_map);
3294d5548492SChengguang Xu 		return;
32952f2dc053SSage Weil 	}
32962f2dc053SSage Weil 
32972f2dc053SSage Weil 	/* get, open session */
32982f2dc053SSage Weil 	session = __ceph_lookup_mds_session(mdsc, mds);
32999c423956SSage Weil 	if (!session) {
33002f2dc053SSage Weil 		session = register_session(mdsc, mds);
33019c423956SSage Weil 		if (IS_ERR(session)) {
33029c423956SSage Weil 			err = PTR_ERR(session);
33039c423956SSage Weil 			goto finish;
33049c423956SSage Weil 		}
33059c423956SSage Weil 	}
33065b3248c6SXiubo Li 	req->r_session = ceph_get_mds_session(session);
3307dc69e2e9SSage Weil 
33082f2dc053SSage Weil 	dout("do_request mds%d session %p state %s\n", mds, session,
3309a687ecafSJohn Spray 	     ceph_session_state_name(session->s_state));
33106eb06c46SXiubo Li 
33116eb06c46SXiubo Li 	/*
33126eb06c46SXiubo Li 	 * The old ceph will crash the MDSs when see unknown OPs
33136eb06c46SXiubo Li 	 */
33146eb06c46SXiubo Li 	if (req->r_feature_needed > 0 &&
33156eb06c46SXiubo Li 	    !test_bit(req->r_feature_needed, &session->s_features)) {
33166eb06c46SXiubo Li 		err = -EOPNOTSUPP;
33176eb06c46SXiubo Li 		goto out_session;
33186eb06c46SXiubo Li 	}
33196eb06c46SXiubo Li 
33202f2dc053SSage Weil 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
33212f2dc053SSage Weil 	    session->s_state != CEPH_MDS_SESSION_HUNG) {
33223bb48b41SJeff Layton 		/*
33233bb48b41SJeff Layton 		 * We cannot queue async requests since the caps and delegated
33243bb48b41SJeff Layton 		 * inodes are bound to the session. Just return -EJUKEBOX and
33253bb48b41SJeff Layton 		 * let the caller retry a sync request in that case.
33263bb48b41SJeff Layton 		 */
33273bb48b41SJeff Layton 		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
33283bb48b41SJeff Layton 			err = -EJUKEBOX;
33293bb48b41SJeff Layton 			goto out_session;
33303bb48b41SJeff Layton 		}
33314ae3713fSJeff Layton 
33324ae3713fSJeff Layton 		/*
33334ae3713fSJeff Layton 		 * If the session has been REJECTED, then return a hard error,
33344ae3713fSJeff Layton 		 * unless it's a CLEANRECOVER mount, in which case we'll queue
33354ae3713fSJeff Layton 		 * it to the mdsc queue.
33364ae3713fSJeff Layton 		 */
33374ae3713fSJeff Layton 		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
33384ae3713fSJeff Layton 			if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
33394ae3713fSJeff Layton 				list_add(&req->r_wait, &mdsc->waiting_for_map);
33404ae3713fSJeff Layton 			else
33414ae3713fSJeff Layton 				err = -EACCES;
33424ae3713fSJeff Layton 			goto out_session;
33434ae3713fSJeff Layton 		}
33444ae3713fSJeff Layton 
33452f2dc053SSage Weil 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
3346c4853e97SXiubo Li 		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
3347b682c6d4SXiubo Li 			err = __open_session(mdsc, session);
3348b682c6d4SXiubo Li 			if (err)
3349b682c6d4SXiubo Li 				goto out_session;
3350c4853e97SXiubo Li 			/* retry the same mds later */
3351c4853e97SXiubo Li 			if (random)
3352c4853e97SXiubo Li 				req->r_resend_mds = mds;
3353c4853e97SXiubo Li 		}
33542f2dc053SSage Weil 		list_add(&req->r_wait, &session->s_waiting);
33552f2dc053SSage Weil 		goto out_session;
33562f2dc053SSage Weil 	}
33572f2dc053SSage Weil 
33582f2dc053SSage Weil 	/* send request */
33592f2dc053SSage Weil 	req->r_resend_mds = -1;   /* forget any previous mds hint */
33602f2dc053SSage Weil 
33612f2dc053SSage Weil 	if (req->r_request_started == 0)   /* note request start time */
33622f2dc053SSage Weil 		req->r_request_started = jiffies;
33632f2dc053SSage Weil 
336400061645SXiubo Li 	/*
336500061645SXiubo Li 	 * For async create we will choose the auth MDS of frag in parent
336600061645SXiubo Li 	 * directory to send the request and ususally this works fine, but
336700061645SXiubo Li 	 * if the migrated the dirtory to another MDS before it could handle
336800061645SXiubo Li 	 * it the request will be forwarded.
336900061645SXiubo Li 	 *
337000061645SXiubo Li 	 * And then the auth cap will be changed.
337100061645SXiubo Li 	 */
337200061645SXiubo Li 	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
337300061645SXiubo Li 		struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
337400061645SXiubo Li 		struct ceph_inode_info *ci;
337500061645SXiubo Li 		struct ceph_cap *cap;
337600061645SXiubo Li 
337700061645SXiubo Li 		/*
337800061645SXiubo Li 		 * The request maybe handled very fast and the new inode
337900061645SXiubo Li 		 * hasn't been linked to the dentry yet. We need to wait
338000061645SXiubo Li 		 * for the ceph_finish_async_create(), which shouldn't be
338100061645SXiubo Li 		 * stuck too long or fail in thoery, to finish when forwarding
338200061645SXiubo Li 		 * the request.
338300061645SXiubo Li 		 */
338400061645SXiubo Li 		if (!d_inode(req->r_dentry)) {
338500061645SXiubo Li 			err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
338600061645SXiubo Li 					  TASK_KILLABLE);
338700061645SXiubo Li 			if (err) {
338800061645SXiubo Li 				mutex_lock(&req->r_fill_mutex);
338900061645SXiubo Li 				set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
339000061645SXiubo Li 				mutex_unlock(&req->r_fill_mutex);
339100061645SXiubo Li 				goto out_session;
339200061645SXiubo Li 			}
339300061645SXiubo Li 		}
339400061645SXiubo Li 
339500061645SXiubo Li 		ci = ceph_inode(d_inode(req->r_dentry));
339600061645SXiubo Li 
339700061645SXiubo Li 		spin_lock(&ci->i_ceph_lock);
339800061645SXiubo Li 		cap = ci->i_auth_cap;
339900061645SXiubo Li 		if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
340000061645SXiubo Li 			dout("do_request session changed for auth cap %d -> %d\n",
340100061645SXiubo Li 			     cap->session->s_mds, session->s_mds);
340200061645SXiubo Li 
340300061645SXiubo Li 			/* Remove the auth cap from old session */
340400061645SXiubo Li 			spin_lock(&cap->session->s_cap_lock);
340500061645SXiubo Li 			cap->session->s_nr_caps--;
340600061645SXiubo Li 			list_del_init(&cap->session_caps);
340700061645SXiubo Li 			spin_unlock(&cap->session->s_cap_lock);
340800061645SXiubo Li 
340900061645SXiubo Li 			/* Add the auth cap to the new session */
341000061645SXiubo Li 			cap->mds = mds;
341100061645SXiubo Li 			cap->session = session;
341200061645SXiubo Li 			spin_lock(&session->s_cap_lock);
341300061645SXiubo Li 			session->s_nr_caps++;
341400061645SXiubo Li 			list_add_tail(&cap->session_caps, &session->s_caps);
341500061645SXiubo Li 			spin_unlock(&session->s_cap_lock);
341600061645SXiubo Li 
341700061645SXiubo Li 			change_auth_cap_ses(ci, session);
341800061645SXiubo Li 		}
341900061645SXiubo Li 		spin_unlock(&ci->i_ceph_lock);
342000061645SXiubo Li 	}
342100061645SXiubo Li 
3422396bd62cSJeff Layton 	err = __send_request(session, req, false);
34232f2dc053SSage Weil 
34242f2dc053SSage Weil out_session:
34252f2dc053SSage Weil 	ceph_put_mds_session(session);
34262f2dc053SSage Weil finish:
342748fec5d0SYan, Zheng 	if (err) {
342848fec5d0SYan, Zheng 		dout("__do_request early error %d\n", err);
3429e1518c7cSSage Weil 		req->r_err = err;
34302f2dc053SSage Weil 		complete_request(mdsc, req);
343148fec5d0SYan, Zheng 		__unregister_request(mdsc, req);
343248fec5d0SYan, Zheng 	}
3433d5548492SChengguang Xu 	return;
34342f2dc053SSage Weil }
34352f2dc053SSage Weil 
34362f2dc053SSage Weil /*
34372f2dc053SSage Weil  * called under mdsc->mutex
34382f2dc053SSage Weil  */
__wake_requests(struct ceph_mds_client * mdsc,struct list_head * head)34392f2dc053SSage Weil static void __wake_requests(struct ceph_mds_client *mdsc,
34402f2dc053SSage Weil 			    struct list_head *head)
34412f2dc053SSage Weil {
3442ed75ec2cSYan, Zheng 	struct ceph_mds_request *req;
3443ed75ec2cSYan, Zheng 	LIST_HEAD(tmp_list);
34442f2dc053SSage Weil 
3445ed75ec2cSYan, Zheng 	list_splice_init(head, &tmp_list);
3446ed75ec2cSYan, Zheng 
3447ed75ec2cSYan, Zheng 	while (!list_empty(&tmp_list)) {
3448ed75ec2cSYan, Zheng 		req = list_entry(tmp_list.next,
3449ed75ec2cSYan, Zheng 				 struct ceph_mds_request, r_wait);
34502f2dc053SSage Weil 		list_del_init(&req->r_wait);
34517971bd92SSage Weil 		dout(" wake request %p tid %llu\n", req, req->r_tid);
34522f2dc053SSage Weil 		__do_request(mdsc, req);
34532f2dc053SSage Weil 	}
34542f2dc053SSage Weil }
34552f2dc053SSage Weil 
34562f2dc053SSage Weil /*
34572f2dc053SSage Weil  * Wake up threads with requests pending for @mds, so that they can
345829790f26SSage Weil  * resubmit their requests to a possibly different mds.
34592f2dc053SSage Weil  */
kick_requests(struct ceph_mds_client * mdsc,int mds)346029790f26SSage Weil static void kick_requests(struct ceph_mds_client *mdsc, int mds)
34612f2dc053SSage Weil {
346244ca18f2SSage Weil 	struct ceph_mds_request *req;
3463282c1052SYan, Zheng 	struct rb_node *p = rb_first(&mdsc->request_tree);
34642f2dc053SSage Weil 
34652f2dc053SSage Weil 	dout("kick_requests mds%d\n", mds);
3466282c1052SYan, Zheng 	while (p) {
346744ca18f2SSage Weil 		req = rb_entry(p, struct ceph_mds_request, r_node);
3468282c1052SYan, Zheng 		p = rb_next(p);
3469bc2de10dSJeff Layton 		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
34702f2dc053SSage Weil 			continue;
34713de22be6SYan, Zheng 		if (req->r_attempts > 0)
34723de22be6SYan, Zheng 			continue; /* only new requests */
347344ca18f2SSage Weil 		if (req->r_session &&
347444ca18f2SSage Weil 		    req->r_session->s_mds == mds) {
347544ca18f2SSage Weil 			dout(" kicking tid %llu\n", req->r_tid);
347603974e81SYan, Zheng 			list_del_init(&req->r_wait);
347744ca18f2SSage Weil 			__do_request(mdsc, req);
34782f2dc053SSage Weil 		}
34792f2dc053SSage Weil 	}
34802f2dc053SSage Weil }
34812f2dc053SSage Weil 
ceph_mdsc_submit_request(struct ceph_mds_client * mdsc,struct inode * dir,struct ceph_mds_request * req)348286bda539SJeff Layton int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
34832f2dc053SSage Weil 			      struct ceph_mds_request *req)
34842f2dc053SSage Weil {
3485891f3f5aSJeff Layton 	int err = 0;
348686bda539SJeff Layton 
348786bda539SJeff Layton 	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
348886bda539SJeff Layton 	if (req->r_inode)
348986bda539SJeff Layton 		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
34909c1c2b35SJeff Layton 	if (req->r_parent) {
3491719a2514SYan, Zheng 		struct ceph_inode_info *ci = ceph_inode(req->r_parent);
3492719a2514SYan, Zheng 		int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
3493719a2514SYan, Zheng 			    CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
3494719a2514SYan, Zheng 		spin_lock(&ci->i_ceph_lock);
3495719a2514SYan, Zheng 		ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
3496719a2514SYan, Zheng 		__ceph_touch_fmode(ci, mdsc, fmode);
3497719a2514SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
34989c1c2b35SJeff Layton 	}
349986bda539SJeff Layton 	if (req->r_old_dentry_dir)
350086bda539SJeff Layton 		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
350186bda539SJeff Layton 				  CEPH_CAP_PIN);
350286bda539SJeff Layton 
3503891f3f5aSJeff Layton 	if (req->r_inode) {
3504891f3f5aSJeff Layton 		err = ceph_wait_on_async_create(req->r_inode);
3505891f3f5aSJeff Layton 		if (err) {
3506891f3f5aSJeff Layton 			dout("%s: wait for async create returned: %d\n",
3507891f3f5aSJeff Layton 			     __func__, err);
3508891f3f5aSJeff Layton 			return err;
3509891f3f5aSJeff Layton 		}
3510891f3f5aSJeff Layton 	}
3511891f3f5aSJeff Layton 
3512891f3f5aSJeff Layton 	if (!err && req->r_old_inode) {
3513891f3f5aSJeff Layton 		err = ceph_wait_on_async_create(req->r_old_inode);
3514891f3f5aSJeff Layton 		if (err) {
3515891f3f5aSJeff Layton 			dout("%s: wait for async create returned: %d\n",
3516891f3f5aSJeff Layton 			     __func__, err);
3517891f3f5aSJeff Layton 			return err;
3518891f3f5aSJeff Layton 		}
3519891f3f5aSJeff Layton 	}
3520891f3f5aSJeff Layton 
352186bda539SJeff Layton 	dout("submit_request on %p for inode %p\n", req, dir);
35222f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
352386bda539SJeff Layton 	__register_request(mdsc, req, dir);
35242f2dc053SSage Weil 	__do_request(mdsc, req);
352586bda539SJeff Layton 	err = req->r_err;
35262f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
352786bda539SJeff Layton 	return err;
35282f2dc053SSage Weil }
35292f2dc053SSage Weil 
/*
 * Wait for @req to complete, using @wait_func if given or else a
 * killable wait bounded by req->r_timeout.
 *
 * Returns the result from the MDS reply if one was received, the wait
 * error (-ETIMEDOUT or the killed-wait error) if the wait failed first,
 * or req->r_err otherwise.  A failed wait marks the request aborted.
 */
int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
			   struct ceph_mds_request *req,
			   ceph_mds_request_wait_callback_t wait_func)
{
	int err;

	/* wait */
	dout("do_request waiting\n");
	if (!wait_func) {
		long timeleft = wait_for_completion_killable_timeout(
					&req->r_completion,
					ceph_timeout_jiffies(req->r_timeout));

		if (timeleft < 0)
			err = timeleft;  /* killed */
		else if (timeleft == 0)
			err = -ETIMEDOUT;  /* timed out */
		else
			err = 0;
	} else {
		err = wait_func(mdsc, req);
	}
	dout("do_request waited, got %d\n", err);

	mutex_lock(&mdsc->mutex);

	/* only abort if we didn't race with a real reply */
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		err = le32_to_cpu(req->r_reply_info.head->result);
	} else if (err >= 0) {
		err = req->r_err;
	} else {
		dout("aborted request %lld with %d\n", req->r_tid, err);

		/*
		 * Make sure we are not running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		/* an aborted write op on a parent dir invalidates that dir */
		if (req->r_parent &&
		    (req->r_op & CEPH_MDS_OP_WRITE))
			ceph_invalidate_dir_request(req);
	}

	mutex_unlock(&mdsc->mutex);
	return err;
}
35808340f22cSJeff Layton 
35818340f22cSJeff Layton /*
35828340f22cSJeff Layton  * Synchrously perform an mds request.  Take care of all of the
35838340f22cSJeff Layton  * session setup, forwarding, retry details.
35848340f22cSJeff Layton  */
ceph_mdsc_do_request(struct ceph_mds_client * mdsc,struct inode * dir,struct ceph_mds_request * req)35858340f22cSJeff Layton int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
35868340f22cSJeff Layton 			 struct inode *dir,
35878340f22cSJeff Layton 			 struct ceph_mds_request *req)
35888340f22cSJeff Layton {
35898340f22cSJeff Layton 	int err;
35908340f22cSJeff Layton 
35918340f22cSJeff Layton 	dout("do_request on %p\n", req);
35928340f22cSJeff Layton 
35938340f22cSJeff Layton 	/* issue */
35948340f22cSJeff Layton 	err = ceph_mdsc_submit_request(mdsc, dir, req);
35958340f22cSJeff Layton 	if (!err)
35969eaa7b79SJeff Layton 		err = ceph_mdsc_wait_request(mdsc, req, NULL);
35972f2dc053SSage Weil 	dout("do_request %p done, result %d\n", req, err);
35982f2dc053SSage Weil 	return err;
35992f2dc053SSage Weil }
36002f2dc053SSage Weil 
36012f2dc053SSage Weil /*
36022f276c51SYan, Zheng  * Invalidate dir's completeness, dentry lease state on an aborted MDS
3603167c9e35SSage Weil  * namespace request.
3604167c9e35SSage Weil  */
ceph_invalidate_dir_request(struct ceph_mds_request * req)3605167c9e35SSage Weil void ceph_invalidate_dir_request(struct ceph_mds_request *req)
3606167c9e35SSage Weil {
36078d8f371cSYan, Zheng 	struct inode *dir = req->r_parent;
36088d8f371cSYan, Zheng 	struct inode *old_dir = req->r_old_dentry_dir;
3609167c9e35SSage Weil 
36108d8f371cSYan, Zheng 	dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
3611167c9e35SSage Weil 
36128d8f371cSYan, Zheng 	ceph_dir_clear_complete(dir);
36138d8f371cSYan, Zheng 	if (old_dir)
36148d8f371cSYan, Zheng 		ceph_dir_clear_complete(old_dir);
3615167c9e35SSage Weil 	if (req->r_dentry)
3616167c9e35SSage Weil 		ceph_invalidate_dentry_lease(req->r_dentry);
3617167c9e35SSage Weil 	if (req->r_old_dentry)
3618167c9e35SSage Weil 		ceph_invalidate_dentry_lease(req->r_old_dentry);
3619167c9e35SSage Weil }
3620167c9e35SSage Weil 
3621167c9e35SSage Weil /*
36222f2dc053SSage Weil  * Handle mds reply.
36232f2dc053SSage Weil  *
36242f2dc053SSage Weil  * We take the session mutex and parse and process the reply immediately.
36252f2dc053SSage Weil  * This preserves the logical ordering of replies, capabilities, etc., sent
36262f2dc053SSage Weil  * by the MDS as they are applied to our local cache.
36272f2dc053SSage Weil  */
handle_reply(struct ceph_mds_session * session,struct ceph_msg * msg)36282f2dc053SSage Weil static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
36292f2dc053SSage Weil {
36302f2dc053SSage Weil 	struct ceph_mds_client *mdsc = session->s_mdsc;
36312f2dc053SSage Weil 	struct ceph_mds_request *req;
36322f2dc053SSage Weil 	struct ceph_mds_reply_head *head = msg->front.iov_base;
36332f2dc053SSage Weil 	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
3634982d6011SYan, Zheng 	struct ceph_snap_realm *realm;
36352f2dc053SSage Weil 	u64 tid;
36362f2dc053SSage Weil 	int err, result;
36372600d2ddSSage Weil 	int mds = session->s_mds;
3638a68e564aSXiubo Li 	bool close_sessions = false;
36392f2dc053SSage Weil 
36402f2dc053SSage Weil 	if (msg->front.iov_len < sizeof(*head)) {
36412f2dc053SSage Weil 		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
36429ec7cab1SSage Weil 		ceph_msg_dump(msg);
36432f2dc053SSage Weil 		return;
36442f2dc053SSage Weil 	}
36452f2dc053SSage Weil 
36462f2dc053SSage Weil 	/* get request, session */
36476df058c0SSage Weil 	tid = le64_to_cpu(msg->hdr.tid);
36482f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
3649fcd00b68SIlya Dryomov 	req = lookup_get_request(mdsc, tid);
36502f2dc053SSage Weil 	if (!req) {
36512f2dc053SSage Weil 		dout("handle_reply on unknown tid %llu\n", tid);
36522f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
36532f2dc053SSage Weil 		return;
36542f2dc053SSage Weil 	}
36552f2dc053SSage Weil 	dout("handle_reply %p\n", req);
36562f2dc053SSage Weil 
36572f2dc053SSage Weil 	/* correct session? */
3658d96d6049SSage Weil 	if (req->r_session != session) {
36592f2dc053SSage Weil 		pr_err("mdsc_handle_reply got %llu on session mds%d"
36602f2dc053SSage Weil 		       " not mds%d\n", tid, session->s_mds,
36612f2dc053SSage Weil 		       req->r_session ? req->r_session->s_mds : -1);
36622f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
36632f2dc053SSage Weil 		goto out;
36642f2dc053SSage Weil 	}
36652f2dc053SSage Weil 
36662f2dc053SSage Weil 	/* dup? */
3667bc2de10dSJeff Layton 	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
3668bc2de10dSJeff Layton 	    (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
3669f3ae1b97SFabian Frederick 		pr_warn("got a dup %s reply on %llu from mds%d\n",
36702f2dc053SSage Weil 			   head->safe ? "safe" : "unsafe", tid, mds);
36712f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
36722f2dc053SSage Weil 		goto out;
36732f2dc053SSage Weil 	}
3674bc2de10dSJeff Layton 	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
3675f3ae1b97SFabian Frederick 		pr_warn("got unsafe after safe on %llu from mds%d\n",
367685792d0dSSage Weil 			   tid, mds);
367785792d0dSSage Weil 		mutex_unlock(&mdsc->mutex);
367885792d0dSSage Weil 		goto out;
367985792d0dSSage Weil 	}
36802f2dc053SSage Weil 
36812f2dc053SSage Weil 	result = le32_to_cpu(head->result);
36822f2dc053SSage Weil 
36832f2dc053SSage Weil 	if (head->safe) {
3684bc2de10dSJeff Layton 		set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
36852f2dc053SSage Weil 		__unregister_request(mdsc, req);
36862f2dc053SSage Weil 
368707edc057SXiubo Li 		/* last request during umount? */
368807edc057SXiubo Li 		if (mdsc->stopping && !__get_oldest_req(mdsc))
368907edc057SXiubo Li 			complete_all(&mdsc->safe_umount_waiters);
369007edc057SXiubo Li 
3691bc2de10dSJeff Layton 		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
36922f2dc053SSage Weil 			/*
36932f2dc053SSage Weil 			 * We already handled the unsafe response, now do the
36942f2dc053SSage Weil 			 * cleanup.  No need to examine the response; the MDS
36952f2dc053SSage Weil 			 * doesn't include any result info in the safe
36962f2dc053SSage Weil 			 * response.  And even if it did, there is nothing
36972f2dc053SSage Weil 			 * useful we could do with a revised return value.
36982f2dc053SSage Weil 			 */
36992f2dc053SSage Weil 			dout("got safe reply %llu, mds%d\n", tid, mds);
37002f2dc053SSage Weil 
37012f2dc053SSage Weil 			mutex_unlock(&mdsc->mutex);
37022f2dc053SSage Weil 			goto out;
37032f2dc053SSage Weil 		}
3704e1518c7cSSage Weil 	} else {
3705bc2de10dSJeff Layton 		set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
37062f2dc053SSage Weil 		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
37072f2dc053SSage Weil 	}
37082f2dc053SSage Weil 
37092f2dc053SSage Weil 	dout("handle_reply tid %lld result %d\n", tid, result);
3710b37fe1f9SYan, Zheng 	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
37113859af9eSXiubo Li 		err = parse_reply_info(session, msg, req, (u64)-1);
3712b37fe1f9SYan, Zheng 	else
37133859af9eSXiubo Li 		err = parse_reply_info(session, msg, req,
37143859af9eSXiubo Li 				       session->s_con.peer_features);
37152f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
37162f2dc053SSage Weil 
3717bca9fc14SJeff Layton 	/* Must find target inode outside of mutexes to avoid deadlocks */
37183859af9eSXiubo Li 	rinfo = &req->r_reply_info;
3719bca9fc14SJeff Layton 	if ((err >= 0) && rinfo->head->is_target) {
3720ec9595c0SJeff Layton 		struct inode *in = xchg(&req->r_new_inode, NULL);
3721bca9fc14SJeff Layton 		struct ceph_vino tvino = {
3722bca9fc14SJeff Layton 			.ino  = le64_to_cpu(rinfo->targeti.in->ino),
3723bca9fc14SJeff Layton 			.snap = le64_to_cpu(rinfo->targeti.in->snapid)
3724bca9fc14SJeff Layton 		};
3725bca9fc14SJeff Layton 
3726ec9595c0SJeff Layton 		/*
3727ec9595c0SJeff Layton 		 * If we ended up opening an existing inode, discard
3728ec9595c0SJeff Layton 		 * r_new_inode
3729ec9595c0SJeff Layton 		 */
3730ec9595c0SJeff Layton 		if (req->r_op == CEPH_MDS_OP_CREATE &&
3731ec9595c0SJeff Layton 		    !req->r_reply_info.has_create_ino) {
3732ec9595c0SJeff Layton 			/* This should never happen on an async create */
3733ec9595c0SJeff Layton 			WARN_ON_ONCE(req->r_deleg_ino);
3734ec9595c0SJeff Layton 			iput(in);
3735ec9595c0SJeff Layton 			in = NULL;
3736ec9595c0SJeff Layton 		}
3737ec9595c0SJeff Layton 
3738ec9595c0SJeff Layton 		in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
3739bca9fc14SJeff Layton 		if (IS_ERR(in)) {
3740bca9fc14SJeff Layton 			err = PTR_ERR(in);
3741bca9fc14SJeff Layton 			mutex_lock(&session->s_mutex);
3742bca9fc14SJeff Layton 			goto out_err;
3743bca9fc14SJeff Layton 		}
3744bca9fc14SJeff Layton 		req->r_target_inode = in;
3745bca9fc14SJeff Layton 	}
3746bca9fc14SJeff Layton 
37472f2dc053SSage Weil 	mutex_lock(&session->s_mutex);
37482f2dc053SSage Weil 	if (err < 0) {
374925933abdSHerb Shiu 		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
37509ec7cab1SSage Weil 		ceph_msg_dump(msg);
37512f2dc053SSage Weil 		goto out_err;
37522f2dc053SSage Weil 	}
37532f2dc053SSage Weil 
37542f2dc053SSage Weil 	/* snap trace */
3755982d6011SYan, Zheng 	realm = NULL;
37562f2dc053SSage Weil 	if (rinfo->snapblob_len) {
37572f2dc053SSage Weil 		down_write(&mdsc->snap_rwsem);
3758a68e564aSXiubo Li 		err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
37592f2dc053SSage Weil 				rinfo->snapblob + rinfo->snapblob_len,
3760982d6011SYan, Zheng 				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
3761982d6011SYan, Zheng 				&realm);
3762a68e564aSXiubo Li 		if (err) {
3763a68e564aSXiubo Li 			up_write(&mdsc->snap_rwsem);
3764a68e564aSXiubo Li 			close_sessions = true;
3765a68e564aSXiubo Li 			if (err == -EIO)
3766a68e564aSXiubo Li 				ceph_msg_dump(msg);
3767a68e564aSXiubo Li 			goto out_err;
3768a68e564aSXiubo Li 		}
37692f2dc053SSage Weil 		downgrade_write(&mdsc->snap_rwsem);
37702f2dc053SSage Weil 	} else {
37712f2dc053SSage Weil 		down_read(&mdsc->snap_rwsem);
37722f2dc053SSage Weil 	}
37732f2dc053SSage Weil 
37742f2dc053SSage Weil 	/* insert trace into our cache */
3775b4556396SSage Weil 	mutex_lock(&req->r_fill_mutex);
3776315f2408SYan, Zheng 	current->journal_info = req;
3777f5a03b08SJeff Layton 	err = ceph_fill_trace(mdsc->fsc->sb, req);
37782f2dc053SSage Weil 	if (err == 0) {
37796e8575faSSam Lang 		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
378081c6aea5SYan, Zheng 				    req->r_op == CEPH_MDS_OP_LSSNAP))
3781af9ffa6dSXiubo Li 			err = ceph_readdir_prepopulate(req, req->r_session);
37822f2dc053SSage Weil 	}
3783315f2408SYan, Zheng 	current->journal_info = NULL;
3784b4556396SSage Weil 	mutex_unlock(&req->r_fill_mutex);
37852f2dc053SSage Weil 
37862f2dc053SSage Weil 	up_read(&mdsc->snap_rwsem);
3787982d6011SYan, Zheng 	if (realm)
3788982d6011SYan, Zheng 		ceph_put_snap_realm(mdsc, realm);
378968cd5b4bSYan, Zheng 
3790fe33032dSYan, Zheng 	if (err == 0) {
3791fe33032dSYan, Zheng 		if (req->r_target_inode &&
3792bc2de10dSJeff Layton 		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
3793fe33032dSYan, Zheng 			struct ceph_inode_info *ci =
3794fe33032dSYan, Zheng 				ceph_inode(req->r_target_inode);
379568cd5b4bSYan, Zheng 			spin_lock(&ci->i_unsafe_lock);
3796fe33032dSYan, Zheng 			list_add_tail(&req->r_unsafe_target_item,
3797fe33032dSYan, Zheng 				      &ci->i_unsafe_iops);
379868cd5b4bSYan, Zheng 			spin_unlock(&ci->i_unsafe_lock);
379968cd5b4bSYan, Zheng 		}
3800fe33032dSYan, Zheng 
3801fe33032dSYan, Zheng 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
3802fe33032dSYan, Zheng 	}
38032f2dc053SSage Weil out_err:
3804e1518c7cSSage Weil 	mutex_lock(&mdsc->mutex);
3805bc2de10dSJeff Layton 	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
38062f2dc053SSage Weil 		if (err) {
38072f2dc053SSage Weil 			req->r_err = err;
38082f2dc053SSage Weil 		} else {
38095fdb1389SJianpeng Ma 			req->r_reply =  ceph_msg_get(msg);
3810bc2de10dSJeff Layton 			set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
38112f2dc053SSage Weil 		}
3812e1518c7cSSage Weil 	} else {
3813e1518c7cSSage Weil 		dout("reply arrived after request %lld was aborted\n", tid);
3814e1518c7cSSage Weil 	}
3815e1518c7cSSage Weil 	mutex_unlock(&mdsc->mutex);
38162f2dc053SSage Weil 
38172f2dc053SSage Weil 	mutex_unlock(&session->s_mutex);
38182f2dc053SSage Weil 
38192f2dc053SSage Weil 	/* kick calling process */
38202f2dc053SSage Weil 	complete_request(mdsc, req);
382170c94820SXiubo Li 
38228ae99ae2SXiubo Li 	ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
382370c94820SXiubo Li 				     req->r_end_latency, err);
38242f2dc053SSage Weil out:
38252f2dc053SSage Weil 	ceph_mdsc_put_request(req);
3826a68e564aSXiubo Li 
3827a68e564aSXiubo Li 	/* Defer closing the sessions after s_mutex lock being released */
3828a68e564aSXiubo Li 	if (close_sessions)
3829a68e564aSXiubo Li 		ceph_mdsc_close_sessions(mdsc);
38302f2dc053SSage Weil 	return;
38312f2dc053SSage Weil }
38322f2dc053SSage Weil 
38332f2dc053SSage Weil 
38342f2dc053SSage Weil 
38352f2dc053SSage Weil /*
38362f2dc053SSage Weil  * handle mds notification that our request has been forwarded.
38372f2dc053SSage Weil  */
handle_forward(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,struct ceph_msg * msg)38382600d2ddSSage Weil static void handle_forward(struct ceph_mds_client *mdsc,
38392600d2ddSSage Weil 			   struct ceph_mds_session *session,
38402600d2ddSSage Weil 			   struct ceph_msg *msg)
38412f2dc053SSage Weil {
38422f2dc053SSage Weil 	struct ceph_mds_request *req;
3843a1ea787cSSage Weil 	u64 tid = le64_to_cpu(msg->hdr.tid);
38442f2dc053SSage Weil 	u32 next_mds;
38452f2dc053SSage Weil 	u32 fwd_seq;
38462f2dc053SSage Weil 	int err = -EINVAL;
38472f2dc053SSage Weil 	void *p = msg->front.iov_base;
38482f2dc053SSage Weil 	void *end = p + msg->front.iov_len;
38491980b1bfSXiubo Li 	bool aborted = false;
38502f2dc053SSage Weil 
3851a1ea787cSSage Weil 	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
3852c89136eaSSage Weil 	next_mds = ceph_decode_32(&p);
3853c89136eaSSage Weil 	fwd_seq = ceph_decode_32(&p);
38542f2dc053SSage Weil 
38552f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
3856fcd00b68SIlya Dryomov 	req = lookup_get_request(mdsc, tid);
38572f2dc053SSage Weil 	if (!req) {
38581980b1bfSXiubo Li 		mutex_unlock(&mdsc->mutex);
38592a8e5e36SSage Weil 		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
38601980b1bfSXiubo Li 		return;  /* dup reply? */
38612f2dc053SSage Weil 	}
38622f2dc053SSage Weil 
3863bc2de10dSJeff Layton 	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
38642a8e5e36SSage Weil 		dout("forward tid %llu aborted, unregistering\n", tid);
38652a8e5e36SSage Weil 		__unregister_request(mdsc, req);
3866ce0d5bd3SXiubo Li 	} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
38671980b1bfSXiubo Li 		/*
3868ce0d5bd3SXiubo Li 		 * Avoid inifinite retrying after overflow.
38691980b1bfSXiubo Li 		 *
3870ce0d5bd3SXiubo Li 		 * The MDS will increase the fwd count and in client side
3871ce0d5bd3SXiubo Li 		 * if the num_fwd is less than the one saved in request
3872ce0d5bd3SXiubo Li 		 * that means the MDS is an old version and overflowed of
3873ce0d5bd3SXiubo Li 		 * 8 bits.
38741980b1bfSXiubo Li 		 */
38751980b1bfSXiubo Li 		mutex_lock(&req->r_fill_mutex);
38761980b1bfSXiubo Li 		req->r_err = -EMULTIHOP;
38771980b1bfSXiubo Li 		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
38781980b1bfSXiubo Li 		mutex_unlock(&req->r_fill_mutex);
38791980b1bfSXiubo Li 		aborted = true;
3880ce0d5bd3SXiubo Li 		pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
38812f2dc053SSage Weil 	} else {
38822f2dc053SSage Weil 		/* resend. forward race not possible; mds would drop */
38832a8e5e36SSage Weil 		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
38842a8e5e36SSage Weil 		BUG_ON(req->r_err);
3885bc2de10dSJeff Layton 		BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
38863de22be6SYan, Zheng 		req->r_attempts = 0;
38872f2dc053SSage Weil 		req->r_num_fwd = fwd_seq;
38882f2dc053SSage Weil 		req->r_resend_mds = next_mds;
38892f2dc053SSage Weil 		put_request_session(req);
38902f2dc053SSage Weil 		__do_request(mdsc, req);
38912f2dc053SSage Weil 	}
38922f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
38931980b1bfSXiubo Li 
38941980b1bfSXiubo Li 	/* kick calling process */
38951980b1bfSXiubo Li 	if (aborted)
38961980b1bfSXiubo Li 		complete_request(mdsc, req);
38971980b1bfSXiubo Li 	ceph_mdsc_put_request(req);
38982f2dc053SSage Weil 	return;
38992f2dc053SSage Weil 
39002f2dc053SSage Weil bad:
39012f2dc053SSage Weil 	pr_err("mdsc_handle_forward decode error err=%d\n", err);
39028b0da5c5SXiubo Li 	ceph_msg_dump(msg);
39032f2dc053SSage Weil }
39042f2dc053SSage Weil 
/*
 * Walk an encoded map<string,string> of session metadata and look for a
 * blocklist notice.
 *
 * @p:           decode cursor; advanced past every entry that was consumed
 * @end:         end of the readable buffer
 * @blocklisted: set to true when an entry keyed "error_string" has a value
 *               containing "blacklisted"; left untouched otherwise
 *
 * Returns 0 when the whole map was decoded, -1 if the buffer ran out.
 */
static int __decode_session_metadata(void **p, void *end,
				     bool *blocklisted)
{
	static const char err_key[] = "error_string";
	u32 n;

	/* map<string,string> */
	ceph_decode_32_safe(p, end, n, bad);
	while (n-- > 0) {
		bool err_str;
		u32 len;

		/* key */
		ceph_decode_32_safe(p, end, len, bad);
		ceph_decode_need(p, end, len, bad);
		/*
		 * The key is length-prefixed, not NUL-terminated.  Comparing
		 * only the first len bytes would also accept an empty key or
		 * any prefix of "error_string", so insist on an exact length
		 * match as well.
		 */
		err_str = len == sizeof(err_key) - 1 &&
			  !strncmp(*p, err_key, len);
		*p += len;

		/* value */
		ceph_decode_32_safe(p, end, len, bad);
		ceph_decode_need(p, end, len, bad);
		/*
		 * Match "blocklisted (blacklisted)" from newer MDSes,
		 * or "blacklisted" from older MDSes.
		 */
		if (err_str && strnstr(*p, "blacklisted", len))
			*blocklisted = true;
		*p += len;
	}
	return 0;
bad:
	return -1;
}
393284bf3950SYan, Zheng 
39332f2dc053SSage Weil /*
39342f2dc053SSage Weil  * handle a mds session control message
39352f2dc053SSage Weil  */
handle_session(struct ceph_mds_session * session,struct ceph_msg * msg)39362f2dc053SSage Weil static void handle_session(struct ceph_mds_session *session,
39372f2dc053SSage Weil 			   struct ceph_msg *msg)
39382f2dc053SSage Weil {
39392f2dc053SSage Weil 	struct ceph_mds_client *mdsc = session->s_mdsc;
394084bf3950SYan, Zheng 	int mds = session->s_mds;
394184bf3950SYan, Zheng 	int msg_version = le16_to_cpu(msg->hdr.version);
394284bf3950SYan, Zheng 	void *p = msg->front.iov_base;
394384bf3950SYan, Zheng 	void *end = p + msg->front.iov_len;
394484bf3950SYan, Zheng 	struct ceph_mds_session_head *h;
39452f2dc053SSage Weil 	u32 op;
39460fa82633SJeff Layton 	u64 seq, features = 0;
39472f2dc053SSage Weil 	int wake = 0;
39480b98acd6SIlya Dryomov 	bool blocklisted = false;
39492f2dc053SSage Weil 
39502f2dc053SSage Weil 	/* decode */
395184bf3950SYan, Zheng 	ceph_decode_need(&p, end, sizeof(*h), bad);
395284bf3950SYan, Zheng 	h = p;
395384bf3950SYan, Zheng 	p += sizeof(*h);
395484bf3950SYan, Zheng 
39552f2dc053SSage Weil 	op = le32_to_cpu(h->op);
39562f2dc053SSage Weil 	seq = le64_to_cpu(h->seq);
39572f2dc053SSage Weil 
395884bf3950SYan, Zheng 	if (msg_version >= 3) {
395984bf3950SYan, Zheng 		u32 len;
3960e1c9788cSKotresh HR 		/* version >= 2 and < 5, decode metadata, skip otherwise
3961e1c9788cSKotresh HR 		 * as it's handled via flags.
3962e1c9788cSKotresh HR 		 */
3963e1c9788cSKotresh HR 		if (msg_version >= 5)
3964e1c9788cSKotresh HR 			ceph_decode_skip_map(&p, end, string, string, bad);
3965e1c9788cSKotresh HR 		else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
396684bf3950SYan, Zheng 			goto bad;
3967e1c9788cSKotresh HR 
396884bf3950SYan, Zheng 		/* version >= 3, feature bits */
396984bf3950SYan, Zheng 		ceph_decode_32_safe(&p, end, len, bad);
397002e37571SJeff Layton 		if (len) {
39710fa82633SJeff Layton 			ceph_decode_64_safe(&p, end, features, bad);
39720fa82633SJeff Layton 			p += len - sizeof(features);
397384bf3950SYan, Zheng 		}
397402e37571SJeff Layton 	}
397584bf3950SYan, Zheng 
3976e1c9788cSKotresh HR 	if (msg_version >= 5) {
3977ea16567fSLuís Henriques 		u32 flags, len;
3978ea16567fSLuís Henriques 
3979ea16567fSLuís Henriques 		/* version >= 4 */
3980ea16567fSLuís Henriques 		ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
3981ea16567fSLuís Henriques 		ceph_decode_32_safe(&p, end, len, bad); /* len */
3982ea16567fSLuís Henriques 		ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
3983ea16567fSLuís Henriques 
3984e1c9788cSKotresh HR 		/* version >= 5, flags   */
3985e1c9788cSKotresh HR 		ceph_decode_32_safe(&p, end, flags, bad);
3986e1c9788cSKotresh HR 		if (flags & CEPH_SESSION_BLOCKLISTED) {
3987e1c9788cSKotresh HR 			pr_warn("mds%d session blocklisted\n", session->s_mds);
3988e1c9788cSKotresh HR 			blocklisted = true;
3989e1c9788cSKotresh HR 		}
3990e1c9788cSKotresh HR 	}
3991e1c9788cSKotresh HR 
39922f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
39930a07fc8cSYan, Zheng 	if (op == CEPH_SESSION_CLOSE) {
39945b3248c6SXiubo Li 		ceph_get_mds_session(session);
39952600d2ddSSage Weil 		__unregister_session(mdsc, session);
39960a07fc8cSYan, Zheng 	}
39972f2dc053SSage Weil 	/* FIXME: this ttl calculation is generous */
39982f2dc053SSage Weil 	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
39992f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
40002f2dc053SSage Weil 
40012f2dc053SSage Weil 	mutex_lock(&session->s_mutex);
40022f2dc053SSage Weil 
40032f2dc053SSage Weil 	dout("handle_session mds%d %s %p state %s seq %llu\n",
40042f2dc053SSage Weil 	     mds, ceph_session_op_name(op), session,
4005a687ecafSJohn Spray 	     ceph_session_state_name(session->s_state), seq);
40062f2dc053SSage Weil 
40072f2dc053SSage Weil 	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
40082f2dc053SSage Weil 		session->s_state = CEPH_MDS_SESSION_OPEN;
40092f2dc053SSage Weil 		pr_info("mds%d came back\n", session->s_mds);
40102f2dc053SSage Weil 	}
40112f2dc053SSage Weil 
40122f2dc053SSage Weil 	switch (op) {
40132f2dc053SSage Weil 	case CEPH_SESSION_OPEN:
401429790f26SSage Weil 		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
401529790f26SSage Weil 			pr_info("mds%d reconnect success\n", session->s_mds);
4016300e42a2SXiubo Li 
4017987219b3SVenky Shankar 		session->s_features = features;
4018300e42a2SXiubo Li 		if (session->s_state == CEPH_MDS_SESSION_OPEN) {
4019300e42a2SXiubo Li 			pr_notice("mds%d is already opened\n", session->s_mds);
4020300e42a2SXiubo Li 		} else {
40212f2dc053SSage Weil 			session->s_state = CEPH_MDS_SESSION_OPEN;
40222f2dc053SSage Weil 			renewed_caps(mdsc, session, 0);
4023300e42a2SXiubo Li 			if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
4024300e42a2SXiubo Li 				     &session->s_features))
402518f473b3SXiubo Li 				metric_schedule_delayed(&mdsc->metric);
4026300e42a2SXiubo Li 		}
4027300e42a2SXiubo Li 
4028300e42a2SXiubo Li 		/*
4029300e42a2SXiubo Li 		 * The connection maybe broken and the session in client
4030300e42a2SXiubo Li 		 * side has been reinitialized, need to update the seq
4031300e42a2SXiubo Li 		 * anyway.
4032300e42a2SXiubo Li 		 */
4033300e42a2SXiubo Li 		if (!session->s_seq && seq)
4034300e42a2SXiubo Li 			session->s_seq = seq;
4035300e42a2SXiubo Li 
40362f2dc053SSage Weil 		wake = 1;
40372f2dc053SSage Weil 		if (mdsc->stopping)
40382f2dc053SSage Weil 			__close_session(mdsc, session);
40392f2dc053SSage Weil 		break;
40402f2dc053SSage Weil 
40412f2dc053SSage Weil 	case CEPH_SESSION_RENEWCAPS:
40422f2dc053SSage Weil 		if (session->s_renew_seq == seq)
40432f2dc053SSage Weil 			renewed_caps(mdsc, session, 1);
40442f2dc053SSage Weil 		break;
40452f2dc053SSage Weil 
40462f2dc053SSage Weil 	case CEPH_SESSION_CLOSE:
404729790f26SSage Weil 		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
404829790f26SSage Weil 			pr_info("mds%d reconnect denied\n", session->s_mds);
40494d681c2fSXiubo Li 		session->s_state = CEPH_MDS_SESSION_CLOSED;
40501c841a96SYan, Zheng 		cleanup_session_requests(mdsc, session);
40512f2dc053SSage Weil 		remove_session_caps(session);
4052656e4382SYan, Zheng 		wake = 2; /* for good measure */
4053f3c60c59SSage Weil 		wake_up_all(&mdsc->session_close_wq);
40542f2dc053SSage Weil 		break;
40552f2dc053SSage Weil 
40562f2dc053SSage Weil 	case CEPH_SESSION_STALE:
40572f2dc053SSage Weil 		pr_info("mds%d caps went stale, renewing\n",
40582f2dc053SSage Weil 			session->s_mds);
405952d60f8eSJeff Layton 		atomic_inc(&session->s_cap_gen);
40601ce208a6SAlex Elder 		session->s_cap_ttl = jiffies - 1;
40612f2dc053SSage Weil 		send_renew_caps(mdsc, session);
40622f2dc053SSage Weil 		break;
40632f2dc053SSage Weil 
40642f2dc053SSage Weil 	case CEPH_SESSION_RECALL_STATE:
4065e30ee581SZhi Zhang 		ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
40662f2dc053SSage Weil 		break;
40672f2dc053SSage Weil 
4068186e4f7aSYan, Zheng 	case CEPH_SESSION_FLUSHMSG:
4069e7d84c6aSXiubo Li 		/* flush cap releases */
4070e7d84c6aSXiubo Li 		spin_lock(&session->s_cap_lock);
4071e7d84c6aSXiubo Li 		if (session->s_num_cap_releases)
4072e7d84c6aSXiubo Li 			ceph_flush_cap_releases(mdsc, session);
4073e7d84c6aSXiubo Li 		spin_unlock(&session->s_cap_lock);
4074e7d84c6aSXiubo Li 
4075186e4f7aSYan, Zheng 		send_flushmsg_ack(mdsc, session, seq);
4076186e4f7aSYan, Zheng 		break;
4077186e4f7aSYan, Zheng 
407803f4fcb0SYan, Zheng 	case CEPH_SESSION_FORCE_RO:
407903f4fcb0SYan, Zheng 		dout("force_session_readonly %p\n", session);
408003f4fcb0SYan, Zheng 		spin_lock(&session->s_cap_lock);
408103f4fcb0SYan, Zheng 		session->s_readonly = true;
408203f4fcb0SYan, Zheng 		spin_unlock(&session->s_cap_lock);
4083d2f8bb27SYan, Zheng 		wake_up_session_caps(session, FORCE_RO);
408403f4fcb0SYan, Zheng 		break;
408503f4fcb0SYan, Zheng 
4086fcff415cSYan, Zheng 	case CEPH_SESSION_REJECT:
4087fcff415cSYan, Zheng 		WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
4088fcff415cSYan, Zheng 		pr_info("mds%d rejected session\n", session->s_mds);
4089fcff415cSYan, Zheng 		session->s_state = CEPH_MDS_SESSION_REJECTED;
4090fcff415cSYan, Zheng 		cleanup_session_requests(mdsc, session);
4091fcff415cSYan, Zheng 		remove_session_caps(session);
40920b98acd6SIlya Dryomov 		if (blocklisted)
40930b98acd6SIlya Dryomov 			mdsc->fsc->blocklisted = true;
4094fcff415cSYan, Zheng 		wake = 2; /* for good measure */
4095fcff415cSYan, Zheng 		break;
4096fcff415cSYan, Zheng 
40972f2dc053SSage Weil 	default:
40982f2dc053SSage Weil 		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
40992f2dc053SSage Weil 		WARN_ON(1);
41002f2dc053SSage Weil 	}
41012f2dc053SSage Weil 
41022f2dc053SSage Weil 	mutex_unlock(&session->s_mutex);
41032f2dc053SSage Weil 	if (wake) {
41042f2dc053SSage Weil 		mutex_lock(&mdsc->mutex);
41052f2dc053SSage Weil 		__wake_requests(mdsc, &session->s_waiting);
4106656e4382SYan, Zheng 		if (wake == 2)
4107656e4382SYan, Zheng 			kick_requests(mdsc, mds);
41082f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
41092f2dc053SSage Weil 	}
41100a07fc8cSYan, Zheng 	if (op == CEPH_SESSION_CLOSE)
41110a07fc8cSYan, Zheng 		ceph_put_mds_session(session);
41122f2dc053SSage Weil 	return;
41132f2dc053SSage Weil 
41142f2dc053SSage Weil bad:
41152f2dc053SSage Weil 	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
41162f2dc053SSage Weil 	       (int)msg->front.iov_len);
41179ec7cab1SSage Weil 	ceph_msg_dump(msg);
41182f2dc053SSage Weil 	return;
41192f2dc053SSage Weil }
41202f2dc053SSage Weil 
ceph_mdsc_release_dir_caps(struct ceph_mds_request * req)4121a25949b9SJeff Layton void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
4122a25949b9SJeff Layton {
4123a25949b9SJeff Layton 	int dcaps;
4124a25949b9SJeff Layton 
4125a25949b9SJeff Layton 	dcaps = xchg(&req->r_dir_caps, 0);
4126a25949b9SJeff Layton 	if (dcaps) {
4127a25949b9SJeff Layton 		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
4128a25949b9SJeff Layton 		ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
4129a25949b9SJeff Layton 	}
4130a25949b9SJeff Layton }
4131a25949b9SJeff Layton 
ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request * req)4132e64f44a8SXiubo Li void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
4133e64f44a8SXiubo Li {
4134e64f44a8SXiubo Li 	int dcaps;
4135e64f44a8SXiubo Li 
4136e64f44a8SXiubo Li 	dcaps = xchg(&req->r_dir_caps, 0);
4137e64f44a8SXiubo Li 	if (dcaps) {
4138e64f44a8SXiubo Li 		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
4139e64f44a8SXiubo Li 		ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
4140e64f44a8SXiubo Li 						dcaps);
4141e64f44a8SXiubo Li 	}
4142e64f44a8SXiubo Li }
4143e64f44a8SXiubo Li 
41442f2dc053SSage Weil /*
41452f2dc053SSage Weil  * called under session->mutex.
41462f2dc053SSage Weil  */
replay_unsafe_requests(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)41472f2dc053SSage Weil static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
41482f2dc053SSage Weil 				   struct ceph_mds_session *session)
41492f2dc053SSage Weil {
41502f2dc053SSage Weil 	struct ceph_mds_request *req, *nreq;
41513de22be6SYan, Zheng 	struct rb_node *p;
41522f2dc053SSage Weil 
41532f2dc053SSage Weil 	dout("replay_unsafe_requests mds%d\n", session->s_mds);
41542f2dc053SSage Weil 
41552f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
41569cf54563SXiubo Li 	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
4157396bd62cSJeff Layton 		__send_request(session, req, true);
41583de22be6SYan, Zheng 
41593de22be6SYan, Zheng 	/*
41603de22be6SYan, Zheng 	 * also re-send old requests when MDS enters reconnect stage. So that MDS
41613de22be6SYan, Zheng 	 * can process completed request in clientreplay stage.
41623de22be6SYan, Zheng 	 */
41633de22be6SYan, Zheng 	p = rb_first(&mdsc->request_tree);
41643de22be6SYan, Zheng 	while (p) {
41653de22be6SYan, Zheng 		req = rb_entry(p, struct ceph_mds_request, r_node);
41663de22be6SYan, Zheng 		p = rb_next(p);
4167bc2de10dSJeff Layton 		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
41683de22be6SYan, Zheng 			continue;
41693de22be6SYan, Zheng 		if (req->r_attempts == 0)
41703de22be6SYan, Zheng 			continue; /* only old requests */
4171a25949b9SJeff Layton 		if (!req->r_session)
4172a25949b9SJeff Layton 			continue;
4173a25949b9SJeff Layton 		if (req->r_session->s_mds != session->s_mds)
4174a25949b9SJeff Layton 			continue;
4175a25949b9SJeff Layton 
4176e64f44a8SXiubo Li 		ceph_mdsc_release_dir_caps_no_check(req);
4177a25949b9SJeff Layton 
4178396bd62cSJeff Layton 		__send_request(session, req, true);
41793de22be6SYan, Zheng 	}
41802f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
41812f2dc053SSage Weil }
41822f2dc053SSage Weil 
send_reconnect_partial(struct ceph_reconnect_state * recon_state)418381c5a148SYan, Zheng static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
418481c5a148SYan, Zheng {
418581c5a148SYan, Zheng 	struct ceph_msg *reply;
418681c5a148SYan, Zheng 	struct ceph_pagelist *_pagelist;
418781c5a148SYan, Zheng 	struct page *page;
418881c5a148SYan, Zheng 	__le32 *addr;
418981c5a148SYan, Zheng 	int err = -ENOMEM;
419081c5a148SYan, Zheng 
419181c5a148SYan, Zheng 	if (!recon_state->allow_multi)
419281c5a148SYan, Zheng 		return -ENOSPC;
419381c5a148SYan, Zheng 
419481c5a148SYan, Zheng 	/* can't handle message that contains both caps and realm */
419581c5a148SYan, Zheng 	BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
419681c5a148SYan, Zheng 
419781c5a148SYan, Zheng 	/* pre-allocate new pagelist */
419881c5a148SYan, Zheng 	_pagelist = ceph_pagelist_alloc(GFP_NOFS);
419981c5a148SYan, Zheng 	if (!_pagelist)
420081c5a148SYan, Zheng 		return -ENOMEM;
420181c5a148SYan, Zheng 
420281c5a148SYan, Zheng 	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
420381c5a148SYan, Zheng 	if (!reply)
420481c5a148SYan, Zheng 		goto fail_msg;
420581c5a148SYan, Zheng 
420681c5a148SYan, Zheng 	/* placeholder for nr_caps */
420781c5a148SYan, Zheng 	err = ceph_pagelist_encode_32(_pagelist, 0);
420881c5a148SYan, Zheng 	if (err < 0)
420981c5a148SYan, Zheng 		goto fail;
421081c5a148SYan, Zheng 
421181c5a148SYan, Zheng 	if (recon_state->nr_caps) {
421281c5a148SYan, Zheng 		/* currently encoding caps */
421381c5a148SYan, Zheng 		err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
421481c5a148SYan, Zheng 		if (err)
421581c5a148SYan, Zheng 			goto fail;
421681c5a148SYan, Zheng 	} else {
421781c5a148SYan, Zheng 		/* placeholder for nr_realms (currently encoding relams) */
421881c5a148SYan, Zheng 		err = ceph_pagelist_encode_32(_pagelist, 0);
421981c5a148SYan, Zheng 		if (err < 0)
422081c5a148SYan, Zheng 			goto fail;
422181c5a148SYan, Zheng 	}
422281c5a148SYan, Zheng 
422381c5a148SYan, Zheng 	err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
422481c5a148SYan, Zheng 	if (err)
422581c5a148SYan, Zheng 		goto fail;
422681c5a148SYan, Zheng 
422781c5a148SYan, Zheng 	page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
422881c5a148SYan, Zheng 	addr = kmap_atomic(page);
422981c5a148SYan, Zheng 	if (recon_state->nr_caps) {
423081c5a148SYan, Zheng 		/* currently encoding caps */
423181c5a148SYan, Zheng 		*addr = cpu_to_le32(recon_state->nr_caps);
423281c5a148SYan, Zheng 	} else {
423381c5a148SYan, Zheng 		/* currently encoding relams */
423481c5a148SYan, Zheng 		*(addr + 1) = cpu_to_le32(recon_state->nr_realms);
423581c5a148SYan, Zheng 	}
423681c5a148SYan, Zheng 	kunmap_atomic(addr);
423781c5a148SYan, Zheng 
423881c5a148SYan, Zheng 	reply->hdr.version = cpu_to_le16(5);
423981c5a148SYan, Zheng 	reply->hdr.compat_version = cpu_to_le16(4);
424081c5a148SYan, Zheng 
424181c5a148SYan, Zheng 	reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
424281c5a148SYan, Zheng 	ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
424381c5a148SYan, Zheng 
424481c5a148SYan, Zheng 	ceph_con_send(&recon_state->session->s_con, reply);
424581c5a148SYan, Zheng 	ceph_pagelist_release(recon_state->pagelist);
424681c5a148SYan, Zheng 
424781c5a148SYan, Zheng 	recon_state->pagelist = _pagelist;
424881c5a148SYan, Zheng 	recon_state->nr_caps = 0;
424981c5a148SYan, Zheng 	recon_state->nr_realms = 0;
425081c5a148SYan, Zheng 	recon_state->msg_version = 5;
425181c5a148SYan, Zheng 	return 0;
425281c5a148SYan, Zheng fail:
425381c5a148SYan, Zheng 	ceph_msg_put(reply);
425481c5a148SYan, Zheng fail_msg:
425581c5a148SYan, Zheng 	ceph_pagelist_release(_pagelist);
425681c5a148SYan, Zheng 	return err;
425781c5a148SYan, Zheng }
425881c5a148SYan, Zheng 
d_find_primary(struct inode * inode)4259a33f6432SYan, Zheng static struct dentry* d_find_primary(struct inode *inode)
4260a33f6432SYan, Zheng {
4261a33f6432SYan, Zheng 	struct dentry *alias, *dn = NULL;
4262a33f6432SYan, Zheng 
4263a33f6432SYan, Zheng 	if (hlist_empty(&inode->i_dentry))
4264a33f6432SYan, Zheng 		return NULL;
4265a33f6432SYan, Zheng 
4266a33f6432SYan, Zheng 	spin_lock(&inode->i_lock);
4267a33f6432SYan, Zheng 	if (hlist_empty(&inode->i_dentry))
4268a33f6432SYan, Zheng 		goto out_unlock;
4269a33f6432SYan, Zheng 
4270a33f6432SYan, Zheng 	if (S_ISDIR(inode->i_mode)) {
4271a33f6432SYan, Zheng 		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
4272a33f6432SYan, Zheng 		if (!IS_ROOT(alias))
4273a33f6432SYan, Zheng 			dn = dget(alias);
4274a33f6432SYan, Zheng 		goto out_unlock;
4275a33f6432SYan, Zheng 	}
4276a33f6432SYan, Zheng 
4277a33f6432SYan, Zheng 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
4278a33f6432SYan, Zheng 		spin_lock(&alias->d_lock);
4279a33f6432SYan, Zheng 		if (!d_unhashed(alias) &&
4280a33f6432SYan, Zheng 		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
4281a33f6432SYan, Zheng 			dn = dget_dlock(alias);
4282a33f6432SYan, Zheng 		}
4283a33f6432SYan, Zheng 		spin_unlock(&alias->d_lock);
4284a33f6432SYan, Zheng 		if (dn)
4285a33f6432SYan, Zheng 			break;
4286a33f6432SYan, Zheng 	}
4287a33f6432SYan, Zheng out_unlock:
4288a33f6432SYan, Zheng 	spin_unlock(&inode->i_lock);
4289a33f6432SYan, Zheng 	return dn;
4290a33f6432SYan, Zheng }
4291a33f6432SYan, Zheng 
42922f2dc053SSage Weil /*
42932f2dc053SSage Weil  * Encode information about a cap for a reconnect with the MDS.
42942f2dc053SSage Weil  */
reconnect_caps_cb(struct inode * inode,int mds,void * arg)4295aaf67de7SXiubo Li static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
42962f2dc053SSage Weil {
42972e2023e9SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
429820cb34aeSSage Weil 	union {
429920cb34aeSSage Weil 		struct ceph_mds_cap_reconnect v2;
430020cb34aeSSage Weil 		struct ceph_mds_cap_reconnect_v1 v1;
430120cb34aeSSage Weil 	} rec;
4302aaf67de7SXiubo Li 	struct ceph_inode_info *ci = ceph_inode(inode);
430320cb34aeSSage Weil 	struct ceph_reconnect_state *recon_state = arg;
430420cb34aeSSage Weil 	struct ceph_pagelist *pagelist = recon_state->pagelist;
4305a33f6432SYan, Zheng 	struct dentry *dentry;
4306aaf67de7SXiubo Li 	struct ceph_cap *cap;
4307a33f6432SYan, Zheng 	char *path;
43089aaa7eb0SXiubo Li 	int pathlen = 0, err;
4309a33f6432SYan, Zheng 	u64 pathbase;
43103469ed0dSYan, Zheng 	u64 snap_follows;
43112f2dc053SSage Weil 
4312a33f6432SYan, Zheng 	dentry = d_find_primary(inode);
4313a33f6432SYan, Zheng 	if (dentry) {
4314a33f6432SYan, Zheng 		/* set pathbase to parent dir when msg_version >= 2 */
43152e2023e9SXiubo Li 		path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
4316a33f6432SYan, Zheng 					    recon_state->msg_version >= 2);
4317a33f6432SYan, Zheng 		dput(dentry);
4318a33f6432SYan, Zheng 		if (IS_ERR(path)) {
4319a33f6432SYan, Zheng 			err = PTR_ERR(path);
4320a33f6432SYan, Zheng 			goto out_err;
4321a33f6432SYan, Zheng 		}
4322a33f6432SYan, Zheng 	} else {
4323a33f6432SYan, Zheng 		path = NULL;
4324a33f6432SYan, Zheng 		pathbase = 0;
4325a33f6432SYan, Zheng 	}
4326a33f6432SYan, Zheng 
4327be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
4328aaf67de7SXiubo Li 	cap = __get_cap_for_mds(ci, mds);
4329aaf67de7SXiubo Li 	if (!cap) {
4330aaf67de7SXiubo Li 		spin_unlock(&ci->i_ceph_lock);
43319aaa7eb0SXiubo Li 		err = 0;
4332aaf67de7SXiubo Li 		goto out_err;
4333aaf67de7SXiubo Li 	}
4334aaf67de7SXiubo Li 	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
4335aaf67de7SXiubo Li 	     inode, ceph_vinop(inode), cap, cap->cap_id,
4336aaf67de7SXiubo Li 	     ceph_cap_string(cap->issued));
4337aaf67de7SXiubo Li 
43382f2dc053SSage Weil 	cap->seq = 0;        /* reset cap seq */
43392f2dc053SSage Weil 	cap->issue_seq = 0;  /* and issue_seq */
4340667ca05cSYan, Zheng 	cap->mseq = 0;       /* and migrate_seq */
434152d60f8eSJeff Layton 	cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
434220cb34aeSSage Weil 
4343a25949b9SJeff Layton 	/* These are lost when the session goes away */
4344785892feSJeff Layton 	if (S_ISDIR(inode->i_mode)) {
4345785892feSJeff Layton 		if (cap->issued & CEPH_CAP_DIR_CREATE) {
4346785892feSJeff Layton 			ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
4347785892feSJeff Layton 			memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
4348785892feSJeff Layton 		}
4349a25949b9SJeff Layton 		cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
4350785892feSJeff Layton 	}
4351a25949b9SJeff Layton 
4352121f22a1SYan, Zheng 	if (recon_state->msg_version >= 2) {
435320cb34aeSSage Weil 		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
435420cb34aeSSage Weil 		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
435520cb34aeSSage Weil 		rec.v2.issued = cpu_to_le32(cap->issued);
435620cb34aeSSage Weil 		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
4357a33f6432SYan, Zheng 		rec.v2.pathbase = cpu_to_le64(pathbase);
4358ec1dff25SJeff Layton 		rec.v2.flock_len = (__force __le32)
4359ec1dff25SJeff Layton 			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
436020cb34aeSSage Weil 	} else {
436120cb34aeSSage Weil 		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
436220cb34aeSSage Weil 		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
436320cb34aeSSage Weil 		rec.v1.issued = cpu_to_le32(cap->issued);
43642d6795fbSJeff Layton 		rec.v1.size = cpu_to_le64(i_size_read(inode));
43659bbeab41SArnd Bergmann 		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
43669bbeab41SArnd Bergmann 		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
436720cb34aeSSage Weil 		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
4368a33f6432SYan, Zheng 		rec.v1.pathbase = cpu_to_le64(pathbase);
436920cb34aeSSage Weil 	}
43703469ed0dSYan, Zheng 
43713469ed0dSYan, Zheng 	if (list_empty(&ci->i_cap_snaps)) {
437292776fd2SYan, Zheng 		snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
43733469ed0dSYan, Zheng 	} else {
43743469ed0dSYan, Zheng 		struct ceph_cap_snap *capsnap =
43753469ed0dSYan, Zheng 			list_first_entry(&ci->i_cap_snaps,
43763469ed0dSYan, Zheng 					 struct ceph_cap_snap, ci_item);
43773469ed0dSYan, Zheng 		snap_follows = capsnap->follows;
437893cea5beSSage Weil 	}
4379be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
43802f2dc053SSage Weil 
4381121f22a1SYan, Zheng 	if (recon_state->msg_version >= 2) {
438240819f6fSGreg Farnum 		int num_fcntl_locks, num_flock_locks;
43834deb14a2SYan, Zheng 		struct ceph_filelock *flocks = NULL;
438481c5a148SYan, Zheng 		size_t struct_len, total_len = sizeof(u64);
4385121f22a1SYan, Zheng 		u8 struct_v = 0;
438640819f6fSGreg Farnum 
438739be95e9SJim Schutt encode_again:
4388b3f8d68fSYan, Zheng 		if (rec.v2.flock_len) {
438939be95e9SJim Schutt 			ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
4390b3f8d68fSYan, Zheng 		} else {
4391b3f8d68fSYan, Zheng 			num_fcntl_locks = 0;
4392b3f8d68fSYan, Zheng 			num_flock_locks = 0;
4393b3f8d68fSYan, Zheng 		}
43944deb14a2SYan, Zheng 		if (num_fcntl_locks + num_flock_locks > 0) {
43956da2ec56SKees Cook 			flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
43966da2ec56SKees Cook 					       sizeof(struct ceph_filelock),
43976da2ec56SKees Cook 					       GFP_NOFS);
439839be95e9SJim Schutt 			if (!flocks) {
439939be95e9SJim Schutt 				err = -ENOMEM;
44005ccedf1cSYan, Zheng 				goto out_err;
440139be95e9SJim Schutt 			}
440239be95e9SJim Schutt 			err = ceph_encode_locks_to_buffer(inode, flocks,
440340819f6fSGreg Farnum 							  num_fcntl_locks,
440440819f6fSGreg Farnum 							  num_flock_locks);
440539be95e9SJim Schutt 			if (err) {
440639be95e9SJim Schutt 				kfree(flocks);
44074deb14a2SYan, Zheng 				flocks = NULL;
440839be95e9SJim Schutt 				if (err == -ENOSPC)
440939be95e9SJim Schutt 					goto encode_again;
44105ccedf1cSYan, Zheng 				goto out_err;
4411fca4451aSGreg Farnum 			}
44124deb14a2SYan, Zheng 		} else {
44134deb14a2SYan, Zheng 			kfree(flocks);
44144deb14a2SYan, Zheng 			flocks = NULL;
44154deb14a2SYan, Zheng 		}
4416121f22a1SYan, Zheng 
4417121f22a1SYan, Zheng 		if (recon_state->msg_version >= 3) {
4418121f22a1SYan, Zheng 			/* version, compat_version and struct_len */
441981c5a148SYan, Zheng 			total_len += 2 * sizeof(u8) + sizeof(u32);
44203469ed0dSYan, Zheng 			struct_v = 2;
4421121f22a1SYan, Zheng 		}
442239be95e9SJim Schutt 		/*
442339be95e9SJim Schutt 		 * number of encoded locks is stable, so copy to pagelist
442439be95e9SJim Schutt 		 */
4425121f22a1SYan, Zheng 		struct_len = 2 * sizeof(u32) +
442639be95e9SJim Schutt 			    (num_fcntl_locks + num_flock_locks) *
4427121f22a1SYan, Zheng 			    sizeof(struct ceph_filelock);
4428121f22a1SYan, Zheng 		rec.v2.flock_len = cpu_to_le32(struct_len);
4429121f22a1SYan, Zheng 
4430a33f6432SYan, Zheng 		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
4431121f22a1SYan, Zheng 
44323469ed0dSYan, Zheng 		if (struct_v >= 2)
44333469ed0dSYan, Zheng 			struct_len += sizeof(u64); /* snap_follows */
44343469ed0dSYan, Zheng 
4435121f22a1SYan, Zheng 		total_len += struct_len;
443681c5a148SYan, Zheng 
443781c5a148SYan, Zheng 		if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
443881c5a148SYan, Zheng 			err = send_reconnect_partial(recon_state);
443981c5a148SYan, Zheng 			if (err)
444081c5a148SYan, Zheng 				goto out_freeflocks;
444181c5a148SYan, Zheng 			pagelist = recon_state->pagelist;
44425ccedf1cSYan, Zheng 		}
4443121f22a1SYan, Zheng 
444481c5a148SYan, Zheng 		err = ceph_pagelist_reserve(pagelist, total_len);
444581c5a148SYan, Zheng 		if (err)
444681c5a148SYan, Zheng 			goto out_freeflocks;
444781c5a148SYan, Zheng 
444881c5a148SYan, Zheng 		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
4449121f22a1SYan, Zheng 		if (recon_state->msg_version >= 3) {
4450121f22a1SYan, Zheng 			ceph_pagelist_encode_8(pagelist, struct_v);
4451121f22a1SYan, Zheng 			ceph_pagelist_encode_8(pagelist, 1);
4452121f22a1SYan, Zheng 			ceph_pagelist_encode_32(pagelist, struct_len);
4453121f22a1SYan, Zheng 		}
4454a33f6432SYan, Zheng 		ceph_pagelist_encode_string(pagelist, path, pathlen);
4455121f22a1SYan, Zheng 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
4456121f22a1SYan, Zheng 		ceph_locks_to_pagelist(flocks, pagelist,
44575ccedf1cSYan, Zheng 				       num_fcntl_locks, num_flock_locks);
44583469ed0dSYan, Zheng 		if (struct_v >= 2)
44593469ed0dSYan, Zheng 			ceph_pagelist_encode_64(pagelist, snap_follows);
446081c5a148SYan, Zheng out_freeflocks:
446139be95e9SJim Schutt 		kfree(flocks);
44623612abbdSSage Weil 	} else {
44635ccedf1cSYan, Zheng 		err = ceph_pagelist_reserve(pagelist,
446481c5a148SYan, Zheng 					    sizeof(u64) + sizeof(u32) +
446581c5a148SYan, Zheng 					    pathlen + sizeof(rec.v1));
4466a33f6432SYan, Zheng 		if (err)
4467a33f6432SYan, Zheng 			goto out_err;
44685ccedf1cSYan, Zheng 
446981c5a148SYan, Zheng 		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
4470121f22a1SYan, Zheng 		ceph_pagelist_encode_string(pagelist, path, pathlen);
4471121f22a1SYan, Zheng 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
447240819f6fSGreg Farnum 	}
447344c99757SYan, Zheng 
44745ccedf1cSYan, Zheng out_err:
4475a33f6432SYan, Zheng 	ceph_mdsc_free_path(path, pathlen);
4476a33f6432SYan, Zheng 	if (!err)
447781c5a148SYan, Zheng 		recon_state->nr_caps++;
447881c5a148SYan, Zheng 	return err;
447981c5a148SYan, Zheng }
448081c5a148SYan, Zheng 
encode_snap_realms(struct ceph_mds_client * mdsc,struct ceph_reconnect_state * recon_state)448181c5a148SYan, Zheng static int encode_snap_realms(struct ceph_mds_client *mdsc,
448281c5a148SYan, Zheng 			      struct ceph_reconnect_state *recon_state)
448381c5a148SYan, Zheng {
448481c5a148SYan, Zheng 	struct rb_node *p;
448581c5a148SYan, Zheng 	struct ceph_pagelist *pagelist = recon_state->pagelist;
448681c5a148SYan, Zheng 	int err = 0;
448781c5a148SYan, Zheng 
448881c5a148SYan, Zheng 	if (recon_state->msg_version >= 4) {
448981c5a148SYan, Zheng 		err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
449081c5a148SYan, Zheng 		if (err < 0)
449181c5a148SYan, Zheng 			goto fail;
449281c5a148SYan, Zheng 	}
449381c5a148SYan, Zheng 
449481c5a148SYan, Zheng 	/*
449581c5a148SYan, Zheng 	 * snaprealms.  we provide mds with the ino, seq (version), and
449681c5a148SYan, Zheng 	 * parent for all of our realms.  If the mds has any newer info,
449781c5a148SYan, Zheng 	 * it will tell us.
449881c5a148SYan, Zheng 	 */
449981c5a148SYan, Zheng 	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
450081c5a148SYan, Zheng 		struct ceph_snap_realm *realm =
450181c5a148SYan, Zheng 		       rb_entry(p, struct ceph_snap_realm, node);
450281c5a148SYan, Zheng 		struct ceph_mds_snaprealm_reconnect sr_rec;
450381c5a148SYan, Zheng 
450481c5a148SYan, Zheng 		if (recon_state->msg_version >= 4) {
450581c5a148SYan, Zheng 			size_t need = sizeof(u8) * 2 + sizeof(u32) +
450681c5a148SYan, Zheng 				      sizeof(sr_rec);
450781c5a148SYan, Zheng 
450881c5a148SYan, Zheng 			if (pagelist->length + need > RECONNECT_MAX_SIZE) {
450981c5a148SYan, Zheng 				err = send_reconnect_partial(recon_state);
451081c5a148SYan, Zheng 				if (err)
451181c5a148SYan, Zheng 					goto fail;
451281c5a148SYan, Zheng 				pagelist = recon_state->pagelist;
451381c5a148SYan, Zheng 			}
451481c5a148SYan, Zheng 
451581c5a148SYan, Zheng 			err = ceph_pagelist_reserve(pagelist, need);
451681c5a148SYan, Zheng 			if (err)
451781c5a148SYan, Zheng 				goto fail;
451881c5a148SYan, Zheng 
451981c5a148SYan, Zheng 			ceph_pagelist_encode_8(pagelist, 1);
452081c5a148SYan, Zheng 			ceph_pagelist_encode_8(pagelist, 1);
452181c5a148SYan, Zheng 			ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
452281c5a148SYan, Zheng 		}
452381c5a148SYan, Zheng 
452481c5a148SYan, Zheng 		dout(" adding snap realm %llx seq %lld parent %llx\n",
452581c5a148SYan, Zheng 		     realm->ino, realm->seq, realm->parent_ino);
452681c5a148SYan, Zheng 		sr_rec.ino = cpu_to_le64(realm->ino);
452781c5a148SYan, Zheng 		sr_rec.seq = cpu_to_le64(realm->seq);
452881c5a148SYan, Zheng 		sr_rec.parent = cpu_to_le64(realm->parent_ino);
452981c5a148SYan, Zheng 
453081c5a148SYan, Zheng 		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
453181c5a148SYan, Zheng 		if (err)
453281c5a148SYan, Zheng 			goto fail;
453381c5a148SYan, Zheng 
453481c5a148SYan, Zheng 		recon_state->nr_realms++;
453581c5a148SYan, Zheng 	}
453681c5a148SYan, Zheng fail:
453793cea5beSSage Weil 	return err;
45382f2dc053SSage Weil }
45392f2dc053SSage Weil 
45402f2dc053SSage Weil 
45412f2dc053SSage Weil /*
45422f2dc053SSage Weil  * If an MDS fails and recovers, clients need to reconnect in order to
45432f2dc053SSage Weil  * reestablish shared state.  This includes all caps issued through
45442f2dc053SSage Weil  * this session _and_ the snap_realm hierarchy.  Because it's not
45452f2dc053SSage Weil  * clear which snap realms the mds cares about, we send everything we
45462f2dc053SSage Weil  * know about.. that ensures we'll then get any new info the
45472f2dc053SSage Weil  * recovering MDS might have.
45482f2dc053SSage Weil  *
45492f2dc053SSage Weil  * This is a relatively heavyweight operation, but it's rare.
45502f2dc053SSage Weil  */
send_mds_reconnect(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)455134b6c855SSage Weil static void send_mds_reconnect(struct ceph_mds_client *mdsc,
455234b6c855SSage Weil 			       struct ceph_mds_session *session)
45532f2dc053SSage Weil {
45542f2dc053SSage Weil 	struct ceph_msg *reply;
455534b6c855SSage Weil 	int mds = session->s_mds;
45569abf82b8SSage Weil 	int err = -ENOMEM;
455781c5a148SYan, Zheng 	struct ceph_reconnect_state recon_state = {
455881c5a148SYan, Zheng 		.session = session,
455981c5a148SYan, Zheng 	};
4560c8a96a31SJeff Layton 	LIST_HEAD(dispose);
45612f2dc053SSage Weil 
456234b6c855SSage Weil 	pr_info("mds%d reconnect start\n", mds);
45632f2dc053SSage Weil 
456481c5a148SYan, Zheng 	recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
456581c5a148SYan, Zheng 	if (!recon_state.pagelist)
456693cea5beSSage Weil 		goto fail_nopagelist;
456793cea5beSSage Weil 
45680d9c1ab3SIlya Dryomov 	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
4569a79832f2SSage Weil 	if (!reply)
457093cea5beSSage Weil 		goto fail_nomsg;
457193cea5beSSage Weil 
4572d4846487SJeff Layton 	xa_destroy(&session->s_delegated_inos);
4573d4846487SJeff Layton 
45742f2dc053SSage Weil 	mutex_lock(&session->s_mutex);
45752f2dc053SSage Weil 	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
45762f2dc053SSage Weil 	session->s_seq = 0;
45772f2dc053SSage Weil 
45782f2dc053SSage Weil 	dout("session %p state %s\n", session,
4579a687ecafSJohn Spray 	     ceph_session_state_name(session->s_state));
45802f2dc053SSage Weil 
458152d60f8eSJeff Layton 	atomic_inc(&session->s_cap_gen);
458299a9c273SYan, Zheng 
458399a9c273SYan, Zheng 	spin_lock(&session->s_cap_lock);
458403f4fcb0SYan, Zheng 	/* don't know if session is readonly */
458503f4fcb0SYan, Zheng 	session->s_readonly = 0;
458699a9c273SYan, Zheng 	/*
458799a9c273SYan, Zheng 	 * notify __ceph_remove_cap() that we are composing cap reconnect.
458899a9c273SYan, Zheng 	 * If a cap get released before being added to the cap reconnect,
458999a9c273SYan, Zheng 	 * __ceph_remove_cap() should skip queuing cap release.
459099a9c273SYan, Zheng 	 */
459199a9c273SYan, Zheng 	session->s_cap_reconnect = 1;
4592e01a5946SSage Weil 	/* drop old cap expires; we're about to reestablish that state */
4593c8a96a31SJeff Layton 	detach_cap_releases(session, &dispose);
4594c8a96a31SJeff Layton 	spin_unlock(&session->s_cap_lock);
4595c8a96a31SJeff Layton 	dispose_cap_releases(mdsc, &dispose);
4596e01a5946SSage Weil 
45975d23371fSYan, Zheng 	/* trim unused caps to reduce MDS's cache rejoin time */
4598c0bd50e2SYan, Zheng 	if (mdsc->fsc->sb->s_root)
45995d23371fSYan, Zheng 		shrink_dcache_parent(mdsc->fsc->sb->s_root);
46005d23371fSYan, Zheng 
46015d23371fSYan, Zheng 	ceph_con_close(&session->s_con);
46025d23371fSYan, Zheng 	ceph_con_open(&session->s_con,
46035d23371fSYan, Zheng 		      CEPH_ENTITY_TYPE_MDS, mds,
46045d23371fSYan, Zheng 		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
46055d23371fSYan, Zheng 
46065d23371fSYan, Zheng 	/* replay unsafe requests */
46075d23371fSYan, Zheng 	replay_unsafe_requests(mdsc, session);
46085d23371fSYan, Zheng 
460981c5a148SYan, Zheng 	ceph_early_kick_flushing_caps(mdsc, session);
461081c5a148SYan, Zheng 
46115d23371fSYan, Zheng 	down_read(&mdsc->snap_rwsem);
46125d23371fSYan, Zheng 
461381c5a148SYan, Zheng 	/* placeholder for nr_caps */
461481c5a148SYan, Zheng 	err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
461593cea5beSSage Weil 	if (err)
461693cea5beSSage Weil 		goto fail;
461720cb34aeSSage Weil 
461881c5a148SYan, Zheng 	if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
4619121f22a1SYan, Zheng 		recon_state.msg_version = 3;
462081c5a148SYan, Zheng 		recon_state.allow_multi = true;
462181c5a148SYan, Zheng 	} else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
462281c5a148SYan, Zheng 		recon_state.msg_version = 3;
462381c5a148SYan, Zheng 	} else {
462423c625ceSIlya Dryomov 		recon_state.msg_version = 2;
462581c5a148SYan, Zheng 	}
462681c5a148SYan, Zheng 	/* trsaverse this session's caps */
4627a25949b9SJeff Layton 	err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
46282f2dc053SSage Weil 
462999a9c273SYan, Zheng 	spin_lock(&session->s_cap_lock);
463099a9c273SYan, Zheng 	session->s_cap_reconnect = 0;
463199a9c273SYan, Zheng 	spin_unlock(&session->s_cap_lock);
463299a9c273SYan, Zheng 
463381c5a148SYan, Zheng 	if (err < 0)
463481c5a148SYan, Zheng 		goto fail;
46352f2dc053SSage Weil 
463681c5a148SYan, Zheng 	/* check if all realms can be encoded into current message */
463781c5a148SYan, Zheng 	if (mdsc->num_snap_realms) {
463881c5a148SYan, Zheng 		size_t total_len =
463981c5a148SYan, Zheng 			recon_state.pagelist->length +
464081c5a148SYan, Zheng 			mdsc->num_snap_realms *
464181c5a148SYan, Zheng 			sizeof(struct ceph_mds_snaprealm_reconnect);
464281c5a148SYan, Zheng 		if (recon_state.msg_version >= 4) {
464381c5a148SYan, Zheng 			/* number of realms */
464481c5a148SYan, Zheng 			total_len += sizeof(u32);
464581c5a148SYan, Zheng 			/* version, compat_version and struct_len */
464681c5a148SYan, Zheng 			total_len += mdsc->num_snap_realms *
464781c5a148SYan, Zheng 				     (2 * sizeof(u8) + sizeof(u32));
464881c5a148SYan, Zheng 		}
464981c5a148SYan, Zheng 		if (total_len > RECONNECT_MAX_SIZE) {
465081c5a148SYan, Zheng 			if (!recon_state.allow_multi) {
465181c5a148SYan, Zheng 				err = -ENOSPC;
465281c5a148SYan, Zheng 				goto fail;
465381c5a148SYan, Zheng 			}
465481c5a148SYan, Zheng 			if (recon_state.nr_caps) {
465581c5a148SYan, Zheng 				err = send_reconnect_partial(&recon_state);
465693cea5beSSage Weil 				if (err)
465793cea5beSSage Weil 					goto fail;
46582f2dc053SSage Weil 			}
465981c5a148SYan, Zheng 			recon_state.msg_version = 5;
466081c5a148SYan, Zheng 		}
466181c5a148SYan, Zheng 	}
46622f2dc053SSage Weil 
466381c5a148SYan, Zheng 	err = encode_snap_realms(mdsc, &recon_state);
466481c5a148SYan, Zheng 	if (err < 0)
466581c5a148SYan, Zheng 		goto fail;
466644c99757SYan, Zheng 
466781c5a148SYan, Zheng 	if (recon_state.msg_version >= 5) {
466881c5a148SYan, Zheng 		err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
466981c5a148SYan, Zheng 		if (err < 0)
467081c5a148SYan, Zheng 			goto fail;
467181c5a148SYan, Zheng 	}
467281c5a148SYan, Zheng 
467381c5a148SYan, Zheng 	if (recon_state.nr_caps || recon_state.nr_realms) {
467481c5a148SYan, Zheng 		struct page *page =
467581c5a148SYan, Zheng 			list_first_entry(&recon_state.pagelist->head,
467644c99757SYan, Zheng 					struct page, lru);
467744c99757SYan, Zheng 		__le32 *addr = kmap_atomic(page);
467881c5a148SYan, Zheng 		if (recon_state.nr_caps) {
467981c5a148SYan, Zheng 			WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
468044c99757SYan, Zheng 			*addr = cpu_to_le32(recon_state.nr_caps);
468181c5a148SYan, Zheng 		} else if (recon_state.msg_version >= 4) {
468281c5a148SYan, Zheng 			*(addr + 1) = cpu_to_le32(recon_state.nr_realms);
468381c5a148SYan, Zheng 		}
468444c99757SYan, Zheng 		kunmap_atomic(addr);
468544c99757SYan, Zheng 	}
468644c99757SYan, Zheng 
468781c5a148SYan, Zheng 	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
468881c5a148SYan, Zheng 	if (recon_state.msg_version >= 4)
468981c5a148SYan, Zheng 		reply->hdr.compat_version = cpu_to_le16(4);
4690e548e9b9SYan, Zheng 
469181c5a148SYan, Zheng 	reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
469281c5a148SYan, Zheng 	ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
4693e548e9b9SYan, Zheng 
46942f2dc053SSage Weil 	ceph_con_send(&session->s_con, reply);
46952f2dc053SSage Weil 
46962f2dc053SSage Weil 	mutex_unlock(&session->s_mutex);
46979abf82b8SSage Weil 
46989abf82b8SSage Weil 	mutex_lock(&mdsc->mutex);
46999abf82b8SSage Weil 	__wake_requests(mdsc, &session->s_waiting);
47009abf82b8SSage Weil 	mutex_unlock(&mdsc->mutex);
47019abf82b8SSage Weil 
47029abf82b8SSage Weil 	up_read(&mdsc->snap_rwsem);
470381c5a148SYan, Zheng 	ceph_pagelist_release(recon_state.pagelist);
47042f2dc053SSage Weil 	return;
47052f2dc053SSage Weil 
470693cea5beSSage Weil fail:
47072f2dc053SSage Weil 	ceph_msg_put(reply);
47089abf82b8SSage Weil 	up_read(&mdsc->snap_rwsem);
47099abf82b8SSage Weil 	mutex_unlock(&session->s_mutex);
471093cea5beSSage Weil fail_nomsg:
471181c5a148SYan, Zheng 	ceph_pagelist_release(recon_state.pagelist);
471293cea5beSSage Weil fail_nopagelist:
47139abf82b8SSage Weil 	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
47149abf82b8SSage Weil 	return;
47152f2dc053SSage Weil }
47162f2dc053SSage Weil 
47172f2dc053SSage Weil 
47182f2dc053SSage Weil /*
47192f2dc053SSage Weil  * compare old and new mdsmaps, kicking requests
47202f2dc053SSage Weil  * and closing out old connections as necessary
47212f2dc053SSage Weil  *
47222f2dc053SSage Weil  * called under mdsc->mutex.
47232f2dc053SSage Weil  */
check_new_map(struct ceph_mds_client * mdsc,struct ceph_mdsmap * newmap,struct ceph_mdsmap * oldmap)47242f2dc053SSage Weil static void check_new_map(struct ceph_mds_client *mdsc,
47252f2dc053SSage Weil 			  struct ceph_mdsmap *newmap,
47262f2dc053SSage Weil 			  struct ceph_mdsmap *oldmap)
47272f2dc053SSage Weil {
4728d517b398SXiubo Li 	int i, j, err;
47292f2dc053SSage Weil 	int oldstate, newstate;
47302f2dc053SSage Weil 	struct ceph_mds_session *s;
4731d517b398SXiubo Li 	unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
47322f2dc053SSage Weil 
47332f2dc053SSage Weil 	dout("check_new_map new %u old %u\n",
47342f2dc053SSage Weil 	     newmap->m_epoch, oldmap->m_epoch);
47352f2dc053SSage Weil 
4736d517b398SXiubo Li 	if (newmap->m_info) {
4737d517b398SXiubo Li 		for (i = 0; i < newmap->possible_max_rank; i++) {
4738d517b398SXiubo Li 			for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
4739d517b398SXiubo Li 				set_bit(newmap->m_info[i].export_targets[j], targets);
4740d517b398SXiubo Li 		}
4741d517b398SXiubo Li 	}
4742d517b398SXiubo Li 
4743b38c9eb4SXiubo Li 	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
4744d37b1d99SMarkus Elfring 		if (!mdsc->sessions[i])
47452f2dc053SSage Weil 			continue;
47462f2dc053SSage Weil 		s = mdsc->sessions[i];
47472f2dc053SSage Weil 		oldstate = ceph_mdsmap_get_state(oldmap, i);
47482f2dc053SSage Weil 		newstate = ceph_mdsmap_get_state(newmap, i);
47492f2dc053SSage Weil 
47500deb01c9SSage Weil 		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
47512f2dc053SSage Weil 		     i, ceph_mds_state_name(oldstate),
47520deb01c9SSage Weil 		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
47532f2dc053SSage Weil 		     ceph_mds_state_name(newstate),
47540deb01c9SSage Weil 		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
4755a687ecafSJohn Spray 		     ceph_session_state_name(s->s_state));
47562f2dc053SSage Weil 
4757b38c9eb4SXiubo Li 		if (i >= newmap->possible_max_rank) {
47582827528dSYan, Zheng 			/* force close session for stopped mds */
47595b3248c6SXiubo Li 			ceph_get_mds_session(s);
47602827528dSYan, Zheng 			__unregister_session(mdsc, s);
47612827528dSYan, Zheng 			__wake_requests(mdsc, &s->s_waiting);
47622827528dSYan, Zheng 			mutex_unlock(&mdsc->mutex);
47632827528dSYan, Zheng 
47642827528dSYan, Zheng 			mutex_lock(&s->s_mutex);
47652827528dSYan, Zheng 			cleanup_session_requests(mdsc, s);
47662827528dSYan, Zheng 			remove_session_caps(s);
47672827528dSYan, Zheng 			mutex_unlock(&s->s_mutex);
47682827528dSYan, Zheng 
47692827528dSYan, Zheng 			ceph_put_mds_session(s);
47702827528dSYan, Zheng 
47712827528dSYan, Zheng 			mutex_lock(&mdsc->mutex);
47726f0f597bSYan, Zheng 			kick_requests(mdsc, i);
47736f0f597bSYan, Zheng 			continue;
47746f0f597bSYan, Zheng 		}
47756f0f597bSYan, Zheng 
47766f0f597bSYan, Zheng 		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
47776f0f597bSYan, Zheng 			   ceph_mdsmap_get_addr(newmap, i),
47786f0f597bSYan, Zheng 			   sizeof(struct ceph_entity_addr))) {
47792f2dc053SSage Weil 			/* just close it */
47802f2dc053SSage Weil 			mutex_unlock(&mdsc->mutex);
47812f2dc053SSage Weil 			mutex_lock(&s->s_mutex);
47822f2dc053SSage Weil 			mutex_lock(&mdsc->mutex);
47832f2dc053SSage Weil 			ceph_con_close(&s->s_con);
47842f2dc053SSage Weil 			mutex_unlock(&s->s_mutex);
47852f2dc053SSage Weil 			s->s_state = CEPH_MDS_SESSION_RESTARTING;
47862f2dc053SSage Weil 		} else if (oldstate == newstate) {
47872f2dc053SSage Weil 			continue;  /* nothing new with this mds */
47882f2dc053SSage Weil 		}
47892f2dc053SSage Weil 
47902f2dc053SSage Weil 		/*
47912f2dc053SSage Weil 		 * send reconnect?
47922f2dc053SSage Weil 		 */
47932f2dc053SSage Weil 		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
479434b6c855SSage Weil 		    newstate >= CEPH_MDS_STATE_RECONNECT) {
479534b6c855SSage Weil 			mutex_unlock(&mdsc->mutex);
4796d517b398SXiubo Li 			clear_bit(i, targets);
479734b6c855SSage Weil 			send_mds_reconnect(mdsc, s);
479834b6c855SSage Weil 			mutex_lock(&mdsc->mutex);
479934b6c855SSage Weil 		}
48002f2dc053SSage Weil 
48012f2dc053SSage Weil 		/*
480229790f26SSage Weil 		 * kick request on any mds that has gone active.
48032f2dc053SSage Weil 		 */
48042f2dc053SSage Weil 		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
48052f2dc053SSage Weil 		    newstate >= CEPH_MDS_STATE_ACTIVE) {
480629790f26SSage Weil 			if (oldstate != CEPH_MDS_STATE_CREATING &&
480729790f26SSage Weil 			    oldstate != CEPH_MDS_STATE_STARTING)
480829790f26SSage Weil 				pr_info("mds%d recovery completed\n", s->s_mds);
480929790f26SSage Weil 			kick_requests(mdsc, i);
4810ea8412b2SXiubo Li 			mutex_unlock(&mdsc->mutex);
4811829ad4dbSJeff Layton 			mutex_lock(&s->s_mutex);
4812ea8412b2SXiubo Li 			mutex_lock(&mdsc->mutex);
48132f2dc053SSage Weil 			ceph_kick_flushing_caps(mdsc, s);
4814829ad4dbSJeff Layton 			mutex_unlock(&s->s_mutex);
4815d2f8bb27SYan, Zheng 			wake_up_session_caps(s, RECONNECT);
48162f2dc053SSage Weil 		}
48172f2dc053SSage Weil 	}
4818cb170a22SSage Weil 
4819d517b398SXiubo Li 	/*
4820d517b398SXiubo Li 	 * Only open and reconnect sessions that don't exist yet.
4821d517b398SXiubo Li 	 */
4822d517b398SXiubo Li 	for (i = 0; i < newmap->possible_max_rank; i++) {
4823d517b398SXiubo Li 		/*
4824d517b398SXiubo Li 		 * In case the import MDS is crashed just after
4825d517b398SXiubo Li 		 * the EImportStart journal is flushed, so when
4826d517b398SXiubo Li 		 * a standby MDS takes over it and is replaying
4827d517b398SXiubo Li 		 * the EImportStart journal the new MDS daemon
4828d517b398SXiubo Li 		 * will wait the client to reconnect it, but the
4829d517b398SXiubo Li 		 * client may never register/open the session yet.
4830d517b398SXiubo Li 		 *
4831d517b398SXiubo Li 		 * Will try to reconnect that MDS daemon if the
4832d517b398SXiubo Li 		 * rank number is in the export targets array and
4833d517b398SXiubo Li 		 * is the up:reconnect state.
4834d517b398SXiubo Li 		 */
4835d517b398SXiubo Li 		newstate = ceph_mdsmap_get_state(newmap, i);
4836d517b398SXiubo Li 		if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
4837d517b398SXiubo Li 			continue;
4838d517b398SXiubo Li 
4839d517b398SXiubo Li 		/*
4840d517b398SXiubo Li 		 * The session maybe registered and opened by some
4841d517b398SXiubo Li 		 * requests which were choosing random MDSes during
4842d517b398SXiubo Li 		 * the mdsc->mutex's unlock/lock gap below in rare
4843d517b398SXiubo Li 		 * case. But the related MDS daemon will just queue
4844d517b398SXiubo Li 		 * that requests and be still waiting for the client's
4845d517b398SXiubo Li 		 * reconnection request in up:reconnect state.
4846d517b398SXiubo Li 		 */
4847d517b398SXiubo Li 		s = __ceph_lookup_mds_session(mdsc, i);
4848d517b398SXiubo Li 		if (likely(!s)) {
4849d517b398SXiubo Li 			s = __open_export_target_session(mdsc, i);
4850d517b398SXiubo Li 			if (IS_ERR(s)) {
4851d517b398SXiubo Li 				err = PTR_ERR(s);
4852d517b398SXiubo Li 				pr_err("failed to open export target session, err %d\n",
4853d517b398SXiubo Li 				       err);
4854d517b398SXiubo Li 				continue;
4855d517b398SXiubo Li 			}
4856d517b398SXiubo Li 		}
4857d517b398SXiubo Li 		dout("send reconnect to export target mds.%d\n", i);
4858d517b398SXiubo Li 		mutex_unlock(&mdsc->mutex);
4859d517b398SXiubo Li 		send_mds_reconnect(mdsc, s);
4860d517b398SXiubo Li 		ceph_put_mds_session(s);
4861d517b398SXiubo Li 		mutex_lock(&mdsc->mutex);
4862d517b398SXiubo Li 	}
4863d517b398SXiubo Li 
4864b38c9eb4SXiubo Li 	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
4865cb170a22SSage Weil 		s = mdsc->sessions[i];
4866cb170a22SSage Weil 		if (!s)
4867cb170a22SSage Weil 			continue;
4868cb170a22SSage Weil 		if (!ceph_mdsmap_is_laggy(newmap, i))
4869cb170a22SSage Weil 			continue;
4870cb170a22SSage Weil 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
4871cb170a22SSage Weil 		    s->s_state == CEPH_MDS_SESSION_HUNG ||
4872cb170a22SSage Weil 		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
4873cb170a22SSage Weil 			dout(" connecting to export targets of laggy mds%d\n",
4874cb170a22SSage Weil 			     i);
4875cb170a22SSage Weil 			__open_export_target_sessions(mdsc, s);
4876cb170a22SSage Weil 		}
4877cb170a22SSage Weil 	}
48782f2dc053SSage Weil }
48792f2dc053SSage Weil 
48802f2dc053SSage Weil 
48812f2dc053SSage Weil 
/*
 * leases
 */
48852f2dc053SSage Weil 
48862f2dc053SSage Weil /*
48872f2dc053SSage Weil  * caller must hold session s_mutex, dentry->d_lock
48882f2dc053SSage Weil  */
__ceph_mdsc_drop_dentry_lease(struct dentry * dentry)48892f2dc053SSage Weil void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
48902f2dc053SSage Weil {
48912f2dc053SSage Weil 	struct ceph_dentry_info *di = ceph_dentry(dentry);
48922f2dc053SSage Weil 
48932f2dc053SSage Weil 	ceph_put_mds_session(di->lease_session);
48942f2dc053SSage Weil 	di->lease_session = NULL;
48952f2dc053SSage Weil }
48962f2dc053SSage Weil 
handle_lease(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,struct ceph_msg * msg)48972600d2ddSSage Weil static void handle_lease(struct ceph_mds_client *mdsc,
48982600d2ddSSage Weil 			 struct ceph_mds_session *session,
48992600d2ddSSage Weil 			 struct ceph_msg *msg)
49002f2dc053SSage Weil {
49013d14c5d2SYehuda Sadeh 	struct super_block *sb = mdsc->fsc->sb;
49022f2dc053SSage Weil 	struct inode *inode;
49032f2dc053SSage Weil 	struct dentry *parent, *dentry;
49042f2dc053SSage Weil 	struct ceph_dentry_info *di;
49052600d2ddSSage Weil 	int mds = session->s_mds;
49062f2dc053SSage Weil 	struct ceph_mds_lease *h = msg->front.iov_base;
49071e5ea23dSSage Weil 	u32 seq;
49082f2dc053SSage Weil 	struct ceph_vino vino;
49092f2dc053SSage Weil 	struct qstr dname;
49102f2dc053SSage Weil 	int release = 0;
49112f2dc053SSage Weil 
49122f2dc053SSage Weil 	dout("handle_lease from mds%d\n", mds);
49132f2dc053SSage Weil 
4914e3dfcab2SXiubo Li 	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4915e3dfcab2SXiubo Li 		return;
4916e3dfcab2SXiubo Li 
49172f2dc053SSage Weil 	/* decode */
49182f2dc053SSage Weil 	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
49192f2dc053SSage Weil 		goto bad;
49202f2dc053SSage Weil 	vino.ino = le64_to_cpu(h->ino);
49212f2dc053SSage Weil 	vino.snap = CEPH_NOSNAP;
49221e5ea23dSSage Weil 	seq = le32_to_cpu(h->seq);
49230fcf6c02SYan, Zheng 	dname.len = get_unaligned_le32(h + 1);
49240fcf6c02SYan, Zheng 	if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
49252f2dc053SSage Weil 		goto bad;
49260fcf6c02SYan, Zheng 	dname.name = (void *)(h + 1) + sizeof(u32);
49272f2dc053SSage Weil 
49282f2dc053SSage Weil 	/* lookup inode */
49292f2dc053SSage Weil 	inode = ceph_find_inode(sb, vino);
49302f90b852SSage Weil 	dout("handle_lease %s, ino %llx %p %.*s\n",
49312f90b852SSage Weil 	     ceph_lease_op_name(h->action), vino.ino, inode,
49321e5ea23dSSage Weil 	     dname.len, dname.name);
49336cd3bcadSYan, Zheng 
49346cd3bcadSYan, Zheng 	mutex_lock(&session->s_mutex);
4935d37b1d99SMarkus Elfring 	if (!inode) {
49362f2dc053SSage Weil 		dout("handle_lease no inode %llx\n", vino.ino);
49372f2dc053SSage Weil 		goto release;
49382f2dc053SSage Weil 	}
49392f2dc053SSage Weil 
49402f2dc053SSage Weil 	/* dentry */
49412f2dc053SSage Weil 	parent = d_find_alias(inode);
49422f2dc053SSage Weil 	if (!parent) {
49432f2dc053SSage Weil 		dout("no parent dentry on inode %p\n", inode);
49442f2dc053SSage Weil 		WARN_ON(1);
49452f2dc053SSage Weil 		goto release;  /* hrm... */
49462f2dc053SSage Weil 	}
49478387ff25SLinus Torvalds 	dname.hash = full_name_hash(parent, dname.name, dname.len);
49482f2dc053SSage Weil 	dentry = d_lookup(parent, &dname);
49492f2dc053SSage Weil 	dput(parent);
49502f2dc053SSage Weil 	if (!dentry)
49512f2dc053SSage Weil 		goto release;
49522f2dc053SSage Weil 
49532f2dc053SSage Weil 	spin_lock(&dentry->d_lock);
49542f2dc053SSage Weil 	di = ceph_dentry(dentry);
49552f2dc053SSage Weil 	switch (h->action) {
49562f2dc053SSage Weil 	case CEPH_MDS_LEASE_REVOKE:
49573d8eb7a9SSage Weil 		if (di->lease_session == session) {
49581e5ea23dSSage Weil 			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
49592f2dc053SSage Weil 				h->seq = cpu_to_le32(di->lease_seq);
49602f2dc053SSage Weil 			__ceph_mdsc_drop_dentry_lease(dentry);
49612f2dc053SSage Weil 		}
49622f2dc053SSage Weil 		release = 1;
49632f2dc053SSage Weil 		break;
49642f2dc053SSage Weil 
49652f2dc053SSage Weil 	case CEPH_MDS_LEASE_RENEW:
49663d8eb7a9SSage Weil 		if (di->lease_session == session &&
496752d60f8eSJeff Layton 		    di->lease_gen == atomic_read(&session->s_cap_gen) &&
49682f2dc053SSage Weil 		    di->lease_renew_from &&
49692f2dc053SSage Weil 		    di->lease_renew_after == 0) {
49702f2dc053SSage Weil 			unsigned long duration =
49713563dbddSNicholas Mc Guire 				msecs_to_jiffies(le32_to_cpu(h->duration_ms));
49722f2dc053SSage Weil 
49731e5ea23dSSage Weil 			di->lease_seq = seq;
49749b16f03cSMiklos Szeredi 			di->time = di->lease_renew_from + duration;
49752f2dc053SSage Weil 			di->lease_renew_after = di->lease_renew_from +
49762f2dc053SSage Weil 				(duration >> 1);
49772f2dc053SSage Weil 			di->lease_renew_from = 0;
49782f2dc053SSage Weil 		}
49792f2dc053SSage Weil 		break;
49802f2dc053SSage Weil 	}
49812f2dc053SSage Weil 	spin_unlock(&dentry->d_lock);
49822f2dc053SSage Weil 	dput(dentry);
49832f2dc053SSage Weil 
49842f2dc053SSage Weil 	if (!release)
49852f2dc053SSage Weil 		goto out;
49862f2dc053SSage Weil 
49872f2dc053SSage Weil release:
49882f2dc053SSage Weil 	/* let's just reuse the same message */
49892f2dc053SSage Weil 	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
49902f2dc053SSage Weil 	ceph_msg_get(msg);
49912f2dc053SSage Weil 	ceph_con_send(&session->s_con, msg);
49922f2dc053SSage Weil 
49932f2dc053SSage Weil out:
49942f2dc053SSage Weil 	mutex_unlock(&session->s_mutex);
499523c2c76eSJeff Layton 	iput(inode);
4996e3dfcab2SXiubo Li 
4997e3dfcab2SXiubo Li 	ceph_dec_mds_stopping_blocker(mdsc);
49982f2dc053SSage Weil 	return;
49992f2dc053SSage Weil 
50002f2dc053SSage Weil bad:
5001e3dfcab2SXiubo Li 	ceph_dec_mds_stopping_blocker(mdsc);
5002e3dfcab2SXiubo Li 
50032f2dc053SSage Weil 	pr_err("corrupt lease message\n");
50049ec7cab1SSage Weil 	ceph_msg_dump(msg);
50052f2dc053SSage Weil }
50062f2dc053SSage Weil 
/*
 * Build a CEPH_MSG_CLIENT_LEASE message for @dentry and hand it to
 * ceph_con_send() on @session's connection.
 *
 * The message front is laid out as a struct ceph_mds_lease followed by a
 * little-endian u32 name length and then the dentry name bytes.  The
 * parent directory's ino/snap and the dentry name are read while holding
 * dentry->d_lock.
 *
 * @action and @seq are stored in the lease header as given (@seq in
 * little-endian form).  If the message cannot be allocated, nothing is
 * sent and the function simply returns.
 */
void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
			      struct dentry *dentry, char action,
			      u32 seq)
{
	struct ceph_mds_lease *lease;
	struct ceph_msg *msg;
	struct inode *dir;
	void *tail;
	/* lease header + u32 name length + room for the longest name */
	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;

	dout("lease_send_msg identry %p %s to mds%d\n",
	     dentry, ceph_lease_op_name(action), session->s_mds);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
	if (!msg)
		return;

	lease = msg->front.iov_base;
	lease->action = action;
	lease->seq = cpu_to_le32(seq);

	/* the name length and name bytes follow directly after the header */
	tail = lease + 1;

	spin_lock(&dentry->d_lock);
	dir = d_inode(dentry->d_parent);
	lease->ino = cpu_to_le64(ceph_ino(dir));
	lease->last = cpu_to_le64(ceph_snap(dir));
	lease->first = lease->last;

	put_unaligned_le32(dentry->d_name.len, tail);
	memcpy(tail + 4, dentry->d_name.name, dentry->d_name.len);
	spin_unlock(&dentry->d_lock);

	ceph_con_send(&session->s_con, msg);
}
50382f2dc053SSage Weil 
50392f2dc053SSage Weil /*
504059b312f3SXiubo Li  * lock unlock the session, to wait ongoing session activities
50412f2dc053SSage Weil  */
lock_unlock_session(struct ceph_mds_session * s)504259b312f3SXiubo Li static void lock_unlock_session(struct ceph_mds_session *s)
50432f2dc053SSage Weil {
50442f2dc053SSage Weil 	mutex_lock(&s->s_mutex);
50452f2dc053SSage Weil 	mutex_unlock(&s->s_mutex);
50462f2dc053SSage Weil }
50472f2dc053SSage Weil 
maybe_recover_session(struct ceph_mds_client * mdsc)5048131d7eb4SYan, Zheng static void maybe_recover_session(struct ceph_mds_client *mdsc)
5049131d7eb4SYan, Zheng {
5050131d7eb4SYan, Zheng 	struct ceph_fs_client *fsc = mdsc->fsc;
50512f2dc053SSage Weil 
5052131d7eb4SYan, Zheng 	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
5053131d7eb4SYan, Zheng 		return;
5054131d7eb4SYan, Zheng 
5055131d7eb4SYan, Zheng 	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
5056131d7eb4SYan, Zheng 		return;
5057131d7eb4SYan, Zheng 
50580b98acd6SIlya Dryomov 	if (!READ_ONCE(fsc->blocklisted))
5059131d7eb4SYan, Zheng 		return;
5060131d7eb4SYan, Zheng 
50610b98acd6SIlya Dryomov 	pr_info("auto reconnect after blocklisted\n");
5062131d7eb4SYan, Zheng 	ceph_force_reconnect(fsc->sb);
5063131d7eb4SYan, Zheng }
50642f2dc053SSage Weil 
check_session_state(struct ceph_mds_session * s)50653e699bd8SXiubo Li bool check_session_state(struct ceph_mds_session *s)
50663e699bd8SXiubo Li {
506762575e27SJeff Layton 	switch (s->s_state) {
506862575e27SJeff Layton 	case CEPH_MDS_SESSION_OPEN:
50693e699bd8SXiubo Li 		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
50703e699bd8SXiubo Li 			s->s_state = CEPH_MDS_SESSION_HUNG;
50713e699bd8SXiubo Li 			pr_info("mds%d hung\n", s->s_mds);
50723e699bd8SXiubo Li 		}
507362575e27SJeff Layton 		break;
507462575e27SJeff Layton 	case CEPH_MDS_SESSION_CLOSING:
507562575e27SJeff Layton 	case CEPH_MDS_SESSION_NEW:
507662575e27SJeff Layton 	case CEPH_MDS_SESSION_RESTARTING:
507762575e27SJeff Layton 	case CEPH_MDS_SESSION_CLOSED:
507862575e27SJeff Layton 	case CEPH_MDS_SESSION_REJECTED:
50793e699bd8SXiubo Li 		return false;
508062575e27SJeff Layton 	}
50813e699bd8SXiubo Li 
50823e699bd8SXiubo Li 	return true;
50833e699bd8SXiubo Li }
50843e699bd8SXiubo Li 
50852f2dc053SSage Weil /*
508662575e27SJeff Layton  * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
508762575e27SJeff Layton  * then we need to retransmit that request.
508862575e27SJeff Layton  */
inc_session_sequence(struct ceph_mds_session * s)508962575e27SJeff Layton void inc_session_sequence(struct ceph_mds_session *s)
509062575e27SJeff Layton {
509162575e27SJeff Layton 	lockdep_assert_held(&s->s_mutex);
509262575e27SJeff Layton 
509362575e27SJeff Layton 	s->s_seq++;
509462575e27SJeff Layton 
509562575e27SJeff Layton 	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
509662575e27SJeff Layton 		int ret;
509762575e27SJeff Layton 
509862575e27SJeff Layton 		dout("resending session close request for mds%d\n", s->s_mds);
509962575e27SJeff Layton 		ret = request_close_session(s);
510062575e27SJeff Layton 		if (ret < 0)
510162575e27SJeff Layton 			pr_err("unable to close session to mds%d: %d\n",
510262575e27SJeff Layton 			       s->s_mds, ret);
510362575e27SJeff Layton 	}
510462575e27SJeff Layton }
510562575e27SJeff Layton 
510662575e27SJeff Layton /*
5107bf2ba432SLuis Henriques  * delayed work -- periodically trim expired leases, renew caps with mds.  If
5108bf2ba432SLuis Henriques  * the @delay parameter is set to 0 or if it's more than 5 secs, the default
5109bf2ba432SLuis Henriques  * workqueue delay value of 5 secs will be used.
51102f2dc053SSage Weil  */
schedule_delayed(struct ceph_mds_client * mdsc,unsigned long delay)5111bf2ba432SLuis Henriques static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
51122f2dc053SSage Weil {
5113bf2ba432SLuis Henriques 	unsigned long max_delay = HZ * 5;
5114bf2ba432SLuis Henriques 
5115bf2ba432SLuis Henriques 	/* 5 secs default delay */
5116bf2ba432SLuis Henriques 	if (!delay || (delay > max_delay))
5117bf2ba432SLuis Henriques 		delay = max_delay;
5118bf2ba432SLuis Henriques 	schedule_delayed_work(&mdsc->delayed_work,
5119bf2ba432SLuis Henriques 			      round_jiffies_relative(delay));
51202f2dc053SSage Weil }
51212f2dc053SSage Weil 
delayed_work(struct work_struct * work)51222f2dc053SSage Weil static void delayed_work(struct work_struct *work)
51232f2dc053SSage Weil {
51242f2dc053SSage Weil 	struct ceph_mds_client *mdsc =
51252f2dc053SSage Weil 		container_of(work, struct ceph_mds_client, delayed_work.work);
5126bf2ba432SLuis Henriques 	unsigned long delay;
51272f2dc053SSage Weil 	int renew_interval;
51282f2dc053SSage Weil 	int renew_caps;
5129bf2ba432SLuis Henriques 	int i;
51302f2dc053SSage Weil 
51312f2dc053SSage Weil 	dout("mdsc delayed_work\n");
513275c9627eSYan, Zheng 
5133e7e607bdSXiubo Li 	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
5134fa996773SXiubo Li 		return;
5135fa996773SXiubo Li 
51362f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
51372f2dc053SSage Weil 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
51382f2dc053SSage Weil 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
51392f2dc053SSage Weil 				   mdsc->last_renew_caps);
51402f2dc053SSage Weil 	if (renew_caps)
51412f2dc053SSage Weil 		mdsc->last_renew_caps = jiffies;
51422f2dc053SSage Weil 
51432f2dc053SSage Weil 	for (i = 0; i < mdsc->max_sessions; i++) {
51442f2dc053SSage Weil 		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
5145d37b1d99SMarkus Elfring 		if (!s)
51462f2dc053SSage Weil 			continue;
51473e699bd8SXiubo Li 
51483e699bd8SXiubo Li 		if (!check_session_state(s)) {
51492f2dc053SSage Weil 			ceph_put_mds_session(s);
51502f2dc053SSage Weil 			continue;
51512f2dc053SSage Weil 		}
51522f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
51532f2dc053SSage Weil 
51542f2dc053SSage Weil 		mutex_lock(&s->s_mutex);
51552f2dc053SSage Weil 		if (renew_caps)
51562f2dc053SSage Weil 			send_renew_caps(mdsc, s);
51572f2dc053SSage Weil 		else
51582f2dc053SSage Weil 			ceph_con_keepalive(&s->s_con);
5159aab53dd9SSage Weil 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
5160aab53dd9SSage Weil 		    s->s_state == CEPH_MDS_SESSION_HUNG)
51613d7ded4dSSage Weil 			ceph_send_cap_releases(mdsc, s);
51622f2dc053SSage Weil 		mutex_unlock(&s->s_mutex);
51632f2dc053SSage Weil 		ceph_put_mds_session(s);
51642f2dc053SSage Weil 
51652f2dc053SSage Weil 		mutex_lock(&mdsc->mutex);
51662f2dc053SSage Weil 	}
51672f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
51682f2dc053SSage Weil 
5169bf2ba432SLuis Henriques 	delay = ceph_check_delayed_caps(mdsc);
517037c4efc1SYan, Zheng 
517137c4efc1SYan, Zheng 	ceph_queue_cap_reclaim_work(mdsc);
517237c4efc1SYan, Zheng 
517337c4efc1SYan, Zheng 	ceph_trim_snapid_map(mdsc);
517437c4efc1SYan, Zheng 
5175131d7eb4SYan, Zheng 	maybe_recover_session(mdsc);
5176131d7eb4SYan, Zheng 
5177bf2ba432SLuis Henriques 	schedule_delayed(mdsc, delay);
51782f2dc053SSage Weil }
51792f2dc053SSage Weil 
ceph_mdsc_init(struct ceph_fs_client * fsc)51803d14c5d2SYehuda Sadeh int ceph_mdsc_init(struct ceph_fs_client *fsc)
51812f2dc053SSage Weil 
51822f2dc053SSage Weil {
51833d14c5d2SYehuda Sadeh 	struct ceph_mds_client *mdsc;
5184f9009efaSXiubo Li 	int err;
51853d14c5d2SYehuda Sadeh 
51863d14c5d2SYehuda Sadeh 	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
51873d14c5d2SYehuda Sadeh 	if (!mdsc)
51883d14c5d2SYehuda Sadeh 		return -ENOMEM;
51893d14c5d2SYehuda Sadeh 	mdsc->fsc = fsc;
51902f2dc053SSage Weil 	mutex_init(&mdsc->mutex);
51912f2dc053SSage Weil 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
5192d37b1d99SMarkus Elfring 	if (!mdsc->mdsmap) {
5193f9009efaSXiubo Li 		err = -ENOMEM;
5194f9009efaSXiubo Li 		goto err_mdsc;
5195fb3101b6Smajianpeng 	}
51962d06eeb8SCheng Renquan 
51972f2dc053SSage Weil 	init_completion(&mdsc->safe_umount_waiters);
5198e3dfcab2SXiubo Li 	spin_lock_init(&mdsc->stopping_lock);
5199e3dfcab2SXiubo Li 	atomic_set(&mdsc->stopping_blockers, 0);
5200e3dfcab2SXiubo Li 	init_completion(&mdsc->stopping_waiter);
5201f3c60c59SSage Weil 	init_waitqueue_head(&mdsc->session_close_wq);
52022f2dc053SSage Weil 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
52030c44a8e0SLuis Henriques 	mdsc->quotarealms_inodes = RB_ROOT;
52040c44a8e0SLuis Henriques 	mutex_init(&mdsc->quotarealms_inodes_mutex);
52052f2dc053SSage Weil 	init_rwsem(&mdsc->snap_rwsem);
5206a105f00cSSage Weil 	mdsc->snap_realms = RB_ROOT;
52072f2dc053SSage Weil 	INIT_LIST_HEAD(&mdsc->snap_empty);
52082f2dc053SSage Weil 	spin_lock_init(&mdsc->snap_empty_lock);
520944ca18f2SSage Weil 	mdsc->request_tree = RB_ROOT;
52102f2dc053SSage Weil 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
52112f2dc053SSage Weil 	mdsc->last_renew_caps = jiffies;
52122f2dc053SSage Weil 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
52133a3430afSJeff Layton 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
52142f2dc053SSage Weil 	spin_lock_init(&mdsc->cap_delay_lock);
52152f2dc053SSage Weil 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
52162f2dc053SSage Weil 	spin_lock_init(&mdsc->snap_flush_lock);
5217553adfd9SYan, Zheng 	mdsc->last_cap_flush_tid = 1;
5218e4500b5eSYan, Zheng 	INIT_LIST_HEAD(&mdsc->cap_flush_list);
5219db354052SSage Weil 	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
52202f2dc053SSage Weil 	spin_lock_init(&mdsc->cap_dirty_lock);
52212f2dc053SSage Weil 	init_waitqueue_head(&mdsc->cap_flushing_wq);
522237c4efc1SYan, Zheng 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
5223f9009efaSXiubo Li 	err = ceph_metric_init(&mdsc->metric);
5224f9009efaSXiubo Li 	if (err)
5225f9009efaSXiubo Li 		goto err_mdsmap;
522637c4efc1SYan, Zheng 
522737c4efc1SYan, Zheng 	spin_lock_init(&mdsc->dentry_list_lock);
522837c4efc1SYan, Zheng 	INIT_LIST_HEAD(&mdsc->dentry_leases);
522937c4efc1SYan, Zheng 	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
52302d06eeb8SCheng Renquan 
523137151668SYehuda Sadeh 	ceph_caps_init(mdsc);
5232fe33032dSYan, Zheng 	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
523337151668SYehuda Sadeh 
523475c9627eSYan, Zheng 	spin_lock_init(&mdsc->snapid_map_lock);
523575c9627eSYan, Zheng 	mdsc->snapid_map_tree = RB_ROOT;
523675c9627eSYan, Zheng 	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
523775c9627eSYan, Zheng 
523810183a69SYan, Zheng 	init_rwsem(&mdsc->pool_perm_rwsem);
523910183a69SYan, Zheng 	mdsc->pool_perm_tree = RB_ROOT;
524010183a69SYan, Zheng 
5241dfeb84d4SYan, Zheng 	strscpy(mdsc->nodename, utsname()->nodename,
5242dfeb84d4SYan, Zheng 		sizeof(mdsc->nodename));
5243a7caa88fSXiubo Li 
5244a7caa88fSXiubo Li 	fsc->mdsc = mdsc;
52455f44f142SSage Weil 	return 0;
5246f9009efaSXiubo Li 
5247f9009efaSXiubo Li err_mdsmap:
5248f9009efaSXiubo Li 	kfree(mdsc->mdsmap);
5249f9009efaSXiubo Li err_mdsc:
5250f9009efaSXiubo Li 	kfree(mdsc);
5251f9009efaSXiubo Li 	return err;
52522f2dc053SSage Weil }
52532f2dc053SSage Weil 
52542f2dc053SSage Weil /*
52552f2dc053SSage Weil  * Wait for safe replies on open mds requests.  If we time out, drop
52562f2dc053SSage Weil  * all requests from the tree to avoid dangling dentry refs.
52572f2dc053SSage Weil  */
wait_requests(struct ceph_mds_client * mdsc)52582f2dc053SSage Weil static void wait_requests(struct ceph_mds_client *mdsc)
52592f2dc053SSage Weil {
5260a319bf56SIlya Dryomov 	struct ceph_options *opts = mdsc->fsc->client->options;
52612f2dc053SSage Weil 	struct ceph_mds_request *req;
52622f2dc053SSage Weil 
52632f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
526444ca18f2SSage Weil 	if (__get_oldest_req(mdsc)) {
52652f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
526644ca18f2SSage Weil 
52672f2dc053SSage Weil 		dout("wait_requests waiting for requests\n");
52682f2dc053SSage Weil 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
5269a319bf56SIlya Dryomov 				    ceph_timeout_jiffies(opts->mount_timeout));
52702f2dc053SSage Weil 
52712f2dc053SSage Weil 		/* tear down remaining requests */
527244ca18f2SSage Weil 		mutex_lock(&mdsc->mutex);
527344ca18f2SSage Weil 		while ((req = __get_oldest_req(mdsc))) {
52742f2dc053SSage Weil 			dout("wait_requests timed out on tid %llu\n",
52752f2dc053SSage Weil 			     req->r_tid);
5276428138c9SYan, Zheng 			list_del_init(&req->r_wait);
527744ca18f2SSage Weil 			__unregister_request(mdsc, req);
52782f2dc053SSage Weil 		}
52792f2dc053SSage Weil 	}
52802f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
52812f2dc053SSage Weil 	dout("wait_requests done\n");
52822f2dc053SSage Weil }
52832f2dc053SSage Weil 
send_flush_mdlog(struct ceph_mds_session * s)5284d095559cSXiubo Li void send_flush_mdlog(struct ceph_mds_session *s)
5285d095559cSXiubo Li {
5286d095559cSXiubo Li 	struct ceph_msg *msg;
5287d095559cSXiubo Li 
5288d095559cSXiubo Li 	/*
5289d095559cSXiubo Li 	 * Pre-luminous MDS crashes when it sees an unknown session request
5290d095559cSXiubo Li 	 */
5291d095559cSXiubo Li 	if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
5292d095559cSXiubo Li 		return;
5293d095559cSXiubo Li 
5294d095559cSXiubo Li 	mutex_lock(&s->s_mutex);
5295d095559cSXiubo Li 	dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
5296d095559cSXiubo Li 	     ceph_session_state_name(s->s_state), s->s_seq);
5297d095559cSXiubo Li 	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
5298d095559cSXiubo Li 				      s->s_seq);
5299d095559cSXiubo Li 	if (!msg) {
5300d095559cSXiubo Li 		pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
5301d095559cSXiubo Li 		       s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
5302d095559cSXiubo Li 	} else {
5303d095559cSXiubo Li 		ceph_con_send(&s->s_con, msg);
5304d095559cSXiubo Li 	}
5305d095559cSXiubo Li 	mutex_unlock(&s->s_mutex);
5306d095559cSXiubo Li }
5307d095559cSXiubo Li 
53082f2dc053SSage Weil /*
53092f2dc053SSage Weil  * called before mount is ro, and before dentries are torn down.
53102f2dc053SSage Weil  * (hmm, does this still race with new lookups?)
53112f2dc053SSage Weil  */
ceph_mdsc_pre_umount(struct ceph_mds_client * mdsc)53122f2dc053SSage Weil void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
53132f2dc053SSage Weil {
53142f2dc053SSage Weil 	dout("pre_umount\n");
5315e7e607bdSXiubo Li 	mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
53162f2dc053SSage Weil 
5317d095559cSXiubo Li 	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
531859b312f3SXiubo Li 	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
5319afcdaea3SSage Weil 	ceph_flush_dirty_caps(mdsc);
53202f2dc053SSage Weil 	wait_requests(mdsc);
532117c688c3SSage Weil 
532217c688c3SSage Weil 	/*
532317c688c3SSage Weil 	 * wait for reply handlers to drop their request refs and
532417c688c3SSage Weil 	 * their inode/dcache refs
532517c688c3SSage Weil 	 */
532617c688c3SSage Weil 	ceph_msgr_flush();
53270c44a8e0SLuis Henriques 
53280c44a8e0SLuis Henriques 	ceph_cleanup_quotarealms_inodes(mdsc);
53292f2dc053SSage Weil }
53302f2dc053SSage Weil 
53312f2dc053SSage Weil /*
53321b2ba3c5SXiubo Li  * flush the mdlog and wait for all write mds requests to flush.
53332f2dc053SSage Weil  */
flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client * mdsc,u64 want_tid)53341b2ba3c5SXiubo Li static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
53351b2ba3c5SXiubo Li 						 u64 want_tid)
53362f2dc053SSage Weil {
533780fc7314SSage Weil 	struct ceph_mds_request *req = NULL, *nextreq;
53381b2ba3c5SXiubo Li 	struct ceph_mds_session *last_session = NULL;
533944ca18f2SSage Weil 	struct rb_node *n;
53402f2dc053SSage Weil 
53412f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
53421b2ba3c5SXiubo Li 	dout("%s want %lld\n", __func__, want_tid);
534380fc7314SSage Weil restart:
534444ca18f2SSage Weil 	req = __get_oldest_req(mdsc);
534544ca18f2SSage Weil 	while (req && req->r_tid <= want_tid) {
534680fc7314SSage Weil 		/* find next request */
534780fc7314SSage Weil 		n = rb_next(&req->r_node);
534880fc7314SSage Weil 		if (n)
534980fc7314SSage Weil 			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
535080fc7314SSage Weil 		else
535180fc7314SSage Weil 			nextreq = NULL;
5352e8a7b8b1SYan, Zheng 		if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
5353e8a7b8b1SYan, Zheng 		    (req->r_op & CEPH_MDS_OP_WRITE)) {
53541b2ba3c5SXiubo Li 			struct ceph_mds_session *s = req->r_session;
53551b2ba3c5SXiubo Li 
53561b2ba3c5SXiubo Li 			if (!s) {
53571b2ba3c5SXiubo Li 				req = nextreq;
53581b2ba3c5SXiubo Li 				continue;
53591b2ba3c5SXiubo Li 			}
53601b2ba3c5SXiubo Li 
536144ca18f2SSage Weil 			/* write op */
53622f2dc053SSage Weil 			ceph_mdsc_get_request(req);
536380fc7314SSage Weil 			if (nextreq)
536480fc7314SSage Weil 				ceph_mdsc_get_request(nextreq);
53651b2ba3c5SXiubo Li 			s = ceph_get_mds_session(s);
53662f2dc053SSage Weil 			mutex_unlock(&mdsc->mutex);
53671b2ba3c5SXiubo Li 
53681b2ba3c5SXiubo Li 			/* send flush mdlog request to MDS */
53691b2ba3c5SXiubo Li 			if (last_session != s) {
53701b2ba3c5SXiubo Li 				send_flush_mdlog(s);
53711b2ba3c5SXiubo Li 				ceph_put_mds_session(last_session);
53721b2ba3c5SXiubo Li 				last_session = s;
53731b2ba3c5SXiubo Li 			} else {
53741b2ba3c5SXiubo Li 				ceph_put_mds_session(s);
53751b2ba3c5SXiubo Li 			}
53761b2ba3c5SXiubo Li 			dout("%s wait on %llu (want %llu)\n", __func__,
53772f2dc053SSage Weil 			     req->r_tid, want_tid);
53782f2dc053SSage Weil 			wait_for_completion(&req->r_safe_completion);
53791b2ba3c5SXiubo Li 
53802f2dc053SSage Weil 			mutex_lock(&mdsc->mutex);
53812f2dc053SSage Weil 			ceph_mdsc_put_request(req);
538280fc7314SSage Weil 			if (!nextreq)
538380fc7314SSage Weil 				break;  /* next dne before, so we're done! */
538480fc7314SSage Weil 			if (RB_EMPTY_NODE(&nextreq->r_node)) {
538580fc7314SSage Weil 				/* next request was removed from tree */
538680fc7314SSage Weil 				ceph_mdsc_put_request(nextreq);
538780fc7314SSage Weil 				goto restart;
538844ca18f2SSage Weil 			}
538980fc7314SSage Weil 			ceph_mdsc_put_request(nextreq);  /* won't go away */
539080fc7314SSage Weil 		}
539180fc7314SSage Weil 		req = nextreq;
53922f2dc053SSage Weil 	}
53932f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
53941b2ba3c5SXiubo Li 	ceph_put_mds_session(last_session);
53951b2ba3c5SXiubo Li 	dout("%s done\n", __func__);
53962f2dc053SSage Weil }
53972f2dc053SSage Weil 
ceph_mdsc_sync(struct ceph_mds_client * mdsc)53982f2dc053SSage Weil void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
53992f2dc053SSage Weil {
54000e294387SYan, Zheng 	u64 want_tid, want_flush;
54012f2dc053SSage Weil 
540250c9132dSJeff Layton 	if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
540356b7cf95SSage Weil 		return;
540456b7cf95SSage Weil 
54052f2dc053SSage Weil 	dout("sync\n");
54062f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
54072f2dc053SSage Weil 	want_tid = mdsc->last_tid;
54082f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
54092f2dc053SSage Weil 
5410afcdaea3SSage Weil 	ceph_flush_dirty_caps(mdsc);
5411d3383a8eSYan, Zheng 	spin_lock(&mdsc->cap_dirty_lock);
54128310b089SYan, Zheng 	want_flush = mdsc->last_cap_flush_tid;
5413c8799fc4SYan, Zheng 	if (!list_empty(&mdsc->cap_flush_list)) {
5414c8799fc4SYan, Zheng 		struct ceph_cap_flush *cf =
5415c8799fc4SYan, Zheng 			list_last_entry(&mdsc->cap_flush_list,
5416c8799fc4SYan, Zheng 					struct ceph_cap_flush, g_list);
5417c8799fc4SYan, Zheng 		cf->wake = true;
5418c8799fc4SYan, Zheng 	}
5419d3383a8eSYan, Zheng 	spin_unlock(&mdsc->cap_dirty_lock);
5420d3383a8eSYan, Zheng 
54210e294387SYan, Zheng 	dout("sync want tid %lld flush_seq %lld\n",
54220e294387SYan, Zheng 	     want_tid, want_flush);
54232f2dc053SSage Weil 
54241b2ba3c5SXiubo Li 	flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
54250e294387SYan, Zheng 	wait_caps_flush(mdsc, want_flush);
54262f2dc053SSage Weil }
54272f2dc053SSage Weil 
5428f3c60c59SSage Weil /*
5429f3c60c59SSage Weil  * true if all sessions are closed, or we force unmount
5430f3c60c59SSage Weil  */
done_closing_sessions(struct ceph_mds_client * mdsc,int skipped)5431fcff415cSYan, Zheng static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
5432f3c60c59SSage Weil {
543352953d55SSeraphime Kirkovski 	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
5434f3c60c59SSage Weil 		return true;
5435fcff415cSYan, Zheng 	return atomic_read(&mdsc->num_sessions) <= skipped;
5436f3c60c59SSage Weil }
54372f2dc053SSage Weil 
54382f2dc053SSage Weil /*
5439a68e564aSXiubo Li  * called after sb is ro or when metadata corrupted.
54402f2dc053SSage Weil  */
ceph_mdsc_close_sessions(struct ceph_mds_client * mdsc)54412f2dc053SSage Weil void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
54422f2dc053SSage Weil {
5443a319bf56SIlya Dryomov 	struct ceph_options *opts = mdsc->fsc->client->options;
54442f2dc053SSage Weil 	struct ceph_mds_session *session;
54452f2dc053SSage Weil 	int i;
5446fcff415cSYan, Zheng 	int skipped = 0;
54472f2dc053SSage Weil 
54482f2dc053SSage Weil 	dout("close_sessions\n");
54492f2dc053SSage Weil 
54502f2dc053SSage Weil 	/* close sessions */
5451f3c60c59SSage Weil 	mutex_lock(&mdsc->mutex);
54522f2dc053SSage Weil 	for (i = 0; i < mdsc->max_sessions; i++) {
54532f2dc053SSage Weil 		session = __ceph_lookup_mds_session(mdsc, i);
54542f2dc053SSage Weil 		if (!session)
54552f2dc053SSage Weil 			continue;
54562f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
54572f2dc053SSage Weil 		mutex_lock(&session->s_mutex);
5458fcff415cSYan, Zheng 		if (__close_session(mdsc, session) <= 0)
5459fcff415cSYan, Zheng 			skipped++;
54602f2dc053SSage Weil 		mutex_unlock(&session->s_mutex);
54612f2dc053SSage Weil 		ceph_put_mds_session(session);
54622f2dc053SSage Weil 		mutex_lock(&mdsc->mutex);
54632f2dc053SSage Weil 	}
5464f3c60c59SSage Weil 	mutex_unlock(&mdsc->mutex);
54652f2dc053SSage Weil 
54662f2dc053SSage Weil 	dout("waiting for sessions to close\n");
5467fcff415cSYan, Zheng 	wait_event_timeout(mdsc->session_close_wq,
5468fcff415cSYan, Zheng 			   done_closing_sessions(mdsc, skipped),
5469a319bf56SIlya Dryomov 			   ceph_timeout_jiffies(opts->mount_timeout));
54702f2dc053SSage Weil 
54712f2dc053SSage Weil 	/* tear down remaining sessions */
5472f3c60c59SSage Weil 	mutex_lock(&mdsc->mutex);
54732f2dc053SSage Weil 	for (i = 0; i < mdsc->max_sessions; i++) {
54742f2dc053SSage Weil 		if (mdsc->sessions[i]) {
54755b3248c6SXiubo Li 			session = ceph_get_mds_session(mdsc->sessions[i]);
54762600d2ddSSage Weil 			__unregister_session(mdsc, session);
54772f2dc053SSage Weil 			mutex_unlock(&mdsc->mutex);
54782f2dc053SSage Weil 			mutex_lock(&session->s_mutex);
54792f2dc053SSage Weil 			remove_session_caps(session);
54802f2dc053SSage Weil 			mutex_unlock(&session->s_mutex);
54812f2dc053SSage Weil 			ceph_put_mds_session(session);
54822f2dc053SSage Weil 			mutex_lock(&mdsc->mutex);
54832f2dc053SSage Weil 		}
54842f2dc053SSage Weil 	}
54852f2dc053SSage Weil 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
54862f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
54872f2dc053SSage Weil 
548875c9627eSYan, Zheng 	ceph_cleanup_snapid_map(mdsc);
54895ed91587SXiubo Li 	ceph_cleanup_global_and_empty_realms(mdsc);
54902f2dc053SSage Weil 
549137c4efc1SYan, Zheng 	cancel_work_sync(&mdsc->cap_reclaim_work);
54922f2dc053SSage Weil 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
54932f2dc053SSage Weil 
54942f2dc053SSage Weil 	dout("stopped\n");
54952f2dc053SSage Weil }
54962f2dc053SSage Weil 
ceph_mdsc_force_umount(struct ceph_mds_client * mdsc)549748fec5d0SYan, Zheng void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
549848fec5d0SYan, Zheng {
549948fec5d0SYan, Zheng 	struct ceph_mds_session *session;
550048fec5d0SYan, Zheng 	int mds;
550148fec5d0SYan, Zheng 
550248fec5d0SYan, Zheng 	dout("force umount\n");
550348fec5d0SYan, Zheng 
550448fec5d0SYan, Zheng 	mutex_lock(&mdsc->mutex);
550548fec5d0SYan, Zheng 	for (mds = 0; mds < mdsc->max_sessions; mds++) {
550648fec5d0SYan, Zheng 		session = __ceph_lookup_mds_session(mdsc, mds);
550748fec5d0SYan, Zheng 		if (!session)
550848fec5d0SYan, Zheng 			continue;
5509d468e729SYan, Zheng 
5510d468e729SYan, Zheng 		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
5511d468e729SYan, Zheng 			__unregister_session(mdsc, session);
5512d468e729SYan, Zheng 		__wake_requests(mdsc, &session->s_waiting);
551348fec5d0SYan, Zheng 		mutex_unlock(&mdsc->mutex);
5514d468e729SYan, Zheng 
551548fec5d0SYan, Zheng 		mutex_lock(&session->s_mutex);
551648fec5d0SYan, Zheng 		__close_session(mdsc, session);
551748fec5d0SYan, Zheng 		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
551848fec5d0SYan, Zheng 			cleanup_session_requests(mdsc, session);
551948fec5d0SYan, Zheng 			remove_session_caps(session);
552048fec5d0SYan, Zheng 		}
552148fec5d0SYan, Zheng 		mutex_unlock(&session->s_mutex);
552248fec5d0SYan, Zheng 		ceph_put_mds_session(session);
5523d468e729SYan, Zheng 
552448fec5d0SYan, Zheng 		mutex_lock(&mdsc->mutex);
552548fec5d0SYan, Zheng 		kick_requests(mdsc, mds);
552648fec5d0SYan, Zheng 	}
552748fec5d0SYan, Zheng 	__wake_requests(mdsc, &mdsc->waiting_for_map);
552848fec5d0SYan, Zheng 	mutex_unlock(&mdsc->mutex);
552948fec5d0SYan, Zheng }
553048fec5d0SYan, Zheng 
ceph_mdsc_stop(struct ceph_mds_client * mdsc)55313d14c5d2SYehuda Sadeh static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
55322f2dc053SSage Weil {
55332f2dc053SSage Weil 	dout("stop\n");
5534fa996773SXiubo Li 	/*
5535fa996773SXiubo Li 	 * Make sure the delayed work stopped before releasing
5536fa996773SXiubo Li 	 * the resources.
5537fa996773SXiubo Li 	 *
5538fa996773SXiubo Li 	 * Because the cancel_delayed_work_sync() will only
5539fa996773SXiubo Li 	 * guarantee that the work finishes executing. But the
5540fa996773SXiubo Li 	 * delayed work will re-arm itself again after that.
5541fa996773SXiubo Li 	 */
5542fa996773SXiubo Li 	flush_delayed_work(&mdsc->delayed_work);
5543fa996773SXiubo Li 
55442f2dc053SSage Weil 	if (mdsc->mdsmap)
55452f2dc053SSage Weil 		ceph_mdsmap_destroy(mdsc->mdsmap);
55462f2dc053SSage Weil 	kfree(mdsc->sessions);
554737151668SYehuda Sadeh 	ceph_caps_finalize(mdsc);
554810183a69SYan, Zheng 	ceph_pool_perm_destroy(mdsc);
55492f2dc053SSage Weil }
55502f2dc053SSage Weil 
ceph_mdsc_destroy(struct ceph_fs_client * fsc)55513d14c5d2SYehuda Sadeh void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
55523d14c5d2SYehuda Sadeh {
55533d14c5d2SYehuda Sadeh 	struct ceph_mds_client *mdsc = fsc->mdsc;
5554ef550f6fSSage Weil 	dout("mdsc_destroy %p\n", mdsc);
5555ef550f6fSSage Weil 
555650c55aecSChengguang Xu 	if (!mdsc)
555750c55aecSChengguang Xu 		return;
555850c55aecSChengguang Xu 
5559ef550f6fSSage Weil 	/* flush out any connection work with references to us */
5560ef550f6fSSage Weil 	ceph_msgr_flush();
5561ef550f6fSSage Weil 
556262a65f36SYan, Zheng 	ceph_mdsc_stop(mdsc);
556362a65f36SYan, Zheng 
5564f9009efaSXiubo Li 	ceph_metric_destroy(&mdsc->metric);
5565f9009efaSXiubo Li 
55663d14c5d2SYehuda Sadeh 	fsc->mdsc = NULL;
55673d14c5d2SYehuda Sadeh 	kfree(mdsc);
5568ef550f6fSSage Weil 	dout("mdsc_destroy %p done\n", mdsc);
55693d14c5d2SYehuda Sadeh }
55703d14c5d2SYehuda Sadeh 
ceph_mdsc_handle_fsmap(struct ceph_mds_client * mdsc,struct ceph_msg * msg)5571430afbadSYan, Zheng void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
5572430afbadSYan, Zheng {
5573430afbadSYan, Zheng 	struct ceph_fs_client *fsc = mdsc->fsc;
5574430afbadSYan, Zheng 	const char *mds_namespace = fsc->mount_options->mds_namespace;
5575430afbadSYan, Zheng 	void *p = msg->front.iov_base;
5576430afbadSYan, Zheng 	void *end = p + msg->front.iov_len;
5577430afbadSYan, Zheng 	u32 epoch;
5578430afbadSYan, Zheng 	u32 num_fs;
5579430afbadSYan, Zheng 	u32 mount_fscid = (u32)-1;
5580430afbadSYan, Zheng 	int err = -EINVAL;
5581430afbadSYan, Zheng 
5582430afbadSYan, Zheng 	ceph_decode_need(&p, end, sizeof(u32), bad);
5583430afbadSYan, Zheng 	epoch = ceph_decode_32(&p);
5584430afbadSYan, Zheng 
5585430afbadSYan, Zheng 	dout("handle_fsmap epoch %u\n", epoch);
5586430afbadSYan, Zheng 
558706a1ad43SJeff Layton 	/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
558806a1ad43SJeff Layton 	ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
5589430afbadSYan, Zheng 
559006a1ad43SJeff Layton 	ceph_decode_32_safe(&p, end, num_fs, bad);
5591430afbadSYan, Zheng 	while (num_fs-- > 0) {
5592430afbadSYan, Zheng 		void *info_p, *info_end;
5593430afbadSYan, Zheng 		u32 info_len;
5594430afbadSYan, Zheng 		u32 fscid, namelen;
5595430afbadSYan, Zheng 
5596430afbadSYan, Zheng 		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
559706a1ad43SJeff Layton 		p += 2;		// info_v, info_cv
5598430afbadSYan, Zheng 		info_len = ceph_decode_32(&p);
5599430afbadSYan, Zheng 		ceph_decode_need(&p, end, info_len, bad);
5600430afbadSYan, Zheng 		info_p = p;
5601430afbadSYan, Zheng 		info_end = p + info_len;
5602430afbadSYan, Zheng 		p = info_end;
5603430afbadSYan, Zheng 
5604430afbadSYan, Zheng 		ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
5605430afbadSYan, Zheng 		fscid = ceph_decode_32(&info_p);
5606430afbadSYan, Zheng 		namelen = ceph_decode_32(&info_p);
5607430afbadSYan, Zheng 		ceph_decode_need(&info_p, info_end, namelen, bad);
5608430afbadSYan, Zheng 
5609430afbadSYan, Zheng 		if (mds_namespace &&
5610430afbadSYan, Zheng 		    strlen(mds_namespace) == namelen &&
5611430afbadSYan, Zheng 		    !strncmp(mds_namespace, (char *)info_p, namelen)) {
5612430afbadSYan, Zheng 			mount_fscid = fscid;
5613430afbadSYan, Zheng 			break;
5614430afbadSYan, Zheng 		}
5615430afbadSYan, Zheng 	}
5616430afbadSYan, Zheng 
5617430afbadSYan, Zheng 	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
5618430afbadSYan, Zheng 	if (mount_fscid != (u32)-1) {
5619430afbadSYan, Zheng 		fsc->client->monc.fs_cluster_id = mount_fscid;
5620430afbadSYan, Zheng 		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
5621430afbadSYan, Zheng 				   0, true);
5622430afbadSYan, Zheng 		ceph_monc_renew_subs(&fsc->client->monc);
5623430afbadSYan, Zheng 	} else {
5624430afbadSYan, Zheng 		err = -ENOENT;
5625430afbadSYan, Zheng 		goto err_out;
5626430afbadSYan, Zheng 	}
5627430afbadSYan, Zheng 	return;
562876bd6ec4SIlya Dryomov 
5629430afbadSYan, Zheng bad:
5630631ed4b0SJeff Layton 	pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
5631631ed4b0SJeff Layton 	ceph_umount_begin(mdsc->fsc->sb);
56328b0da5c5SXiubo Li 	ceph_msg_dump(msg);
5633430afbadSYan, Zheng err_out:
5634430afbadSYan, Zheng 	mutex_lock(&mdsc->mutex);
563576bd6ec4SIlya Dryomov 	mdsc->mdsmap_err = err;
5636430afbadSYan, Zheng 	__wake_requests(mdsc, &mdsc->waiting_for_map);
5637430afbadSYan, Zheng 	mutex_unlock(&mdsc->mutex);
5638430afbadSYan, Zheng }
56392f2dc053SSage Weil 
56402f2dc053SSage Weil /*
56412f2dc053SSage Weil  * handle mds map update.
56422f2dc053SSage Weil  */
ceph_mdsc_handle_mdsmap(struct ceph_mds_client * mdsc,struct ceph_msg * msg)5643430afbadSYan, Zheng void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
56442f2dc053SSage Weil {
56452f2dc053SSage Weil 	u32 epoch;
56462f2dc053SSage Weil 	u32 maplen;
56472f2dc053SSage Weil 	void *p = msg->front.iov_base;
56482f2dc053SSage Weil 	void *end = p + msg->front.iov_len;
56492f2dc053SSage Weil 	struct ceph_mdsmap *newmap, *oldmap;
56502f2dc053SSage Weil 	struct ceph_fsid fsid;
56512f2dc053SSage Weil 	int err = -EINVAL;
56522f2dc053SSage Weil 
56532f2dc053SSage Weil 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
56542f2dc053SSage Weil 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
56553d14c5d2SYehuda Sadeh 	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
56562f2dc053SSage Weil 		return;
5657c89136eaSSage Weil 	epoch = ceph_decode_32(&p);
5658c89136eaSSage Weil 	maplen = ceph_decode_32(&p);
56592f2dc053SSage Weil 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
56602f2dc053SSage Weil 
56612f2dc053SSage Weil 	/* do we need it? */
56622f2dc053SSage Weil 	mutex_lock(&mdsc->mutex);
56632f2dc053SSage Weil 	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
56642f2dc053SSage Weil 		dout("handle_map epoch %u <= our %u\n",
56652f2dc053SSage Weil 		     epoch, mdsc->mdsmap->m_epoch);
56662f2dc053SSage Weil 		mutex_unlock(&mdsc->mutex);
56672f2dc053SSage Weil 		return;
56682f2dc053SSage Weil 	}
56692f2dc053SSage Weil 
56702e2023e9SXiubo Li 	newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
56712f2dc053SSage Weil 	if (IS_ERR(newmap)) {
56722f2dc053SSage Weil 		err = PTR_ERR(newmap);
56732f2dc053SSage Weil 		goto bad_unlock;
56742f2dc053SSage Weil 	}
56752f2dc053SSage Weil 
56762f2dc053SSage Weil 	/* swap into place */
56772f2dc053SSage Weil 	if (mdsc->mdsmap) {
56782f2dc053SSage Weil 		oldmap = mdsc->mdsmap;
56792f2dc053SSage Weil 		mdsc->mdsmap = newmap;
56802f2dc053SSage Weil 		check_new_map(mdsc, newmap, oldmap);
56812f2dc053SSage Weil 		ceph_mdsmap_destroy(oldmap);
56822f2dc053SSage Weil 	} else {
56832f2dc053SSage Weil 		mdsc->mdsmap = newmap;  /* first mds map */
56842f2dc053SSage Weil 	}
5685719784baSChengguang Xu 	mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
5686719784baSChengguang Xu 					MAX_LFS_FILESIZE);
56872f2dc053SSage Weil 
56882f2dc053SSage Weil 	__wake_requests(mdsc, &mdsc->waiting_for_map);
568982dcabadSIlya Dryomov 	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
569082dcabadSIlya Dryomov 			  mdsc->mdsmap->m_epoch);
56912f2dc053SSage Weil 
56922f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
5693bf2ba432SLuis Henriques 	schedule_delayed(mdsc, 0);
56942f2dc053SSage Weil 	return;
56952f2dc053SSage Weil 
56962f2dc053SSage Weil bad_unlock:
56972f2dc053SSage Weil 	mutex_unlock(&mdsc->mutex);
56982f2dc053SSage Weil bad:
5699631ed4b0SJeff Layton 	pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
5700631ed4b0SJeff Layton 	ceph_umount_begin(mdsc->fsc->sb);
57018b0da5c5SXiubo Li 	ceph_msg_dump(msg);
57022f2dc053SSage Weil 	return;
57032f2dc053SSage Weil }
57042f2dc053SSage Weil 
mds_get_con(struct ceph_connection * con)57054972cf60SIlya Dryomov static struct ceph_connection *mds_get_con(struct ceph_connection *con)
57062f2dc053SSage Weil {
57072f2dc053SSage Weil 	struct ceph_mds_session *s = con->private;
57082f2dc053SSage Weil 
57095b3248c6SXiubo Li 	if (ceph_get_mds_session(s))
57102f2dc053SSage Weil 		return con;
57112f2dc053SSage Weil 	return NULL;
57122f2dc053SSage Weil }
57132f2dc053SSage Weil 
mds_put_con(struct ceph_connection * con)57144972cf60SIlya Dryomov static void mds_put_con(struct ceph_connection *con)
57152f2dc053SSage Weil {
57162f2dc053SSage Weil 	struct ceph_mds_session *s = con->private;
57172f2dc053SSage Weil 
57182f2dc053SSage Weil 	ceph_put_mds_session(s);
57192f2dc053SSage Weil }
57202f2dc053SSage Weil 
57212f2dc053SSage Weil /*
57222f2dc053SSage Weil  * if the client is unresponsive for long enough, the mds will kill
57232f2dc053SSage Weil  * the session entirely.
57242f2dc053SSage Weil  */
mds_peer_reset(struct ceph_connection * con)57254972cf60SIlya Dryomov static void mds_peer_reset(struct ceph_connection *con)
57262f2dc053SSage Weil {
57272f2dc053SSage Weil 	struct ceph_mds_session *s = con->private;
57287e70f0edSSage Weil 	struct ceph_mds_client *mdsc = s->s_mdsc;
57292f2dc053SSage Weil 
5730f3ae1b97SFabian Frederick 	pr_warn("mds%d closed our session\n", s->s_mds);
5731a68e564aSXiubo Li 	if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
57327e70f0edSSage Weil 		send_mds_reconnect(mdsc, s);
57332f2dc053SSage Weil }
57342f2dc053SSage Weil 
mds_dispatch(struct ceph_connection * con,struct ceph_msg * msg)57354972cf60SIlya Dryomov static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
57362f2dc053SSage Weil {
57372f2dc053SSage Weil 	struct ceph_mds_session *s = con->private;
57382f2dc053SSage Weil 	struct ceph_mds_client *mdsc = s->s_mdsc;
57392f2dc053SSage Weil 	int type = le16_to_cpu(msg->hdr.type);
57402f2dc053SSage Weil 
57412600d2ddSSage Weil 	mutex_lock(&mdsc->mutex);
57422600d2ddSSage Weil 	if (__verify_registered_session(mdsc, s) < 0) {
57432600d2ddSSage Weil 		mutex_unlock(&mdsc->mutex);
57442600d2ddSSage Weil 		goto out;
57452600d2ddSSage Weil 	}
57462600d2ddSSage Weil 	mutex_unlock(&mdsc->mutex);
57472600d2ddSSage Weil 
57482f2dc053SSage Weil 	switch (type) {
57492f2dc053SSage Weil 	case CEPH_MSG_MDS_MAP:
5750430afbadSYan, Zheng 		ceph_mdsc_handle_mdsmap(mdsc, msg);
5751430afbadSYan, Zheng 		break;
5752430afbadSYan, Zheng 	case CEPH_MSG_FS_MAP_USER:
5753430afbadSYan, Zheng 		ceph_mdsc_handle_fsmap(mdsc, msg);
57542f2dc053SSage Weil 		break;
57552f2dc053SSage Weil 	case CEPH_MSG_CLIENT_SESSION:
57562f2dc053SSage Weil 		handle_session(s, msg);
57572f2dc053SSage Weil 		break;
57582f2dc053SSage Weil 	case CEPH_MSG_CLIENT_REPLY:
57592f2dc053SSage Weil 		handle_reply(s, msg);
57602f2dc053SSage Weil 		break;
57612f2dc053SSage Weil 	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
57622600d2ddSSage Weil 		handle_forward(mdsc, s, msg);
57632f2dc053SSage Weil 		break;
57642f2dc053SSage Weil 	case CEPH_MSG_CLIENT_CAPS:
57652f2dc053SSage Weil 		ceph_handle_caps(s, msg);
57662f2dc053SSage Weil 		break;
57672f2dc053SSage Weil 	case CEPH_MSG_CLIENT_SNAP:
57682600d2ddSSage Weil 		ceph_handle_snap(mdsc, s, msg);
57692f2dc053SSage Weil 		break;
57702f2dc053SSage Weil 	case CEPH_MSG_CLIENT_LEASE:
57712600d2ddSSage Weil 		handle_lease(mdsc, s, msg);
57722f2dc053SSage Weil 		break;
5773fb18a575SLuis Henriques 	case CEPH_MSG_CLIENT_QUOTA:
5774fb18a575SLuis Henriques 		ceph_handle_quota(mdsc, s, msg);
5775fb18a575SLuis Henriques 		break;
57762f2dc053SSage Weil 
57772f2dc053SSage Weil 	default:
57782f2dc053SSage Weil 		pr_err("received unknown message type %d %s\n", type,
57792f2dc053SSage Weil 		       ceph_msg_type_name(type));
57802f2dc053SSage Weil 	}
57812600d2ddSSage Weil out:
57822f2dc053SSage Weil 	ceph_msg_put(msg);
57832f2dc053SSage Weil }
57842f2dc053SSage Weil 
/*
 * authentication callbacks for MDS connections
 */
5788a3530df3SAlex Elder 
5789a3530df3SAlex Elder /*
5790a3530df3SAlex Elder  * Note: returned pointer is the address of a structure that's
5791a3530df3SAlex Elder  * managed separately.  Caller must *not* attempt to free it.
5792a3530df3SAlex Elder  */
57934972cf60SIlya Dryomov static struct ceph_auth_handshake *
mds_get_authorizer(struct ceph_connection * con,int * proto,int force_new)57944972cf60SIlya Dryomov mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
57954e7a5dcdSSage Weil {
57964e7a5dcdSSage Weil 	struct ceph_mds_session *s = con->private;
57974e7a5dcdSSage Weil 	struct ceph_mds_client *mdsc = s->s_mdsc;
57983d14c5d2SYehuda Sadeh 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
579974f1869fSAlex Elder 	struct ceph_auth_handshake *auth = &s->s_auth;
5800ce287162SIlya Dryomov 	int ret;
58014e7a5dcdSSage Weil 
5802ce287162SIlya Dryomov 	ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
5803ce287162SIlya Dryomov 					 force_new, proto, NULL, NULL);
58040bed9b5cSSage Weil 	if (ret)
58050bed9b5cSSage Weil 		return ERR_PTR(ret);
580674f1869fSAlex Elder 
5807a3530df3SAlex Elder 	return auth;
58084e7a5dcdSSage Weil }
58094e7a5dcdSSage Weil 
mds_add_authorizer_challenge(struct ceph_connection * con,void * challenge_buf,int challenge_buf_len)58104972cf60SIlya Dryomov static int mds_add_authorizer_challenge(struct ceph_connection *con,
58116daca13dSIlya Dryomov 				    void *challenge_buf, int challenge_buf_len)
58126daca13dSIlya Dryomov {
58136daca13dSIlya Dryomov 	struct ceph_mds_session *s = con->private;
58146daca13dSIlya Dryomov 	struct ceph_mds_client *mdsc = s->s_mdsc;
58156daca13dSIlya Dryomov 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
58166daca13dSIlya Dryomov 
58176daca13dSIlya Dryomov 	return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
58186daca13dSIlya Dryomov 					    challenge_buf, challenge_buf_len);
58196daca13dSIlya Dryomov }
58204e7a5dcdSSage Weil 
mds_verify_authorizer_reply(struct ceph_connection * con)58214972cf60SIlya Dryomov static int mds_verify_authorizer_reply(struct ceph_connection *con)
58224e7a5dcdSSage Weil {
58234e7a5dcdSSage Weil 	struct ceph_mds_session *s = con->private;
58244e7a5dcdSSage Weil 	struct ceph_mds_client *mdsc = s->s_mdsc;
58253d14c5d2SYehuda Sadeh 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
5826285ea34fSIlya Dryomov 	struct ceph_auth_handshake *auth = &s->s_auth;
58274e7a5dcdSSage Weil 
5828285ea34fSIlya Dryomov 	return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
5829285ea34fSIlya Dryomov 		auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
5830285ea34fSIlya Dryomov 		NULL, NULL, NULL, NULL);
58314e7a5dcdSSage Weil }
58324e7a5dcdSSage Weil 
mds_invalidate_authorizer(struct ceph_connection * con)58334972cf60SIlya Dryomov static int mds_invalidate_authorizer(struct ceph_connection *con)
58349bd2e6f8SSage Weil {
58359bd2e6f8SSage Weil 	struct ceph_mds_session *s = con->private;
58369bd2e6f8SSage Weil 	struct ceph_mds_client *mdsc = s->s_mdsc;
58373d14c5d2SYehuda Sadeh 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
58389bd2e6f8SSage Weil 
583927859f97SSage Weil 	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
58409bd2e6f8SSage Weil 
58413d14c5d2SYehuda Sadeh 	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
58429bd2e6f8SSage Weil }
58439bd2e6f8SSage Weil 
mds_get_auth_request(struct ceph_connection * con,void * buf,int * buf_len,void ** authorizer,int * authorizer_len)5844cd1a677cSIlya Dryomov static int mds_get_auth_request(struct ceph_connection *con,
5845cd1a677cSIlya Dryomov 				void *buf, int *buf_len,
5846cd1a677cSIlya Dryomov 				void **authorizer, int *authorizer_len)
5847cd1a677cSIlya Dryomov {
5848cd1a677cSIlya Dryomov 	struct ceph_mds_session *s = con->private;
5849cd1a677cSIlya Dryomov 	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
5850cd1a677cSIlya Dryomov 	struct ceph_auth_handshake *auth = &s->s_auth;
5851cd1a677cSIlya Dryomov 	int ret;
5852cd1a677cSIlya Dryomov 
5853cd1a677cSIlya Dryomov 	ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
5854cd1a677cSIlya Dryomov 				       buf, buf_len);
5855cd1a677cSIlya Dryomov 	if (ret)
5856cd1a677cSIlya Dryomov 		return ret;
5857cd1a677cSIlya Dryomov 
5858cd1a677cSIlya Dryomov 	*authorizer = auth->authorizer_buf;
5859cd1a677cSIlya Dryomov 	*authorizer_len = auth->authorizer_buf_len;
5860cd1a677cSIlya Dryomov 	return 0;
5861cd1a677cSIlya Dryomov }
5862cd1a677cSIlya Dryomov 
mds_handle_auth_reply_more(struct ceph_connection * con,void * reply,int reply_len,void * buf,int * buf_len,void ** authorizer,int * authorizer_len)5863cd1a677cSIlya Dryomov static int mds_handle_auth_reply_more(struct ceph_connection *con,
5864cd1a677cSIlya Dryomov 				      void *reply, int reply_len,
5865cd1a677cSIlya Dryomov 				      void *buf, int *buf_len,
5866cd1a677cSIlya Dryomov 				      void **authorizer, int *authorizer_len)
5867cd1a677cSIlya Dryomov {
5868cd1a677cSIlya Dryomov 	struct ceph_mds_session *s = con->private;
5869cd1a677cSIlya Dryomov 	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
5870cd1a677cSIlya Dryomov 	struct ceph_auth_handshake *auth = &s->s_auth;
5871cd1a677cSIlya Dryomov 	int ret;
5872cd1a677cSIlya Dryomov 
5873cd1a677cSIlya Dryomov 	ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
5874cd1a677cSIlya Dryomov 					      buf, buf_len);
5875cd1a677cSIlya Dryomov 	if (ret)
5876cd1a677cSIlya Dryomov 		return ret;
5877cd1a677cSIlya Dryomov 
5878cd1a677cSIlya Dryomov 	*authorizer = auth->authorizer_buf;
5879cd1a677cSIlya Dryomov 	*authorizer_len = auth->authorizer_buf_len;
5880cd1a677cSIlya Dryomov 	return 0;
5881cd1a677cSIlya Dryomov }
5882cd1a677cSIlya Dryomov 
/*
 * Hand the final auth reply to the auth layer for this MDS session.
 * The session key and connection secret output buffers are passed
 * through unchanged; @global_id is not used here.  Returns the result
 * of ceph_auth_handle_svc_reply_done().
 */
static int mds_handle_auth_done(struct ceph_connection *con,
				u64 global_id, void *reply, int reply_len,
				u8 *session_key, int *session_key_len,
				u8 *con_secret, int *con_secret_len)
{
	struct ceph_mds_session *session = con->private;
	struct ceph_auth_client *ac = session->s_mdsc->fsc->client->monc.auth;

	return ceph_auth_handle_svc_reply_done(ac, &session->s_auth,
					       reply, reply_len,
					       session_key, session_key_len,
					       con_secret, con_secret_len);
}
5896cd1a677cSIlya Dryomov 
mds_handle_auth_bad_method(struct ceph_connection * con,int used_proto,int result,const int * allowed_protos,int proto_cnt,const int * allowed_modes,int mode_cnt)5897cd1a677cSIlya Dryomov static int mds_handle_auth_bad_method(struct ceph_connection *con,
5898cd1a677cSIlya Dryomov 				      int used_proto, int result,
5899cd1a677cSIlya Dryomov 				      const int *allowed_protos, int proto_cnt,
5900cd1a677cSIlya Dryomov 				      const int *allowed_modes, int mode_cnt)
5901cd1a677cSIlya Dryomov {
5902cd1a677cSIlya Dryomov 	struct ceph_mds_session *s = con->private;
5903cd1a677cSIlya Dryomov 	struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
5904cd1a677cSIlya Dryomov 	int ret;
5905cd1a677cSIlya Dryomov 
5906cd1a677cSIlya Dryomov 	if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
5907cd1a677cSIlya Dryomov 					    used_proto, result,
5908cd1a677cSIlya Dryomov 					    allowed_protos, proto_cnt,
5909cd1a677cSIlya Dryomov 					    allowed_modes, mode_cnt)) {
5910cd1a677cSIlya Dryomov 		ret = ceph_monc_validate_auth(monc);
5911cd1a677cSIlya Dryomov 		if (ret)
5912cd1a677cSIlya Dryomov 			return ret;
5913cd1a677cSIlya Dryomov 	}
5914cd1a677cSIlya Dryomov 
5915cd1a677cSIlya Dryomov 	return -EACCES;
5916cd1a677cSIlya Dryomov }
5917cd1a677cSIlya Dryomov 
mds_alloc_msg(struct ceph_connection * con,struct ceph_msg_header * hdr,int * skip)591853ded495SAlex Elder static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
591953ded495SAlex Elder 				struct ceph_msg_header *hdr, int *skip)
592053ded495SAlex Elder {
592153ded495SAlex Elder 	struct ceph_msg *msg;
592253ded495SAlex Elder 	int type = (int) le16_to_cpu(hdr->type);
592353ded495SAlex Elder 	int front_len = (int) le32_to_cpu(hdr->front_len);
592453ded495SAlex Elder 
592553ded495SAlex Elder 	if (con->in_msg)
592653ded495SAlex Elder 		return con->in_msg;
592753ded495SAlex Elder 
592853ded495SAlex Elder 	*skip = 0;
592953ded495SAlex Elder 	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
593053ded495SAlex Elder 	if (!msg) {
593153ded495SAlex Elder 		pr_err("unable to allocate msg type %d len %d\n",
593253ded495SAlex Elder 		       type, front_len);
593353ded495SAlex Elder 		return NULL;
593453ded495SAlex Elder 	}
593553ded495SAlex Elder 
593653ded495SAlex Elder 	return msg;
593753ded495SAlex Elder }
593853ded495SAlex Elder 
mds_sign_message(struct ceph_msg * msg)593979dbd1baSIlya Dryomov static int mds_sign_message(struct ceph_msg *msg)
594033d07337SYan, Zheng {
594179dbd1baSIlya Dryomov        struct ceph_mds_session *s = msg->con->private;
594233d07337SYan, Zheng        struct ceph_auth_handshake *auth = &s->s_auth;
594379dbd1baSIlya Dryomov 
594433d07337SYan, Zheng        return ceph_auth_sign_message(auth, msg);
594533d07337SYan, Zheng }
594633d07337SYan, Zheng 
mds_check_message_signature(struct ceph_msg * msg)594779dbd1baSIlya Dryomov static int mds_check_message_signature(struct ceph_msg *msg)
594833d07337SYan, Zheng {
594979dbd1baSIlya Dryomov        struct ceph_mds_session *s = msg->con->private;
595033d07337SYan, Zheng        struct ceph_auth_handshake *auth = &s->s_auth;
595179dbd1baSIlya Dryomov 
595233d07337SYan, Zheng        return ceph_auth_check_message_signature(auth, msg);
595333d07337SYan, Zheng }
595433d07337SYan, Zheng 
59559e32789fSTobias Klauser static const struct ceph_connection_operations mds_con_ops = {
59564972cf60SIlya Dryomov 	.get = mds_get_con,
59574972cf60SIlya Dryomov 	.put = mds_put_con,
595853ded495SAlex Elder 	.alloc_msg = mds_alloc_msg,
59594972cf60SIlya Dryomov 	.dispatch = mds_dispatch,
59604972cf60SIlya Dryomov 	.peer_reset = mds_peer_reset,
59614972cf60SIlya Dryomov 	.get_authorizer = mds_get_authorizer,
59624972cf60SIlya Dryomov 	.add_authorizer_challenge = mds_add_authorizer_challenge,
59634972cf60SIlya Dryomov 	.verify_authorizer_reply = mds_verify_authorizer_reply,
59644972cf60SIlya Dryomov 	.invalidate_authorizer = mds_invalidate_authorizer,
596579dbd1baSIlya Dryomov 	.sign_message = mds_sign_message,
596679dbd1baSIlya Dryomov 	.check_message_signature = mds_check_message_signature,
5967cd1a677cSIlya Dryomov 	.get_auth_request = mds_get_auth_request,
5968cd1a677cSIlya Dryomov 	.handle_auth_reply_more = mds_handle_auth_reply_more,
5969cd1a677cSIlya Dryomov 	.handle_auth_done = mds_handle_auth_done,
5970cd1a677cSIlya Dryomov 	.handle_auth_bad_method = mds_handle_auth_bad_method,
59712f2dc053SSage Weil };
59722f2dc053SSage Weil 
59732f2dc053SSage Weil /* eof */
5974