1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
23d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
32f2dc053SSage Weil
4496e5955SSage Weil #include <linux/fs.h>
52f2dc053SSage Weil #include <linux/wait.h>
65a0e3ad6STejun Heo #include <linux/slab.h>
754008399SYan, Zheng #include <linux/gfp.h>
82f2dc053SSage Weil #include <linux/sched.h>
93d14c5d2SYehuda Sadeh #include <linux/debugfs.h>
103d14c5d2SYehuda Sadeh #include <linux/seq_file.h>
113e0708b9SYan, Zheng #include <linux/ratelimit.h>
129ba1e224SXiubo Li #include <linux/bits.h>
1370c94820SXiubo Li #include <linux/ktime.h>
14d517b398SXiubo Li #include <linux/bitmap.h>
152f2dc053SSage Weil
162f2dc053SSage Weil #include "super.h"
173d14c5d2SYehuda Sadeh #include "mds_client.h"
182d332d5bSJeff Layton #include "crypto.h"
193d14c5d2SYehuda Sadeh
201fe60e51SSage Weil #include <linux/ceph/ceph_features.h>
213d14c5d2SYehuda Sadeh #include <linux/ceph/messenger.h>
223d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
233d14c5d2SYehuda Sadeh #include <linux/ceph/pagelist.h>
243d14c5d2SYehuda Sadeh #include <linux/ceph/auth.h>
253d14c5d2SYehuda Sadeh #include <linux/ceph/debugfs.h>
262f2dc053SSage Weil
/*
 * One page short of INT_MAX.  Presumably the upper bound on the size of a
 * single reconnect message -- NOTE(review): confirm at the use sites.
 */
#define RECONNECT_MAX_SIZE	(INT_MAX - PAGE_SIZE)
2881c5a148SYan, Zheng
/*
 * A cluster of MDS (metadata server) daemons is responsible for
 * managing the file system namespace (the directory hierarchy and
 * inodes) and for coordinating shared access to storage.  Metadata is
 * partitioned hierarchically across a number of servers, and that
 * partition varies over time as the cluster adjusts the distribution
 * in order to balance load.
 *
 * The MDS client is primarily responsible for managing synchronous
 * metadata requests for operations like open, unlink, and so forth.
 * If there is an MDS failure, we find out about it when we (possibly
 * request and) receive a new MDS map, and can resubmit affected
 * requests.
 *
 * For the most part, though, we take advantage of a lossless
 * communications channel to the MDS, and do not need to worry about
 * timing out or resubmitting requests.
 *
 * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
 * the session times out and goes stale, our leases and capabilities
 * are no longer valid.
 */
532f2dc053SSage Weil
5420cb34aeSSage Weil struct ceph_reconnect_state {
5581c5a148SYan, Zheng struct ceph_mds_session *session;
5681c5a148SYan, Zheng int nr_caps, nr_realms;
5720cb34aeSSage Weil struct ceph_pagelist *pagelist;
58121f22a1SYan, Zheng unsigned msg_version;
5981c5a148SYan, Zheng bool allow_multi;
6020cb34aeSSage Weil };
6120cb34aeSSage Weil
622f2dc053SSage Weil static void __wake_requests(struct ceph_mds_client *mdsc,
632f2dc053SSage Weil struct list_head *head);
64e3ec8d68SYan, Zheng static void ceph_cap_release_work(struct work_struct *work);
6537c4efc1SYan, Zheng static void ceph_cap_reclaim_work(struct work_struct *work);
662f2dc053SSage Weil
679e32789fSTobias Klauser static const struct ceph_connection_operations mds_con_ops;
682f2dc053SSage Weil
692f2dc053SSage Weil
702f2dc053SSage Weil /*
712f2dc053SSage Weil * mds reply parsing
722f2dc053SSage Weil */
732f2dc053SSage Weil
parse_reply_info_quota(void ** p,void * end,struct ceph_mds_reply_info_in * info)74b37fe1f9SYan, Zheng static int parse_reply_info_quota(void **p, void *end,
75b37fe1f9SYan, Zheng struct ceph_mds_reply_info_in *info)
76b37fe1f9SYan, Zheng {
77b37fe1f9SYan, Zheng u8 struct_v, struct_compat;
78b37fe1f9SYan, Zheng u32 struct_len;
79b37fe1f9SYan, Zheng
80b37fe1f9SYan, Zheng ceph_decode_8_safe(p, end, struct_v, bad);
81b37fe1f9SYan, Zheng ceph_decode_8_safe(p, end, struct_compat, bad);
82b37fe1f9SYan, Zheng /* struct_v is expected to be >= 1. we only
83b37fe1f9SYan, Zheng * understand encoding with struct_compat == 1. */
84b37fe1f9SYan, Zheng if (!struct_v || struct_compat != 1)
85b37fe1f9SYan, Zheng goto bad;
86b37fe1f9SYan, Zheng ceph_decode_32_safe(p, end, struct_len, bad);
87b37fe1f9SYan, Zheng ceph_decode_need(p, end, struct_len, bad);
88b37fe1f9SYan, Zheng end = *p + struct_len;
89b37fe1f9SYan, Zheng ceph_decode_64_safe(p, end, info->max_bytes, bad);
90b37fe1f9SYan, Zheng ceph_decode_64_safe(p, end, info->max_files, bad);
91b37fe1f9SYan, Zheng *p = end;
92b37fe1f9SYan, Zheng return 0;
93b37fe1f9SYan, Zheng bad:
94b37fe1f9SYan, Zheng return -EIO;
95b37fe1f9SYan, Zheng }
96b37fe1f9SYan, Zheng
972f2dc053SSage Weil /*
982f2dc053SSage Weil * parse individual inode info
992f2dc053SSage Weil */
parse_reply_info_in(void ** p,void * end,struct ceph_mds_reply_info_in * info,u64 features)1002f2dc053SSage Weil static int parse_reply_info_in(void **p, void *end,
10114303d20SSage Weil struct ceph_mds_reply_info_in *info,
10212b4629aSIlya Dryomov u64 features)
1032f2dc053SSage Weil {
104b37fe1f9SYan, Zheng int err = 0;
105b37fe1f9SYan, Zheng u8 struct_v = 0;
1062f2dc053SSage Weil
107b37fe1f9SYan, Zheng if (features == (u64)-1) {
108b37fe1f9SYan, Zheng u32 struct_len;
109b37fe1f9SYan, Zheng u8 struct_compat;
110b37fe1f9SYan, Zheng ceph_decode_8_safe(p, end, struct_v, bad);
111b37fe1f9SYan, Zheng ceph_decode_8_safe(p, end, struct_compat, bad);
112b37fe1f9SYan, Zheng /* struct_v is expected to be >= 1. we only understand
113b37fe1f9SYan, Zheng * encoding with struct_compat == 1. */
114b37fe1f9SYan, Zheng if (!struct_v || struct_compat != 1)
115b37fe1f9SYan, Zheng goto bad;
116b37fe1f9SYan, Zheng ceph_decode_32_safe(p, end, struct_len, bad);
117b37fe1f9SYan, Zheng ceph_decode_need(p, end, struct_len, bad);
118b37fe1f9SYan, Zheng end = *p + struct_len;
119b37fe1f9SYan, Zheng }
120b37fe1f9SYan, Zheng
121b37fe1f9SYan, Zheng ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
1222f2dc053SSage Weil info->in = *p;
1232f2dc053SSage Weil *p += sizeof(struct ceph_mds_reply_inode) +
1242f2dc053SSage Weil sizeof(*info->in->fragtree.splits) *
1252f2dc053SSage Weil le32_to_cpu(info->in->fragtree.nsplits);
1262f2dc053SSage Weil
1272f2dc053SSage Weil ceph_decode_32_safe(p, end, info->symlink_len, bad);
1282f2dc053SSage Weil ceph_decode_need(p, end, info->symlink_len, bad);
1292f2dc053SSage Weil info->symlink = *p;
1302f2dc053SSage Weil *p += info->symlink_len;
1312f2dc053SSage Weil
13214303d20SSage Weil ceph_decode_copy_safe(p, end, &info->dir_layout,
13314303d20SSage Weil sizeof(info->dir_layout), bad);
1342f2dc053SSage Weil ceph_decode_32_safe(p, end, info->xattr_len, bad);
1352f2dc053SSage Weil ceph_decode_need(p, end, info->xattr_len, bad);
1362f2dc053SSage Weil info->xattr_data = *p;
1372f2dc053SSage Weil *p += info->xattr_len;
138fb01d1f8SYan, Zheng
139b37fe1f9SYan, Zheng if (features == (u64)-1) {
140b37fe1f9SYan, Zheng /* inline data */
141b37fe1f9SYan, Zheng ceph_decode_64_safe(p, end, info->inline_version, bad);
142b37fe1f9SYan, Zheng ceph_decode_32_safe(p, end, info->inline_len, bad);
143b37fe1f9SYan, Zheng ceph_decode_need(p, end, info->inline_len, bad);
144b37fe1f9SYan, Zheng info->inline_data = *p;
145b37fe1f9SYan, Zheng *p += info->inline_len;
146b37fe1f9SYan, Zheng /* quota */
147b37fe1f9SYan, Zheng err = parse_reply_info_quota(p, end, info);
148b37fe1f9SYan, Zheng if (err < 0)
149b37fe1f9SYan, Zheng goto out_bad;
150b37fe1f9SYan, Zheng /* pool namespace */
151b37fe1f9SYan, Zheng ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
152b37fe1f9SYan, Zheng if (info->pool_ns_len > 0) {
153b37fe1f9SYan, Zheng ceph_decode_need(p, end, info->pool_ns_len, bad);
154b37fe1f9SYan, Zheng info->pool_ns_data = *p;
155b37fe1f9SYan, Zheng *p += info->pool_ns_len;
156b37fe1f9SYan, Zheng }
157245ce991SJeff Layton
158245ce991SJeff Layton /* btime */
159245ce991SJeff Layton ceph_decode_need(p, end, sizeof(info->btime), bad);
160245ce991SJeff Layton ceph_decode_copy(p, &info->btime, sizeof(info->btime));
161245ce991SJeff Layton
162245ce991SJeff Layton /* change attribute */
163a35ead31SJeff Layton ceph_decode_64_safe(p, end, info->change_attr, bad);
164b37fe1f9SYan, Zheng
16508796873SYan, Zheng /* dir pin */
16608796873SYan, Zheng if (struct_v >= 2) {
16708796873SYan, Zheng ceph_decode_32_safe(p, end, info->dir_pin, bad);
16808796873SYan, Zheng } else {
16908796873SYan, Zheng info->dir_pin = -ENODATA;
17008796873SYan, Zheng }
17108796873SYan, Zheng
172193e7b37SDavid Disseldorp /* snapshot birth time, remains zero for v<=2 */
173193e7b37SDavid Disseldorp if (struct_v >= 3) {
174193e7b37SDavid Disseldorp ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
175193e7b37SDavid Disseldorp ceph_decode_copy(p, &info->snap_btime,
176193e7b37SDavid Disseldorp sizeof(info->snap_btime));
177193e7b37SDavid Disseldorp } else {
178193e7b37SDavid Disseldorp memset(&info->snap_btime, 0, sizeof(info->snap_btime));
179193e7b37SDavid Disseldorp }
180193e7b37SDavid Disseldorp
181e7f72952SYanhu Cao /* snapshot count, remains zero for v<=3 */
182e7f72952SYanhu Cao if (struct_v >= 4) {
183e7f72952SYanhu Cao ceph_decode_64_safe(p, end, info->rsnaps, bad);
184e7f72952SYanhu Cao } else {
185e7f72952SYanhu Cao info->rsnaps = 0;
186e7f72952SYanhu Cao }
187e7f72952SYanhu Cao
1882d332d5bSJeff Layton if (struct_v >= 5) {
1892d332d5bSJeff Layton u32 alen;
1902d332d5bSJeff Layton
1912d332d5bSJeff Layton ceph_decode_32_safe(p, end, alen, bad);
1922d332d5bSJeff Layton
1932d332d5bSJeff Layton while (alen--) {
1942d332d5bSJeff Layton u32 len;
1952d332d5bSJeff Layton
1962d332d5bSJeff Layton /* key */
1972d332d5bSJeff Layton ceph_decode_32_safe(p, end, len, bad);
1982d332d5bSJeff Layton ceph_decode_skip_n(p, end, len, bad);
1992d332d5bSJeff Layton /* value */
2002d332d5bSJeff Layton ceph_decode_32_safe(p, end, len, bad);
2012d332d5bSJeff Layton ceph_decode_skip_n(p, end, len, bad);
2022d332d5bSJeff Layton }
2032d332d5bSJeff Layton }
2042d332d5bSJeff Layton
2052d332d5bSJeff Layton /* fscrypt flag -- ignore */
2062d332d5bSJeff Layton if (struct_v >= 6)
2072d332d5bSJeff Layton ceph_decode_skip_8(p, end, bad);
2082d332d5bSJeff Layton
2092d332d5bSJeff Layton info->fscrypt_auth = NULL;
2102d332d5bSJeff Layton info->fscrypt_auth_len = 0;
2112d332d5bSJeff Layton info->fscrypt_file = NULL;
2122d332d5bSJeff Layton info->fscrypt_file_len = 0;
2132d332d5bSJeff Layton if (struct_v >= 7) {
2142d332d5bSJeff Layton ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
2152d332d5bSJeff Layton if (info->fscrypt_auth_len) {
2162d332d5bSJeff Layton info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
2172d332d5bSJeff Layton GFP_KERNEL);
2182d332d5bSJeff Layton if (!info->fscrypt_auth)
2192d332d5bSJeff Layton return -ENOMEM;
2202d332d5bSJeff Layton ceph_decode_copy_safe(p, end, info->fscrypt_auth,
2212d332d5bSJeff Layton info->fscrypt_auth_len, bad);
2222d332d5bSJeff Layton }
2232d332d5bSJeff Layton ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
2242d332d5bSJeff Layton if (info->fscrypt_file_len) {
2252d332d5bSJeff Layton info->fscrypt_file = kmalloc(info->fscrypt_file_len,
2262d332d5bSJeff Layton GFP_KERNEL);
2272d332d5bSJeff Layton if (!info->fscrypt_file)
2282d332d5bSJeff Layton return -ENOMEM;
2292d332d5bSJeff Layton ceph_decode_copy_safe(p, end, info->fscrypt_file,
2302d332d5bSJeff Layton info->fscrypt_file_len, bad);
2312d332d5bSJeff Layton }
2322d332d5bSJeff Layton }
233b37fe1f9SYan, Zheng *p = end;
234b37fe1f9SYan, Zheng } else {
2352d332d5bSJeff Layton /* legacy (unversioned) struct */
236fb01d1f8SYan, Zheng if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
237fb01d1f8SYan, Zheng ceph_decode_64_safe(p, end, info->inline_version, bad);
238fb01d1f8SYan, Zheng ceph_decode_32_safe(p, end, info->inline_len, bad);
239fb01d1f8SYan, Zheng ceph_decode_need(p, end, info->inline_len, bad);
240fb01d1f8SYan, Zheng info->inline_data = *p;
241fb01d1f8SYan, Zheng *p += info->inline_len;
242fb01d1f8SYan, Zheng } else
243fb01d1f8SYan, Zheng info->inline_version = CEPH_INLINE_NONE;
244fb01d1f8SYan, Zheng
245fb18a575SLuis Henriques if (features & CEPH_FEATURE_MDS_QUOTA) {
246b37fe1f9SYan, Zheng err = parse_reply_info_quota(p, end, info);
247b37fe1f9SYan, Zheng if (err < 0)
248b37fe1f9SYan, Zheng goto out_bad;
249fb18a575SLuis Henriques } else {
250fb18a575SLuis Henriques info->max_bytes = 0;
251fb18a575SLuis Henriques info->max_files = 0;
252fb18a575SLuis Henriques }
253fb18a575SLuis Henriques
254779fe0fbSYan, Zheng info->pool_ns_len = 0;
255779fe0fbSYan, Zheng info->pool_ns_data = NULL;
2565ea5c5e0SYan, Zheng if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
2575ea5c5e0SYan, Zheng ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
258779fe0fbSYan, Zheng if (info->pool_ns_len > 0) {
2595ea5c5e0SYan, Zheng ceph_decode_need(p, end, info->pool_ns_len, bad);
260779fe0fbSYan, Zheng info->pool_ns_data = *p;
2615ea5c5e0SYan, Zheng *p += info->pool_ns_len;
262779fe0fbSYan, Zheng }
2635ea5c5e0SYan, Zheng }
26408796873SYan, Zheng
265245ce991SJeff Layton if (features & CEPH_FEATURE_FS_BTIME) {
266245ce991SJeff Layton ceph_decode_need(p, end, sizeof(info->btime), bad);
267245ce991SJeff Layton ceph_decode_copy(p, &info->btime, sizeof(info->btime));
268a35ead31SJeff Layton ceph_decode_64_safe(p, end, info->change_attr, bad);
269245ce991SJeff Layton }
270245ce991SJeff Layton
27108796873SYan, Zheng info->dir_pin = -ENODATA;
272e7f72952SYanhu Cao /* info->snap_btime and info->rsnaps remain zero */
273b37fe1f9SYan, Zheng }
2742f2dc053SSage Weil return 0;
2752f2dc053SSage Weil bad:
276b37fe1f9SYan, Zheng err = -EIO;
277b37fe1f9SYan, Zheng out_bad:
2782f2dc053SSage Weil return err;
2792f2dc053SSage Weil }
2802f2dc053SSage Weil
/*
 * Locate a dirfrag record in the reply buffer and point *@dirfrag at it.
 * The record is referenced in place, not copied.
 *
 * With features == (u64)-1 the record is wrapped in a versioned header
 * (struct_v, struct_compat, struct_len): decoding is confined to struct_len
 * bytes and the cursor ends up just past them.  Otherwise the record is the
 * bare struct followed by ndist u32 values, and the cursor ends up right
 * after those.
 *
 * Returns 0 on success, -EIO if the header is unacceptable (struct_v == 0 or
 * struct_compat != 1) or the record does not fit.
 */
static int parse_reply_info_dir(void **p, void *end,
				struct ceph_mds_reply_dirfrag **dirfrag,
				u64 features)
{
	const bool versioned = (features == (u64)-1);
	struct ceph_mds_reply_dirfrag *frag;

	if (versioned) {
		u8 version, compat;
		u32 payload_len;

		ceph_decode_8_safe(p, end, version, bad);
		ceph_decode_8_safe(p, end, compat, bad);
		/* version 0 is invalid; only compat level 1 is understood */
		if (version == 0 || compat != 1)
			goto bad;
		ceph_decode_32_safe(p, end, payload_len, bad);
		ceph_decode_need(p, end, payload_len, bad);
		/* narrow the window to this record */
		end = *p + payload_len;
	}

	ceph_decode_need(p, end, sizeof(*frag), bad);
	frag = *p;
	*dirfrag = frag;

	/* fixed part plus the trailing array of ndist u32 entries */
	*p += sizeof(*frag) + sizeof(u32) * le32_to_cpu(frag->ndist);
	if (unlikely(*p > end))
		goto bad;

	/* versioned records may carry extra fields we do not know: skip */
	if (versioned)
		*p = end;
	return 0;

bad:
	return -EIO;
}
310b37fe1f9SYan, Zheng
/*
 * Locate a dentry lease record in the reply buffer and, for versioned
 * records with struct_v >= 2, the alternate name that follows it.
 *
 * With features == (u64)-1 the record carries a versioned header (struct_v,
 * struct_compat, struct_len); otherwise it is exactly one bare
 * struct ceph_mds_reply_lease.  *@lease and *@altname point into the reply
 * buffer; nothing is copied.
 *
 * @p:           in/out cursor; left at the end of the record on success
 * @end:         end of the readable buffer
 * @lease:       receives a pointer to the lease inside the buffer
 * @features:    (u64)-1 for the versioned encoding, else a feature mask
 * @altname_len: receives the alternate name length, 0 if there is none
 * @altname:     receives a pointer to the alternate name, NULL if none
 *
 * Returns 0 on success or -EIO if the record is malformed: bad header,
 * truncated, shorter than a lease, or an alternate name that does not fit
 * inside the record.  On error *@altname / *@altname_len read as "none".
 */
static int parse_reply_info_lease(void **p, void *end,
				  struct ceph_mds_reply_lease **lease,
				  u64 features, u32 *altname_len, u8 **altname)
{
	u8 struct_v = 0;
	u32 struct_len;
	void *lend;

	/* "no alternate name" unless a v2+ record says otherwise */
	*altname_len = 0;
	*altname = NULL;

	if (features == (u64)-1) {
		u8 struct_compat;

		ceph_decode_8_safe(p, end, struct_v, bad);
		ceph_decode_8_safe(p, end, struct_compat, bad);

		/* struct_v is expected to be >= 1. we only understand
		 * encoding whose struct_compat == 1. */
		if (!struct_v || struct_compat != 1)
			goto bad;

		ceph_decode_32_safe(p, end, struct_len, bad);
		/* the payload has to hold at least the fixed-size lease */
		if (struct_len < sizeof(**lease))
			goto bad;
	} else {
		struct_len = sizeof(**lease);
	}

	/* validate the length first, only then derive the record end */
	ceph_decode_need(p, end, struct_len, bad);
	lend = *p + struct_len;

	*lease = *p;
	*p += sizeof(**lease);

	if (features == (u64)-1 && struct_v >= 2) {
		u32 len;

		/* the alternate name must lie inside this record */
		ceph_decode_32_safe(p, lend, len, bad);
		ceph_decode_need(p, lend, len, bad);
		*altname = *p;
		*altname_len = len;
		*p += len;
	}

	/* skip anything else the record carries */
	*p = lend;
	return 0;
bad:
	return -EIO;
}
358b37fe1f9SYan, Zheng
3592f2dc053SSage Weil /*
3602f2dc053SSage Weil * parse a normal reply, which may contain a (dir+)dentry and/or a
3612f2dc053SSage Weil * target inode.
3622f2dc053SSage Weil */
parse_reply_info_trace(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features)3632f2dc053SSage Weil static int parse_reply_info_trace(void **p, void *end,
36414303d20SSage Weil struct ceph_mds_reply_info_parsed *info,
36512b4629aSIlya Dryomov u64 features)
3662f2dc053SSage Weil {
3672f2dc053SSage Weil int err;
3682f2dc053SSage Weil
3692f2dc053SSage Weil if (info->head->is_dentry) {
37014303d20SSage Weil err = parse_reply_info_in(p, end, &info->diri, features);
3712f2dc053SSage Weil if (err < 0)
3722f2dc053SSage Weil goto out_bad;
3732f2dc053SSage Weil
374b37fe1f9SYan, Zheng err = parse_reply_info_dir(p, end, &info->dirfrag, features);
375b37fe1f9SYan, Zheng if (err < 0)
376b37fe1f9SYan, Zheng goto out_bad;
3772f2dc053SSage Weil
3782f2dc053SSage Weil ceph_decode_32_safe(p, end, info->dname_len, bad);
3792f2dc053SSage Weil ceph_decode_need(p, end, info->dname_len, bad);
3802f2dc053SSage Weil info->dname = *p;
3812f2dc053SSage Weil *p += info->dname_len;
382b37fe1f9SYan, Zheng
3834ac4c23eSJeff Layton err = parse_reply_info_lease(p, end, &info->dlease, features,
3844ac4c23eSJeff Layton &info->altname_len, &info->altname);
385b37fe1f9SYan, Zheng if (err < 0)
386b37fe1f9SYan, Zheng goto out_bad;
3872f2dc053SSage Weil }
3882f2dc053SSage Weil
3892f2dc053SSage Weil if (info->head->is_target) {
39014303d20SSage Weil err = parse_reply_info_in(p, end, &info->targeti, features);
3912f2dc053SSage Weil if (err < 0)
3922f2dc053SSage Weil goto out_bad;
3932f2dc053SSage Weil }
3942f2dc053SSage Weil
3952f2dc053SSage Weil if (unlikely(*p != end))
3962f2dc053SSage Weil goto bad;
3972f2dc053SSage Weil return 0;
3982f2dc053SSage Weil
3992f2dc053SSage Weil bad:
4002f2dc053SSage Weil err = -EIO;
4012f2dc053SSage Weil out_bad:
4022f2dc053SSage Weil pr_err("problem parsing mds trace %d\n", err);
4032f2dc053SSage Weil return err;
4042f2dc053SSage Weil }
4052f2dc053SSage Weil
4062f2dc053SSage Weil /*
4072f2dc053SSage Weil * parse readdir results
4082f2dc053SSage Weil */
parse_reply_info_readdir(void ** p,void * end,struct ceph_mds_request * req,u64 features)409b37fe1f9SYan, Zheng static int parse_reply_info_readdir(void **p, void *end,
4103859af9eSXiubo Li struct ceph_mds_request *req,
41112b4629aSIlya Dryomov u64 features)
4122f2dc053SSage Weil {
4133859af9eSXiubo Li struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
4142f2dc053SSage Weil u32 num, i = 0;
4152f2dc053SSage Weil int err;
4162f2dc053SSage Weil
417b37fe1f9SYan, Zheng err = parse_reply_info_dir(p, end, &info->dir_dir, features);
418b37fe1f9SYan, Zheng if (err < 0)
419b37fe1f9SYan, Zheng goto out_bad;
4202f2dc053SSage Weil
4212f2dc053SSage Weil ceph_decode_need(p, end, sizeof(num) + 2, bad);
422c89136eaSSage Weil num = ceph_decode_32(p);
423956d39d6SYan, Zheng {
424956d39d6SYan, Zheng u16 flags = ceph_decode_16(p);
425956d39d6SYan, Zheng info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
426956d39d6SYan, Zheng info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
427f3c4ebe6SYan, Zheng info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
42879162547SYan, Zheng info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
429956d39d6SYan, Zheng }
4302f2dc053SSage Weil if (num == 0)
4312f2dc053SSage Weil goto done;
4322f2dc053SSage Weil
4332a5beea3SYan, Zheng BUG_ON(!info->dir_entries);
4342a5beea3SYan, Zheng if ((unsigned long)(info->dir_entries + num) >
4352a5beea3SYan, Zheng (unsigned long)info->dir_entries + info->dir_buf_size) {
43654008399SYan, Zheng pr_err("dir contents are larger than expected\n");
43754008399SYan, Zheng WARN_ON(1);
43854008399SYan, Zheng goto bad;
43954008399SYan, Zheng }
4402f2dc053SSage Weil
44154008399SYan, Zheng info->dir_nr = num;
4422f2dc053SSage Weil while (num) {
443af9ffa6dSXiubo Li struct inode *inode = d_inode(req->r_dentry);
444af9ffa6dSXiubo Li struct ceph_inode_info *ci = ceph_inode(inode);
4452a5beea3SYan, Zheng struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
446af9ffa6dSXiubo Li struct fscrypt_str tname = FSTR_INIT(NULL, 0);
447af9ffa6dSXiubo Li struct fscrypt_str oname = FSTR_INIT(NULL, 0);
448af9ffa6dSXiubo Li struct ceph_fname fname;
449af9ffa6dSXiubo Li u32 altname_len, _name_len;
450af9ffa6dSXiubo Li u8 *altname, *_name;
451af9ffa6dSXiubo Li
4522f2dc053SSage Weil /* dentry */
453af9ffa6dSXiubo Li ceph_decode_32_safe(p, end, _name_len, bad);
454af9ffa6dSXiubo Li ceph_decode_need(p, end, _name_len, bad);
455af9ffa6dSXiubo Li _name = *p;
456af9ffa6dSXiubo Li *p += _name_len;
457af9ffa6dSXiubo Li dout("parsed dir dname '%.*s'\n", _name_len, _name);
458af9ffa6dSXiubo Li
459af9ffa6dSXiubo Li if (info->hash_order)
460af9ffa6dSXiubo Li rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
461af9ffa6dSXiubo Li _name, _name_len);
4622f2dc053SSage Weil
463b37fe1f9SYan, Zheng /* dentry lease */
4644ac4c23eSJeff Layton err = parse_reply_info_lease(p, end, &rde->lease, features,
465af9ffa6dSXiubo Li &altname_len, &altname);
466b37fe1f9SYan, Zheng if (err)
467b37fe1f9SYan, Zheng goto out_bad;
4684ac4c23eSJeff Layton
469af9ffa6dSXiubo Li /*
470af9ffa6dSXiubo Li * Try to dencrypt the dentry names and update them
471af9ffa6dSXiubo Li * in the ceph_mds_reply_dir_entry struct.
472af9ffa6dSXiubo Li */
473af9ffa6dSXiubo Li fname.dir = inode;
474af9ffa6dSXiubo Li fname.name = _name;
475af9ffa6dSXiubo Li fname.name_len = _name_len;
476af9ffa6dSXiubo Li fname.ctext = altname;
477af9ffa6dSXiubo Li fname.ctext_len = altname_len;
478af9ffa6dSXiubo Li /*
479af9ffa6dSXiubo Li * The _name_len maybe larger than altname_len, such as
480af9ffa6dSXiubo Li * when the human readable name length is in range of
481af9ffa6dSXiubo Li * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),
482af9ffa6dSXiubo Li * then the copy in ceph_fname_to_usr will corrupt the
483af9ffa6dSXiubo Li * data if there has no encryption key.
484af9ffa6dSXiubo Li *
485af9ffa6dSXiubo Li * Just set the no_copy flag and then if there has no
486af9ffa6dSXiubo Li * encryption key the oname.name will be assigned to
487af9ffa6dSXiubo Li * _name always.
488af9ffa6dSXiubo Li */
489af9ffa6dSXiubo Li fname.no_copy = true;
490af9ffa6dSXiubo Li if (altname_len == 0) {
491af9ffa6dSXiubo Li /*
492af9ffa6dSXiubo Li * Set tname to _name, and this will be used
493af9ffa6dSXiubo Li * to do the base64_decode in-place. It's
494af9ffa6dSXiubo Li * safe because the decoded string should
495af9ffa6dSXiubo Li * always be shorter, which is 3/4 of origin
496af9ffa6dSXiubo Li * string.
497af9ffa6dSXiubo Li */
498af9ffa6dSXiubo Li tname.name = _name;
499af9ffa6dSXiubo Li
500af9ffa6dSXiubo Li /*
501af9ffa6dSXiubo Li * Set oname to _name too, and this will be
502af9ffa6dSXiubo Li * used to do the dencryption in-place.
503af9ffa6dSXiubo Li */
504af9ffa6dSXiubo Li oname.name = _name;
505af9ffa6dSXiubo Li oname.len = _name_len;
506af9ffa6dSXiubo Li } else {
507af9ffa6dSXiubo Li /*
508af9ffa6dSXiubo Li * This will do the decryption only in-place
509af9ffa6dSXiubo Li * from altname cryptext directly.
510af9ffa6dSXiubo Li */
511af9ffa6dSXiubo Li oname.name = altname;
512af9ffa6dSXiubo Li oname.len = altname_len;
513af9ffa6dSXiubo Li }
514af9ffa6dSXiubo Li rde->is_nokey = false;
515af9ffa6dSXiubo Li err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
516af9ffa6dSXiubo Li if (err) {
517af9ffa6dSXiubo Li pr_err("%s unable to decode %.*s, got %d\n", __func__,
518af9ffa6dSXiubo Li _name_len, _name, err);
519af9ffa6dSXiubo Li goto out_bad;
520af9ffa6dSXiubo Li }
521af9ffa6dSXiubo Li rde->name = oname.name;
522af9ffa6dSXiubo Li rde->name_len = oname.len;
523af9ffa6dSXiubo Li
5242f2dc053SSage Weil /* inode */
5252a5beea3SYan, Zheng err = parse_reply_info_in(p, end, &rde->inode, features);
5262f2dc053SSage Weil if (err < 0)
5272f2dc053SSage Weil goto out_bad;
5288974eebdSYan, Zheng /* ceph_readdir_prepopulate() will update it */
5298974eebdSYan, Zheng rde->offset = 0;
5302f2dc053SSage Weil i++;
5312f2dc053SSage Weil num--;
5322f2dc053SSage Weil }
5332f2dc053SSage Weil
5342f2dc053SSage Weil done:
5351d3f8723SJeff Layton /* Skip over any unrecognized fields */
5361d3f8723SJeff Layton *p = end;
5372f2dc053SSage Weil return 0;
5382f2dc053SSage Weil
5392f2dc053SSage Weil bad:
5402f2dc053SSage Weil err = -EIO;
5412f2dc053SSage Weil out_bad:
5422f2dc053SSage Weil pr_err("problem parsing dir contents %d\n", err);
5432f2dc053SSage Weil return err;
5442f2dc053SSage Weil }
5452f2dc053SSage Weil
5462f2dc053SSage Weil /*
54725933abdSHerb Shiu * parse fcntl F_GETLK results
54825933abdSHerb Shiu */
parse_reply_info_filelock(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features)54925933abdSHerb Shiu static int parse_reply_info_filelock(void **p, void *end,
55014303d20SSage Weil struct ceph_mds_reply_info_parsed *info,
55112b4629aSIlya Dryomov u64 features)
55225933abdSHerb Shiu {
55325933abdSHerb Shiu if (*p + sizeof(*info->filelock_reply) > end)
55425933abdSHerb Shiu goto bad;
55525933abdSHerb Shiu
55625933abdSHerb Shiu info->filelock_reply = *p;
55725933abdSHerb Shiu
5581d3f8723SJeff Layton /* Skip over any unrecognized fields */
5591d3f8723SJeff Layton *p = end;
56025933abdSHerb Shiu return 0;
56125933abdSHerb Shiu bad:
56225933abdSHerb Shiu return -EIO;
56325933abdSHerb Shiu }
56425933abdSHerb Shiu
565d4846487SJeff Layton
566d4846487SJeff Layton #if BITS_PER_LONG == 64
567d4846487SJeff Layton
568d4846487SJeff Layton #define DELEGATED_INO_AVAILABLE xa_mk_value(1)
569d4846487SJeff Layton
ceph_parse_deleg_inos(void ** p,void * end,struct ceph_mds_session * s)570d4846487SJeff Layton static int ceph_parse_deleg_inos(void **p, void *end,
571d4846487SJeff Layton struct ceph_mds_session *s)
572d4846487SJeff Layton {
573d4846487SJeff Layton u32 sets;
574d4846487SJeff Layton
575d4846487SJeff Layton ceph_decode_32_safe(p, end, sets, bad);
576d4846487SJeff Layton dout("got %u sets of delegated inodes\n", sets);
577d4846487SJeff Layton while (sets--) {
5782ecd0eddSColin Ian King u64 start, len;
579d4846487SJeff Layton
580d4846487SJeff Layton ceph_decode_64_safe(p, end, start, bad);
581d4846487SJeff Layton ceph_decode_64_safe(p, end, len, bad);
582d4f6b31dSJeff Layton
583d4f6b31dSJeff Layton /* Don't accept a delegation of system inodes */
584d4f6b31dSJeff Layton if (start < CEPH_INO_SYSTEM_BASE) {
585d4f6b31dSJeff Layton pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
586d4f6b31dSJeff Layton start, len);
587d4f6b31dSJeff Layton continue;
588d4f6b31dSJeff Layton }
589d4846487SJeff Layton while (len--) {
5902ecd0eddSColin Ian King int err = xa_insert(&s->s_delegated_inos, start++,
591d4846487SJeff Layton DELEGATED_INO_AVAILABLE,
592d4846487SJeff Layton GFP_KERNEL);
593d4846487SJeff Layton if (!err) {
594d4846487SJeff Layton dout("added delegated inode 0x%llx\n",
595d4846487SJeff Layton start - 1);
596d4846487SJeff Layton } else if (err == -EBUSY) {
5974868e537SXiubo Li pr_warn("MDS delegated inode 0x%llx more than once.\n",
598d4846487SJeff Layton start - 1);
599d4846487SJeff Layton } else {
600d4846487SJeff Layton return err;
601d4846487SJeff Layton }
602d4846487SJeff Layton }
603d4846487SJeff Layton }
604d4846487SJeff Layton return 0;
605d4846487SJeff Layton bad:
606d4846487SJeff Layton return -EIO;
607d4846487SJeff Layton }
608d4846487SJeff Layton
ceph_get_deleg_ino(struct ceph_mds_session * s)609d4846487SJeff Layton u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
610d4846487SJeff Layton {
611d4846487SJeff Layton unsigned long ino;
612d4846487SJeff Layton void *val;
613d4846487SJeff Layton
614d4846487SJeff Layton xa_for_each(&s->s_delegated_inos, ino, val) {
615d4846487SJeff Layton val = xa_erase(&s->s_delegated_inos, ino);
616d4846487SJeff Layton if (val == DELEGATED_INO_AVAILABLE)
617d4846487SJeff Layton return ino;
618d4846487SJeff Layton }
619d4846487SJeff Layton return 0;
620d4846487SJeff Layton }
621d4846487SJeff Layton
/*
 * Put inode number @ino (back) into the session's set of delegated inode
 * numbers, marked as available.
 *
 * Returns the result of xa_insert(): 0 on success, a negative error
 * otherwise (-EBUSY is what the parser in this file treats as "already
 * present").
 */
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
	struct xarray *delegated = &s->s_delegated_inos;

	return xa_insert(delegated, ino, DELEGATED_INO_AVAILABLE, GFP_KERNEL);
}
627d4846487SJeff Layton #else /* BITS_PER_LONG == 64 */
628d4846487SJeff Layton /*
629d4846487SJeff Layton * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
630d4846487SJeff Layton * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
631d4846487SJeff Layton * and bottom words?
632d4846487SJeff Layton */
ceph_parse_deleg_inos(void ** p,void * end,struct ceph_mds_session * s)633d4846487SJeff Layton static int ceph_parse_deleg_inos(void **p, void *end,
634d4846487SJeff Layton struct ceph_mds_session *s)
635d4846487SJeff Layton {
636d4846487SJeff Layton u32 sets;
637d4846487SJeff Layton
638d4846487SJeff Layton ceph_decode_32_safe(p, end, sets, bad);
639d4846487SJeff Layton if (sets)
640d4846487SJeff Layton ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
641d4846487SJeff Layton return 0;
642d4846487SJeff Layton bad:
643d4846487SJeff Layton return -EIO;
644d4846487SJeff Layton }
645d4846487SJeff Layton
ceph_get_deleg_ino(struct ceph_mds_session * s)646d4846487SJeff Layton u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
647d4846487SJeff Layton {
648d4846487SJeff Layton return 0;
649d4846487SJeff Layton }
650d4846487SJeff Layton
/*
 * Variant for builds where BITS_PER_LONG != 64: nothing is tracked, so
 * there is nothing to restore.  Always reports success (0).
 */
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
	return 0;
}
655d4846487SJeff Layton #endif /* BITS_PER_LONG == 64 */
656d4846487SJeff Layton
65725933abdSHerb Shiu /*
6586e8575faSSam Lang * parse create results
6596e8575faSSam Lang */
parse_reply_info_create(void ** p,void * end,struct ceph_mds_reply_info_parsed * info,u64 features,struct ceph_mds_session * s)6606e8575faSSam Lang static int parse_reply_info_create(void **p, void *end,
6616e8575faSSam Lang struct ceph_mds_reply_info_parsed *info,
662d4846487SJeff Layton u64 features, struct ceph_mds_session *s)
6636e8575faSSam Lang {
664d4846487SJeff Layton int ret;
665d4846487SJeff Layton
666b37fe1f9SYan, Zheng if (features == (u64)-1 ||
667b37fe1f9SYan, Zheng (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
6686e8575faSSam Lang if (*p == end) {
669d4846487SJeff Layton /* Malformed reply? */
6706e8575faSSam Lang info->has_create_ino = false;
671d4846487SJeff Layton } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
6726e8575faSSam Lang info->has_create_ino = true;
67306a1ad43SJeff Layton /* struct_v, struct_compat, and len */
67406a1ad43SJeff Layton ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
6751d3f8723SJeff Layton ceph_decode_64_safe(p, end, info->ino, bad);
676d4846487SJeff Layton ret = ceph_parse_deleg_inos(p, end, s);
677d4846487SJeff Layton if (ret)
678d4846487SJeff Layton return ret;
679d4846487SJeff Layton } else {
680d4846487SJeff Layton /* legacy */
681d4846487SJeff Layton ceph_decode_64_safe(p, end, info->ino, bad);
682d4846487SJeff Layton info->has_create_ino = true;
6836e8575faSSam Lang }
6841d3f8723SJeff Layton } else {
6851d3f8723SJeff Layton if (*p != end)
6866e8575faSSam Lang goto bad;
6871d3f8723SJeff Layton }
6886e8575faSSam Lang
6891d3f8723SJeff Layton /* Skip over any unrecognized fields */
6901d3f8723SJeff Layton *p = end;
6911d3f8723SJeff Layton return 0;
6926e8575faSSam Lang bad:
6936e8575faSSam Lang return -EIO;
6946e8575faSSam Lang }
6956e8575faSSam Lang
/*
 * parse getvxattr results
 *
 * Skips two version bytes and a 32-bit payload length, then reads the value
 * length.  The value must occupy exactly the remainder of the buffer; it is
 * not copied, info->xattr_info just points into the message.
 *
 * Returns the value length on success (with *p advanced to @end), or -EIO.
 */
static int parse_reply_info_getvxattr(void **p, void *end,
				      struct ceph_mds_reply_info_parsed *info,
				      u64 features)
{
	u32 value_len;

	ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
	ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
	ceph_decode_skip_32(p, end, bad); /* skip payload length */

	ceph_decode_32_safe(p, end, value_len, bad);

	/* the value has to fill what is left, no more and no less */
	if (value_len != end - *p)
		goto bad;

	info->xattr_info.xattr_value = *p;
	info->xattr_info.xattr_value_len = value_len;
	*p = end;
	return value_len;
bad:
	return -EIO;
}
7176ddf5f16SMilind Changire
7186e8575faSSam Lang /*
71925933abdSHerb Shiu * parse extra results
72025933abdSHerb Shiu */
parse_reply_info_extra(void ** p,void * end,struct ceph_mds_request * req,u64 features,struct ceph_mds_session * s)72125933abdSHerb Shiu static int parse_reply_info_extra(void **p, void *end,
7223859af9eSXiubo Li struct ceph_mds_request *req,
723d4846487SJeff Layton u64 features, struct ceph_mds_session *s)
72425933abdSHerb Shiu {
7253859af9eSXiubo Li struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
7266df8c9d8SJeff Layton u32 op = le32_to_cpu(info->head->op);
7276df8c9d8SJeff Layton
7286df8c9d8SJeff Layton if (op == CEPH_MDS_OP_GETFILELOCK)
72914303d20SSage Weil return parse_reply_info_filelock(p, end, info, features);
7306df8c9d8SJeff Layton else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
7313859af9eSXiubo Li return parse_reply_info_readdir(p, end, req, features);
7326df8c9d8SJeff Layton else if (op == CEPH_MDS_OP_CREATE)
733d4846487SJeff Layton return parse_reply_info_create(p, end, info, features, s);
7346ddf5f16SMilind Changire else if (op == CEPH_MDS_OP_GETVXATTR)
7356ddf5f16SMilind Changire return parse_reply_info_getvxattr(p, end, info, features);
7366e8575faSSam Lang else
7376e8575faSSam Lang return -EIO;
73825933abdSHerb Shiu }
73925933abdSHerb Shiu
74025933abdSHerb Shiu /*
7412f2dc053SSage Weil * parse entire mds reply
7422f2dc053SSage Weil */
parse_reply_info(struct ceph_mds_session * s,struct ceph_msg * msg,struct ceph_mds_request * req,u64 features)743d4846487SJeff Layton static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
7443859af9eSXiubo Li struct ceph_mds_request *req, u64 features)
7452f2dc053SSage Weil {
7463859af9eSXiubo Li struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
7472f2dc053SSage Weil void *p, *end;
7482f2dc053SSage Weil u32 len;
7492f2dc053SSage Weil int err;
7502f2dc053SSage Weil
7512f2dc053SSage Weil info->head = msg->front.iov_base;
7522f2dc053SSage Weil p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
7532f2dc053SSage Weil end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
7542f2dc053SSage Weil
7552f2dc053SSage Weil /* trace */
7562f2dc053SSage Weil ceph_decode_32_safe(&p, end, len, bad);
7572f2dc053SSage Weil if (len > 0) {
75832852a81SXi Wang ceph_decode_need(&p, end, len, bad);
75914303d20SSage Weil err = parse_reply_info_trace(&p, p+len, info, features);
7602f2dc053SSage Weil if (err < 0)
7612f2dc053SSage Weil goto out_bad;
7622f2dc053SSage Weil }
7632f2dc053SSage Weil
76425933abdSHerb Shiu /* extra */
7652f2dc053SSage Weil ceph_decode_32_safe(&p, end, len, bad);
7662f2dc053SSage Weil if (len > 0) {
76732852a81SXi Wang ceph_decode_need(&p, end, len, bad);
7683859af9eSXiubo Li err = parse_reply_info_extra(&p, p+len, req, features, s);
7692f2dc053SSage Weil if (err < 0)
7702f2dc053SSage Weil goto out_bad;
7712f2dc053SSage Weil }
7722f2dc053SSage Weil
7732f2dc053SSage Weil /* snap blob */
7742f2dc053SSage Weil ceph_decode_32_safe(&p, end, len, bad);
7752f2dc053SSage Weil info->snapblob_len = len;
7762f2dc053SSage Weil info->snapblob = p;
7772f2dc053SSage Weil p += len;
7782f2dc053SSage Weil
7792f2dc053SSage Weil if (p != end)
7802f2dc053SSage Weil goto bad;
7812f2dc053SSage Weil return 0;
7822f2dc053SSage Weil
7832f2dc053SSage Weil bad:
7842f2dc053SSage Weil err = -EIO;
7852f2dc053SSage Weil out_bad:
7862f2dc053SSage Weil pr_err("mds parse_reply err %d\n", err);
7878b0da5c5SXiubo Li ceph_msg_dump(msg);
7882f2dc053SSage Weil return err;
7892f2dc053SSage Weil }
7902f2dc053SSage Weil
destroy_reply_info(struct ceph_mds_reply_info_parsed * info)7912f2dc053SSage Weil static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
7922f2dc053SSage Weil {
7932d332d5bSJeff Layton int i;
7942d332d5bSJeff Layton
7952d332d5bSJeff Layton kfree(info->diri.fscrypt_auth);
7962d332d5bSJeff Layton kfree(info->diri.fscrypt_file);
7972d332d5bSJeff Layton kfree(info->targeti.fscrypt_auth);
7982d332d5bSJeff Layton kfree(info->targeti.fscrypt_file);
7992a5beea3SYan, Zheng if (!info->dir_entries)
80054008399SYan, Zheng return;
8012d332d5bSJeff Layton
8022d332d5bSJeff Layton for (i = 0; i < info->dir_nr; i++) {
8032d332d5bSJeff Layton struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
8042d332d5bSJeff Layton
8052d332d5bSJeff Layton kfree(rde->inode.fscrypt_auth);
8062d332d5bSJeff Layton kfree(rde->inode.fscrypt_file);
8072d332d5bSJeff Layton }
8082a5beea3SYan, Zheng free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
8092f2dc053SSage Weil }
8102f2dc053SSage Weil
8114868e537SXiubo Li /*
8124868e537SXiubo Li * In async unlink case the kclient won't wait for the first reply
8134868e537SXiubo Li * from MDS and just drop all the links and unhash the dentry and then
8144868e537SXiubo Li * succeeds immediately.
8154868e537SXiubo Li *
8164868e537SXiubo Li * For any new create/link/rename,etc requests followed by using the
8174868e537SXiubo Li * same file names we must wait for the first reply of the inflight
8184868e537SXiubo Li * unlink request, or the MDS possibly will fail these following
8194868e537SXiubo Li * requests with -EEXIST if the inflight async unlink request was
8204868e537SXiubo Li * delayed for some reasons.
8214868e537SXiubo Li *
8224868e537SXiubo Li * And the worst case is that for the none async openc request it will
8234868e537SXiubo Li * successfully open the file if the CDentry hasn't been unlinked yet,
8244868e537SXiubo Li * but later the previous delayed async unlink request will remove the
8254868e537SXiubo Li * CDenty. That means the just created file is possiblly deleted later
8264868e537SXiubo Li * by accident.
8274868e537SXiubo Li *
8284868e537SXiubo Li * We need to wait for the inflight async unlink requests to finish
8294868e537SXiubo Li * when creating new files/directories by using the same file names.
8304868e537SXiubo Li */
ceph_wait_on_conflict_unlink(struct dentry * dentry)8314868e537SXiubo Li int ceph_wait_on_conflict_unlink(struct dentry *dentry)
8324868e537SXiubo Li {
833985b9ee8SXiubo Li struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
8344868e537SXiubo Li struct dentry *pdentry = dentry->d_parent;
8354868e537SXiubo Li struct dentry *udentry, *found = NULL;
8364868e537SXiubo Li struct ceph_dentry_info *di;
8374868e537SXiubo Li struct qstr dname;
8384868e537SXiubo Li u32 hash = dentry->d_name.hash;
8394868e537SXiubo Li int err;
8404868e537SXiubo Li
8414868e537SXiubo Li dname.name = dentry->d_name.name;
8424868e537SXiubo Li dname.len = dentry->d_name.len;
8434868e537SXiubo Li
8444868e537SXiubo Li rcu_read_lock();
8454868e537SXiubo Li hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
8464868e537SXiubo Li hnode, hash) {
8474868e537SXiubo Li udentry = di->dentry;
8484868e537SXiubo Li
8494868e537SXiubo Li spin_lock(&udentry->d_lock);
8504868e537SXiubo Li if (udentry->d_name.hash != hash)
8514868e537SXiubo Li goto next;
8524868e537SXiubo Li if (unlikely(udentry->d_parent != pdentry))
8534868e537SXiubo Li goto next;
8544868e537SXiubo Li if (!hash_hashed(&di->hnode))
8554868e537SXiubo Li goto next;
8564868e537SXiubo Li
8574868e537SXiubo Li if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
8584868e537SXiubo Li pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
8594868e537SXiubo Li __func__, dentry, dentry);
8604868e537SXiubo Li
8614868e537SXiubo Li if (!d_same_name(udentry, pdentry, &dname))
8624868e537SXiubo Li goto next;
8634868e537SXiubo Li
864dc32464aSAl Viro found = dget_dlock(udentry);
8654868e537SXiubo Li spin_unlock(&udentry->d_lock);
8664868e537SXiubo Li break;
8674868e537SXiubo Li next:
8684868e537SXiubo Li spin_unlock(&udentry->d_lock);
8694868e537SXiubo Li }
8704868e537SXiubo Li rcu_read_unlock();
8714868e537SXiubo Li
8724868e537SXiubo Li if (likely(!found))
8734868e537SXiubo Li return 0;
8744868e537SXiubo Li
8754868e537SXiubo Li dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
8764868e537SXiubo Li dentry, dentry, found, found);
8774868e537SXiubo Li
8784868e537SXiubo Li err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
8794868e537SXiubo Li TASK_KILLABLE);
8804868e537SXiubo Li dput(found);
8814868e537SXiubo Li return err;
8824868e537SXiubo Li }
8834868e537SXiubo Li
8842f2dc053SSage Weil
8852f2dc053SSage Weil /*
8862f2dc053SSage Weil * sessions
8872f2dc053SSage Weil */
ceph_session_state_name(int s)888a687ecafSJohn Spray const char *ceph_session_state_name(int s)
8892f2dc053SSage Weil {
8902f2dc053SSage Weil switch (s) {
8912f2dc053SSage Weil case CEPH_MDS_SESSION_NEW: return "new";
8922f2dc053SSage Weil case CEPH_MDS_SESSION_OPENING: return "opening";
8932f2dc053SSage Weil case CEPH_MDS_SESSION_OPEN: return "open";
8942f2dc053SSage Weil case CEPH_MDS_SESSION_HUNG: return "hung";
8952f2dc053SSage Weil case CEPH_MDS_SESSION_CLOSING: return "closing";
8964d681c2fSXiubo Li case CEPH_MDS_SESSION_CLOSED: return "closed";
89744ca18f2SSage Weil case CEPH_MDS_SESSION_RESTARTING: return "restarting";
8982f2dc053SSage Weil case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
899fcff415cSYan, Zheng case CEPH_MDS_SESSION_REJECTED: return "rejected";
9002f2dc053SSage Weil default: return "???";
9012f2dc053SSage Weil }
9022f2dc053SSage Weil }
9032f2dc053SSage Weil
ceph_get_mds_session(struct ceph_mds_session * s)9045b3248c6SXiubo Li struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
9052f2dc053SSage Weil {
9069f358999SJeff Layton if (refcount_inc_not_zero(&s->s_ref))
9072f2dc053SSage Weil return s;
9082f2dc053SSage Weil return NULL;
9092f2dc053SSage Weil }
9102f2dc053SSage Weil
ceph_put_mds_session(struct ceph_mds_session * s)9112f2dc053SSage Weil void ceph_put_mds_session(struct ceph_mds_session *s)
9122f2dc053SSage Weil {
9137e65624dSJeff Layton if (IS_ERR_OR_NULL(s))
9147e65624dSJeff Layton return;
9157e65624dSJeff Layton
9163997c01dSElena Reshetova if (refcount_dec_and_test(&s->s_ref)) {
9176c4a1915SAlex Elder if (s->s_auth.authorizer)
9186c1ea260SIlya Dryomov ceph_auth_destroy_authorizer(s->s_auth.authorizer);
91988828190SJeff Layton WARN_ON(mutex_is_locked(&s->s_mutex));
920d4846487SJeff Layton xa_destroy(&s->s_delegated_inos);
9212f2dc053SSage Weil kfree(s);
9222f2dc053SSage Weil }
9234e7a5dcdSSage Weil }
9242f2dc053SSage Weil
9252f2dc053SSage Weil /*
9262f2dc053SSage Weil * called under mdsc->mutex
9272f2dc053SSage Weil */
__ceph_lookup_mds_session(struct ceph_mds_client * mdsc,int mds)9282f2dc053SSage Weil struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
9292f2dc053SSage Weil int mds)
9302f2dc053SSage Weil {
931d37b1d99SMarkus Elfring if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
9322f2dc053SSage Weil return NULL;
9335b3248c6SXiubo Li return ceph_get_mds_session(mdsc->sessions[mds]);
9342f2dc053SSage Weil }
9352f2dc053SSage Weil
__have_session(struct ceph_mds_client * mdsc,int mds)9362f2dc053SSage Weil static bool __have_session(struct ceph_mds_client *mdsc, int mds)
9372f2dc053SSage Weil {
93898cfda81SChengguang Xu if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
9392f2dc053SSage Weil return false;
94098cfda81SChengguang Xu else
94198cfda81SChengguang Xu return true;
9422f2dc053SSage Weil }
9432f2dc053SSage Weil
__verify_registered_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * s)9442600d2ddSSage Weil static int __verify_registered_session(struct ceph_mds_client *mdsc,
9452600d2ddSSage Weil struct ceph_mds_session *s)
9462600d2ddSSage Weil {
9472600d2ddSSage Weil if (s->s_mds >= mdsc->max_sessions ||
9482600d2ddSSage Weil mdsc->sessions[s->s_mds] != s)
9492600d2ddSSage Weil return -ENOENT;
9502600d2ddSSage Weil return 0;
9512600d2ddSSage Weil }
9522600d2ddSSage Weil
9532f2dc053SSage Weil /*
9542f2dc053SSage Weil * create+register a new session for given mds.
9552f2dc053SSage Weil * called under mdsc->mutex.
9562f2dc053SSage Weil */
register_session(struct ceph_mds_client * mdsc,int mds)9572f2dc053SSage Weil static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
9582f2dc053SSage Weil int mds)
9592f2dc053SSage Weil {
9602f2dc053SSage Weil struct ceph_mds_session *s;
9612f2dc053SSage Weil
962a68e564aSXiubo Li if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
963a68e564aSXiubo Li return ERR_PTR(-EIO);
964a68e564aSXiubo Li
965b38c9eb4SXiubo Li if (mds >= mdsc->mdsmap->possible_max_rank)
966c338c07cSNathaniel Yazdani return ERR_PTR(-EINVAL);
967c338c07cSNathaniel Yazdani
9682f2dc053SSage Weil s = kzalloc(sizeof(*s), GFP_NOFS);
9694736b009SDan Carpenter if (!s)
9704736b009SDan Carpenter return ERR_PTR(-ENOMEM);
97147474d0bSChengguang Xu
97247474d0bSChengguang Xu if (mds >= mdsc->max_sessions) {
97347474d0bSChengguang Xu int newmax = 1 << get_count_order(mds + 1);
97447474d0bSChengguang Xu struct ceph_mds_session **sa;
97547474d0bSChengguang Xu
97647474d0bSChengguang Xu dout("%s: realloc to %d\n", __func__, newmax);
97747474d0bSChengguang Xu sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
97847474d0bSChengguang Xu if (!sa)
97947474d0bSChengguang Xu goto fail_realloc;
98047474d0bSChengguang Xu if (mdsc->sessions) {
98147474d0bSChengguang Xu memcpy(sa, mdsc->sessions,
98247474d0bSChengguang Xu mdsc->max_sessions * sizeof(void *));
98347474d0bSChengguang Xu kfree(mdsc->sessions);
98447474d0bSChengguang Xu }
98547474d0bSChengguang Xu mdsc->sessions = sa;
98647474d0bSChengguang Xu mdsc->max_sessions = newmax;
98747474d0bSChengguang Xu }
98847474d0bSChengguang Xu
98947474d0bSChengguang Xu dout("%s: mds%d\n", __func__, mds);
9902f2dc053SSage Weil s->s_mdsc = mdsc;
9912f2dc053SSage Weil s->s_mds = mds;
9922f2dc053SSage Weil s->s_state = CEPH_MDS_SESSION_NEW;
9932f2dc053SSage Weil mutex_init(&s->s_mutex);
9942f2dc053SSage Weil
995b7a9e5ddSSage Weil ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
9962f2dc053SSage Weil
99752d60f8eSJeff Layton atomic_set(&s->s_cap_gen, 1);
9981ce208a6SAlex Elder s->s_cap_ttl = jiffies - 1;
999d8fb02abSAlex Elder
1000d8fb02abSAlex Elder spin_lock_init(&s->s_cap_lock);
10012f2dc053SSage Weil INIT_LIST_HEAD(&s->s_caps);
10023997c01dSElena Reshetova refcount_set(&s->s_ref, 1);
10032f2dc053SSage Weil INIT_LIST_HEAD(&s->s_waiting);
10042f2dc053SSage Weil INIT_LIST_HEAD(&s->s_unsafe);
1005d4846487SJeff Layton xa_init(&s->s_delegated_inos);
10062f2dc053SSage Weil INIT_LIST_HEAD(&s->s_cap_releases);
1007e3ec8d68SYan, Zheng INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
1008e3ec8d68SYan, Zheng
10091cf03a68SJeff Layton INIT_LIST_HEAD(&s->s_cap_dirty);
10102f2dc053SSage Weil INIT_LIST_HEAD(&s->s_cap_flushing);
10112f2dc053SSage Weil
10122f2dc053SSage Weil mdsc->sessions[mds] = s;
101386d8f67bSYan, Zheng atomic_inc(&mdsc->num_sessions);
10143997c01dSElena Reshetova refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
101542ce56e5SSage Weil
1016b7a9e5ddSSage Weil ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
1017b7a9e5ddSSage Weil ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
101842ce56e5SSage Weil
10192f2dc053SSage Weil return s;
102042ce56e5SSage Weil
102142ce56e5SSage Weil fail_realloc:
102242ce56e5SSage Weil kfree(s);
102342ce56e5SSage Weil return ERR_PTR(-ENOMEM);
10242f2dc053SSage Weil }
10252f2dc053SSage Weil
10262f2dc053SSage Weil /*
10272f2dc053SSage Weil * called under mdsc->mutex
10282f2dc053SSage Weil */
__unregister_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * s)10292600d2ddSSage Weil static void __unregister_session(struct ceph_mds_client *mdsc,
103042ce56e5SSage Weil struct ceph_mds_session *s)
10312f2dc053SSage Weil {
10322600d2ddSSage Weil dout("__unregister_session mds%d %p\n", s->s_mds, s);
10332600d2ddSSage Weil BUG_ON(mdsc->sessions[s->s_mds] != s);
103442ce56e5SSage Weil mdsc->sessions[s->s_mds] = NULL;
103542ce56e5SSage Weil ceph_con_close(&s->s_con);
103642ce56e5SSage Weil ceph_put_mds_session(s);
103786d8f67bSYan, Zheng atomic_dec(&mdsc->num_sessions);
10382f2dc053SSage Weil }
10392f2dc053SSage Weil
10402f2dc053SSage Weil /*
10412f2dc053SSage Weil * drop session refs in request.
10422f2dc053SSage Weil *
10432f2dc053SSage Weil * should be last request ref, or hold mdsc->mutex
10442f2dc053SSage Weil */
put_request_session(struct ceph_mds_request * req)10452f2dc053SSage Weil static void put_request_session(struct ceph_mds_request *req)
10462f2dc053SSage Weil {
10472f2dc053SSage Weil if (req->r_session) {
10482f2dc053SSage Weil ceph_put_mds_session(req->r_session);
10492f2dc053SSage Weil req->r_session = NULL;
10502f2dc053SSage Weil }
10512f2dc053SSage Weil }
10522f2dc053SSage Weil
/*
 * Invoke @cb on every registered session.
 *
 * Each session is looked up (and referenced) under mdsc->mutex; the mutex is
 * dropped around the callback and the reference is put afterwards.  When
 * @check_state is true, sessions for which check_session_state() returns
 * false are skipped.
 */
void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
				void (*cb)(struct ceph_mds_session *),
				bool check_state)
{
	struct ceph_mds_session *s;
	int mds;

	mutex_lock(&mdsc->mutex);
	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
		s = __ceph_lookup_mds_session(mdsc, mds);
		if (!s)
			continue;

		if (!check_state || check_session_state(s)) {
			/* run the callback without holding mdsc->mutex */
			mutex_unlock(&mdsc->mutex);
			cb(s);
			ceph_put_mds_session(s);
			mutex_lock(&mdsc->mutex);
		} else {
			ceph_put_mds_session(s);
		}
	}
	mutex_unlock(&mdsc->mutex);
}
107959b312f3SXiubo Li
ceph_mdsc_release_request(struct kref * kref)1080153c8e6bSSage Weil void ceph_mdsc_release_request(struct kref *kref)
10812f2dc053SSage Weil {
1082153c8e6bSSage Weil struct ceph_mds_request *req = container_of(kref,
1083153c8e6bSSage Weil struct ceph_mds_request,
1084153c8e6bSSage Weil r_kref);
1085e64f44a8SXiubo Li ceph_mdsc_release_dir_caps_no_check(req);
108654008399SYan, Zheng destroy_reply_info(&req->r_reply_info);
10872f2dc053SSage Weil if (req->r_request)
10882f2dc053SSage Weil ceph_msg_put(req->r_request);
108954008399SYan, Zheng if (req->r_reply)
10902f2dc053SSage Weil ceph_msg_put(req->r_reply);
10912f2dc053SSage Weil if (req->r_inode) {
109241b02e1fSSage Weil ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
109323c2c76eSJeff Layton iput(req->r_inode);
10942f2dc053SSage Weil }
10959c1c2b35SJeff Layton if (req->r_parent) {
10963dd69aabSJeff Layton ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
109723c2c76eSJeff Layton iput(req->r_parent);
10989c1c2b35SJeff Layton }
109923c2c76eSJeff Layton iput(req->r_target_inode);
1100ec9595c0SJeff Layton iput(req->r_new_inode);
11012f2dc053SSage Weil if (req->r_dentry)
11022f2dc053SSage Weil dput(req->r_dentry);
1103844d87c3SSage Weil if (req->r_old_dentry)
1104844d87c3SSage Weil dput(req->r_old_dentry);
1105844d87c3SSage Weil if (req->r_old_dentry_dir) {
110641b02e1fSSage Weil /*
110741b02e1fSSage Weil * track (and drop pins for) r_old_dentry_dir
110841b02e1fSSage Weil * separately, since r_old_dentry's d_parent may have
110941b02e1fSSage Weil * changed between the dir mutex being dropped and
111041b02e1fSSage Weil * this request being freed.
111141b02e1fSSage Weil */
111241b02e1fSSage Weil ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
11132f2dc053SSage Weil CEPH_CAP_PIN);
111423c2c76eSJeff Layton iput(req->r_old_dentry_dir);
11152f2dc053SSage Weil }
11162f2dc053SSage Weil kfree(req->r_path1);
11172f2dc053SSage Weil kfree(req->r_path2);
11187fe0cdebSJeff Layton put_cred(req->r_cred);
111925e6bae3SYan, Zheng if (req->r_pagelist)
112025e6bae3SYan, Zheng ceph_pagelist_release(req->r_pagelist);
11212d332d5bSJeff Layton kfree(req->r_fscrypt_auth);
112224865e75SJeff Layton kfree(req->r_altname);
11232f2dc053SSage Weil put_request_session(req);
112437151668SYehuda Sadeh ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
1125428138c9SYan, Zheng WARN_ON_ONCE(!list_empty(&req->r_wait));
1126058daab7SJeff Layton kmem_cache_free(ceph_mds_request_cachep, req);
11272f2dc053SSage Weil }
11282f2dc053SSage Weil
/*
 * rbtree helpers for requests keyed by r_tid; this file uses the resulting
 * insert_request(), lookup_request() and erase_request() on
 * mdsc->request_tree.
 */
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)

/*
 * lookup request by tid, bump ref if found.
 *
 * Returns NULL when no request with @tid is in mdsc->request_tree.
 *
 * called under mdsc->mutex.
 */
static struct ceph_mds_request *
lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{
	struct ceph_mds_request *req = lookup_request(&mdsc->request_tree, tid);

	if (!req)
		return NULL;

	ceph_mdsc_get_request(req);
	return req;
}
11472f2dc053SSage Weil
11482f2dc053SSage Weil /*
11492f2dc053SSage Weil * Register an in-flight request, and assign a tid. Link to directory
11502f2dc053SSage Weil * are modifying (if any).
11512f2dc053SSage Weil *
11522f2dc053SSage Weil * Called under mdsc->mutex.
11532f2dc053SSage Weil */
__register_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req,struct inode * dir)11542f2dc053SSage Weil static void __register_request(struct ceph_mds_client *mdsc,
11552f2dc053SSage Weil struct ceph_mds_request *req,
11562f2dc053SSage Weil struct inode *dir)
11572f2dc053SSage Weil {
1158e30ee581SZhi Zhang int ret = 0;
1159e30ee581SZhi Zhang
11602f2dc053SSage Weil req->r_tid = ++mdsc->last_tid;
1161e30ee581SZhi Zhang if (req->r_num_caps) {
1162e30ee581SZhi Zhang ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
116337151668SYehuda Sadeh req->r_num_caps);
1164e30ee581SZhi Zhang if (ret < 0) {
1165e30ee581SZhi Zhang pr_err("__register_request %p "
1166e30ee581SZhi Zhang "failed to reserve caps: %d\n", req, ret);
1167e30ee581SZhi Zhang /* set req->r_err to fail early from __do_request */
1168e30ee581SZhi Zhang req->r_err = ret;
1169e30ee581SZhi Zhang return;
1170e30ee581SZhi Zhang }
1171e30ee581SZhi Zhang }
11722f2dc053SSage Weil dout("__register_request %p tid %lld\n", req, req->r_tid);
11732f2dc053SSage Weil ceph_mdsc_get_request(req);
1174fcd00b68SIlya Dryomov insert_request(&mdsc->request_tree, req);
11752f2dc053SSage Weil
11767fe0cdebSJeff Layton req->r_cred = get_current_cred();
1177cb4276ccSSage Weil
1178e8a7b8b1SYan, Zheng if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
1179e8a7b8b1SYan, Zheng mdsc->oldest_tid = req->r_tid;
1180e8a7b8b1SYan, Zheng
11812f2dc053SSage Weil if (dir) {
11823db0a2fcSJeff Layton struct ceph_inode_info *ci = ceph_inode(dir);
11833db0a2fcSJeff Layton
11843b663780SSage Weil ihold(dir);
11852f2dc053SSage Weil req->r_unsafe_dir = dir;
11863db0a2fcSJeff Layton spin_lock(&ci->i_unsafe_lock);
11873db0a2fcSJeff Layton list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
11883db0a2fcSJeff Layton spin_unlock(&ci->i_unsafe_lock);
11892f2dc053SSage Weil }
11902f2dc053SSage Weil }
11912f2dc053SSage Weil
__unregister_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)11922f2dc053SSage Weil static void __unregister_request(struct ceph_mds_client *mdsc,
11932f2dc053SSage Weil struct ceph_mds_request *req)
11942f2dc053SSage Weil {
11952f2dc053SSage Weil dout("__unregister_request %p tid %lld\n", req, req->r_tid);
1196e8a7b8b1SYan, Zheng
1197df963ea8SJeff Layton /* Never leave an unregistered request on an unsafe list! */
1198df963ea8SJeff Layton list_del_init(&req->r_unsafe_item);
1199df963ea8SJeff Layton
1200e8a7b8b1SYan, Zheng if (req->r_tid == mdsc->oldest_tid) {
1201e8a7b8b1SYan, Zheng struct rb_node *p = rb_next(&req->r_node);
1202e8a7b8b1SYan, Zheng mdsc->oldest_tid = 0;
1203e8a7b8b1SYan, Zheng while (p) {
1204e8a7b8b1SYan, Zheng struct ceph_mds_request *next_req =
1205e8a7b8b1SYan, Zheng rb_entry(p, struct ceph_mds_request, r_node);
1206e8a7b8b1SYan, Zheng if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
1207e8a7b8b1SYan, Zheng mdsc->oldest_tid = next_req->r_tid;
1208e8a7b8b1SYan, Zheng break;
1209e8a7b8b1SYan, Zheng }
1210e8a7b8b1SYan, Zheng p = rb_next(p);
1211e8a7b8b1SYan, Zheng }
1212e8a7b8b1SYan, Zheng }
1213e8a7b8b1SYan, Zheng
1214fcd00b68SIlya Dryomov erase_request(&mdsc->request_tree, req);
12152f2dc053SSage Weil
12163db0a2fcSJeff Layton if (req->r_unsafe_dir) {
12172f2dc053SSage Weil struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
12182f2dc053SSage Weil spin_lock(&ci->i_unsafe_lock);
12192f2dc053SSage Weil list_del_init(&req->r_unsafe_dir_item);
12202f2dc053SSage Weil spin_unlock(&ci->i_unsafe_lock);
12214c06ace8SYan, Zheng }
1222bc2de10dSJeff Layton if (req->r_target_inode &&
1223bc2de10dSJeff Layton test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
122468cd5b4bSYan, Zheng struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
122568cd5b4bSYan, Zheng spin_lock(&ci->i_unsafe_lock);
122668cd5b4bSYan, Zheng list_del_init(&req->r_unsafe_target_item);
122768cd5b4bSYan, Zheng spin_unlock(&ci->i_unsafe_lock);
122868cd5b4bSYan, Zheng }
12293b663780SSage Weil
12304c06ace8SYan, Zheng if (req->r_unsafe_dir) {
123123c2c76eSJeff Layton iput(req->r_unsafe_dir);
12323b663780SSage Weil req->r_unsafe_dir = NULL;
12332f2dc053SSage Weil }
123494aa8ae1SSage Weil
1235fc55d2c9SYan, Zheng complete_all(&req->r_safe_completion);
1236fc55d2c9SYan, Zheng
123794aa8ae1SSage Weil ceph_mdsc_put_request(req);
12382f2dc053SSage Weil }
12392f2dc053SSage Weil
12402f2dc053SSage Weil /*
124130c71233SJeff Layton * Walk back up the dentry tree until we hit a dentry representing a
124230c71233SJeff Layton * non-snapshot inode. We do this using the rcu_read_lock (which must be held
124330c71233SJeff Layton * when calling this) to ensure that the objects won't disappear while we're
124430c71233SJeff Layton * working with them. Once we hit a candidate dentry, we attempt to take a
124530c71233SJeff Layton * reference to it, and return that as the result.
124630c71233SJeff Layton */
get_nonsnap_parent(struct dentry * dentry)1247f1075480SDan Carpenter static struct inode *get_nonsnap_parent(struct dentry *dentry)
1248f1075480SDan Carpenter {
1249f1075480SDan Carpenter struct inode *inode = NULL;
125030c71233SJeff Layton
125130c71233SJeff Layton while (dentry && !IS_ROOT(dentry)) {
125230c71233SJeff Layton inode = d_inode_rcu(dentry);
125330c71233SJeff Layton if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
125430c71233SJeff Layton break;
125530c71233SJeff Layton dentry = dentry->d_parent;
125630c71233SJeff Layton }
125730c71233SJeff Layton if (inode)
125830c71233SJeff Layton inode = igrab(inode);
125930c71233SJeff Layton return inode;
126030c71233SJeff Layton }
126130c71233SJeff Layton
126230c71233SJeff Layton /*
12632f2dc053SSage Weil * Choose mds to send request to next. If there is a hint set in the
12642f2dc053SSage Weil * request (e.g., due to a prior forward hint from the mds), use that.
12652f2dc053SSage Weil * Otherwise, consult frag tree and/or caps to identify the
12662f2dc053SSage Weil * appropriate mds. If all else fails, choose randomly.
12672f2dc053SSage Weil *
12682f2dc053SSage Weil * Called under mdsc->mutex.
12692f2dc053SSage Weil */
static int __choose_mds(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req,
			bool *random)
{
	struct inode *inode = NULL;
	struct ceph_inode_info *ci;
	struct ceph_cap *cap = NULL;
	int mode = req->r_direct_mode;
	int mds = -1;
	u32 hash = req->r_direct_hash;
	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);

	/* report "not random" unless we end up at pick_random below */
	if (random)
		*random = false;

	/*
	 * An explicit resend target wins, but only while we either have a
	 * session with it or the mdsmap reports a state > 0 for it.
	 */
	if (req->r_resend_mds >= 0) {
		if (__have_session(mdsc, req->r_resend_mds) ||
		    ceph_mdsmap_get_state(mdsc->mdsmap,
					  req->r_resend_mds) > 0) {
			dout("%s using resend_mds mds%d\n", __func__,
			     req->r_resend_mds);
			return req->r_resend_mds;
		}
	}

	if (mode == USE_RANDOM_MDS)
		goto pick_random;

	/*
	 * Pick the inode whose caps/fragtree will steer the request.  Every
	 * branch that leaves 'inode' non-NULL also holds a reference on it.
	 */
	if (req->r_inode) {
		if (ceph_snap(req->r_inode) == CEPH_SNAPDIR) {
			/* req->r_dentry is non-null for LSSNAP request */
			rcu_read_lock();
			inode = get_nonsnap_parent(req->r_dentry);
			rcu_read_unlock();
			dout("%s using snapdir's parent %p\n", __func__, inode);
		} else {
			inode = req->r_inode;
			ihold(inode);
		}
	} else if (req->r_dentry) {
		struct dentry *parent;
		struct inode *dir;

		/* a racing rename is harmless: old or new d_parent both work */
		rcu_read_lock();
		parent = READ_ONCE(req->r_dentry->d_parent);
		dir = req->r_parent;
		if (!dir)
			dir = d_inode_rcu(parent);

		if (!dir || dir->i_sb != mdsc->fsc->sb) {
			/* parent went negative or belongs to another fs */
			inode = d_inode(req->r_dentry);
			if (inode)
				ihold(inode);
		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
			/*
			 * snapped/virtual snapdir parent: direct the request
			 * based on the nearest non-snap parent inode
			 */
			inode = get_nonsnap_parent(parent);
			dout("%s using nonsnap parent %p\n", __func__, inode);
		} else {
			/* dentry target */
			inode = d_inode(req->r_dentry);
			if (inode && mode != USE_AUTH_MDS) {
				ihold(inode);
			} else {
				/* fall back to parent dir + hashed name */
				inode = igrab(dir);
				hash = ceph_dentry_hash(dir, req->r_dentry);
				is_hash = true;
			}
		}
		rcu_read_unlock();
	}

	dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
	     hash, mode);
	if (!inode)
		goto pick_random;
	ci = ceph_inode(inode);

	/* for a hashed lookup in a directory, consult the fragment info */
	if (is_hash && S_ISDIR(inode->i_mode)) {
		struct ceph_inode_frag frag;
		int found;

		ceph_choose_frag(ci, hash, &frag, &found);
		if (found) {
			if (mode == USE_ANY_MDS && frag.ndist > 0) {
				u8 r;

				/* choose a random replica */
				get_random_bytes(&r, sizeof(r));
				r %= frag.ndist;
				mds = frag.dist[r];
				dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
				     __func__, inode, ceph_vinop(inode),
				     frag.frag, mds, (int)r, frag.ndist);
				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				    CEPH_MDS_STATE_ACTIVE &&
				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
					goto put_inode;
			}

			/*
			 * No usable replica was picked, so look for the
			 * authoritative mds of this fragment instead.
			 */
			if (frag.mds >= 0) {
				mds = frag.mds;
				dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
				     __func__, inode, ceph_vinop(inode),
				     frag.frag, mds);
				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				    CEPH_MDS_STATE_ACTIVE &&
				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
					goto put_inode;
			}
			/* neither candidate was usable: prefer the auth cap */
			mode = USE_AUTH_MDS;
		}
	}

	/*
	 * Fall back to the caps we hold on the inode: the auth cap when auth
	 * is wanted, otherwise (or if there is none) the first cap in i_caps.
	 */
	spin_lock(&ci->i_ceph_lock);
	if (mode == USE_AUTH_MDS)
		cap = ci->i_auth_cap;
	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
	if (cap) {
		mds = cap->session->s_mds;
		dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
		     inode, ceph_vinop(inode), mds,
		     cap == ci->i_auth_cap ? "auth " : "", cap);
	}
	spin_unlock(&ci->i_ceph_lock);

	if (!cap) {
		iput(inode);
		goto pick_random;
	}

put_inode:
	iput(inode);
	return mds;

pick_random:
	if (random)
		*random = true;

	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
	dout("%s chose random mds%d\n", __func__, mds);
	return mds;
}
14222f2dc053SSage Weil
14232f2dc053SSage Weil
/*
 * session messages
 */
/*
 * Allocate a CEPH_MSG_CLIENT_SESSION message whose front holds just a
 * struct ceph_mds_session_head, filled in with @op and @seq in
 * little-endian form.
 *
 * Returns the new message, or NULL (after logging an error) when the
 * allocation fails.
 */
struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
	struct ceph_mds_session_head *head;
	struct ceph_msg *msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION,
					    sizeof(*head), GFP_NOFS, false);

	if (msg) {
		head = msg->front.iov_base;
		head->op = cpu_to_le32(op);
		head->seq = cpu_to_le64(seq);
		return msg;
	}

	pr_err("ENOMEM creating session %s msg\n", ceph_session_op_name(op));
	return NULL;
}
1445dbd0c8bfSJohn Spray
14469ba1e224SXiubo Li static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
14479ba1e224SXiubo Li #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
encode_supported_features(void ** p,void * end)1448b682c6d4SXiubo Li static int encode_supported_features(void **p, void *end)
1449342ce182SYan, Zheng {
14509ba1e224SXiubo Li static const size_t count = ARRAY_SIZE(feature_bits);
1451342ce182SYan, Zheng
1452342ce182SYan, Zheng if (count > 0) {
1453342ce182SYan, Zheng size_t i;
14549ba1e224SXiubo Li size_t size = FEATURE_BYTES(count);
1455fea013e0SLuís Henriques unsigned long bit;
1456342ce182SYan, Zheng
1457b682c6d4SXiubo Li if (WARN_ON_ONCE(*p + 4 + size > end))
1458b682c6d4SXiubo Li return -ERANGE;
1459b682c6d4SXiubo Li
1460342ce182SYan, Zheng ceph_encode_32(p, size);
1461342ce182SYan, Zheng memset(*p, 0, size);
1462fea013e0SLuís Henriques for (i = 0; i < count; i++) {
1463fea013e0SLuís Henriques bit = feature_bits[i];
1464fea013e0SLuís Henriques ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
1465fea013e0SLuís Henriques }
1466342ce182SYan, Zheng *p += size;
1467342ce182SYan, Zheng } else {
1468b682c6d4SXiubo Li if (WARN_ON_ONCE(*p + 4 > end))
1469b682c6d4SXiubo Li return -ERANGE;
1470b682c6d4SXiubo Li
1471342ce182SYan, Zheng ceph_encode_32(p, 0);
1472342ce182SYan, Zheng }
1473b682c6d4SXiubo Li
1474b682c6d4SXiubo Li return 0;
1475342ce182SYan, Zheng }
1476342ce182SYan, Zheng
14773b4168ddSXiubo Li static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
14783b4168ddSXiubo Li #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
encode_metric_spec(void ** p,void * end)14793b4168ddSXiubo Li static int encode_metric_spec(void **p, void *end)
14803b4168ddSXiubo Li {
14813b4168ddSXiubo Li static const size_t count = ARRAY_SIZE(metric_bits);
14823b4168ddSXiubo Li
14833b4168ddSXiubo Li /* header */
14843b4168ddSXiubo Li if (WARN_ON_ONCE(*p + 2 > end))
14853b4168ddSXiubo Li return -ERANGE;
14863b4168ddSXiubo Li
14873b4168ddSXiubo Li ceph_encode_8(p, 1); /* version */
14883b4168ddSXiubo Li ceph_encode_8(p, 1); /* compat */
14893b4168ddSXiubo Li
14903b4168ddSXiubo Li if (count > 0) {
14913b4168ddSXiubo Li size_t i;
14923b4168ddSXiubo Li size_t size = METRIC_BYTES(count);
14933b4168ddSXiubo Li
14943b4168ddSXiubo Li if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
14953b4168ddSXiubo Li return -ERANGE;
14963b4168ddSXiubo Li
14973b4168ddSXiubo Li /* metric spec info length */
14983b4168ddSXiubo Li ceph_encode_32(p, 4 + size);
14993b4168ddSXiubo Li
15003b4168ddSXiubo Li /* metric spec */
15013b4168ddSXiubo Li ceph_encode_32(p, size);
15023b4168ddSXiubo Li memset(*p, 0, size);
15033b4168ddSXiubo Li for (i = 0; i < count; i++)
15043b4168ddSXiubo Li ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
15053b4168ddSXiubo Li *p += size;
15063b4168ddSXiubo Li } else {
15073b4168ddSXiubo Li if (WARN_ON_ONCE(*p + 4 + 4 > end))
15083b4168ddSXiubo Li return -ERANGE;
15093b4168ddSXiubo Li
15103b4168ddSXiubo Li /* metric spec info length */
15113b4168ddSXiubo Li ceph_encode_32(p, 4);
15123b4168ddSXiubo Li /* metric spec */
15133b4168ddSXiubo Li ceph_encode_32(p, 0);
15143b4168ddSXiubo Li }
15153b4168ddSXiubo Li
15163b4168ddSXiubo Li return 0;
15173b4168ddSXiubo Li }
15183b4168ddSXiubo Li
1519dbd0c8bfSJohn Spray /*
1520dbd0c8bfSJohn Spray * session message, specialization for CEPH_SESSION_REQUEST_OPEN
1521dbd0c8bfSJohn Spray * to include additional client metadata fields.
1522dbd0c8bfSJohn Spray */
create_session_open_msg(struct ceph_mds_client * mdsc,u64 seq)1523dbd0c8bfSJohn Spray static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
1524dbd0c8bfSJohn Spray {
1525dbd0c8bfSJohn Spray struct ceph_msg *msg;
1526dbd0c8bfSJohn Spray struct ceph_mds_session_head *h;
15274a756db2SColin Ian King int i;
1528342ce182SYan, Zheng int extra_bytes = 0;
1529dbd0c8bfSJohn Spray int metadata_key_count = 0;
1530dbd0c8bfSJohn Spray struct ceph_options *opt = mdsc->fsc->client->options;
15313f384954SYan, Zheng struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
15329ba1e224SXiubo Li size_t size, count;
1533342ce182SYan, Zheng void *p, *end;
1534b682c6d4SXiubo Li int ret;
1535dbd0c8bfSJohn Spray
1536a6a5ce4fSYan, Zheng const char* metadata[][2] = {
1537717e6f28SYan, Zheng {"hostname", mdsc->nodename},
1538717e6f28SYan, Zheng {"kernel_version", init_utsname()->release},
15393f384954SYan, Zheng {"entity_id", opt->name ? : ""},
15403f384954SYan, Zheng {"root", fsopt->server_path ? : "/"},
1541dbd0c8bfSJohn Spray {NULL, NULL}
1542dbd0c8bfSJohn Spray };
1543dbd0c8bfSJohn Spray
1544dbd0c8bfSJohn Spray /* Calculate serialized length of metadata */
1545342ce182SYan, Zheng extra_bytes = 4; /* map length */
1546d37b1d99SMarkus Elfring for (i = 0; metadata[i][0]; ++i) {
1547342ce182SYan, Zheng extra_bytes += 8 + strlen(metadata[i][0]) +
1548dbd0c8bfSJohn Spray strlen(metadata[i][1]);
1549dbd0c8bfSJohn Spray metadata_key_count++;
1550dbd0c8bfSJohn Spray }
15519ba1e224SXiubo Li
1552342ce182SYan, Zheng /* supported feature */
15539ba1e224SXiubo Li size = 0;
15549ba1e224SXiubo Li count = ARRAY_SIZE(feature_bits);
15559ba1e224SXiubo Li if (count > 0)
15569ba1e224SXiubo Li size = FEATURE_BYTES(count);
15579ba1e224SXiubo Li extra_bytes += 4 + size;
1558dbd0c8bfSJohn Spray
15593b4168ddSXiubo Li /* metric spec */
15603b4168ddSXiubo Li size = 0;
15613b4168ddSXiubo Li count = ARRAY_SIZE(metric_bits);
15623b4168ddSXiubo Li if (count > 0)
15633b4168ddSXiubo Li size = METRIC_BYTES(count);
15643b4168ddSXiubo Li extra_bytes += 2 + 4 + 4 + size;
15653b4168ddSXiubo Li
1566dbd0c8bfSJohn Spray /* Allocate the message */
1567342ce182SYan, Zheng msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
1568dbd0c8bfSJohn Spray GFP_NOFS, false);
1569dbd0c8bfSJohn Spray if (!msg) {
1570fba97e80SXiubo Li pr_err("ENOMEM creating session open msg\n");
1571b682c6d4SXiubo Li return ERR_PTR(-ENOMEM);
1572dbd0c8bfSJohn Spray }
1573342ce182SYan, Zheng p = msg->front.iov_base;
1574342ce182SYan, Zheng end = p + msg->front.iov_len;
1575342ce182SYan, Zheng
1576342ce182SYan, Zheng h = p;
1577dbd0c8bfSJohn Spray h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
1578dbd0c8bfSJohn Spray h->seq = cpu_to_le64(seq);
1579dbd0c8bfSJohn Spray
1580dbd0c8bfSJohn Spray /*
1581dbd0c8bfSJohn Spray * Serialize client metadata into waiting buffer space, using
1582dbd0c8bfSJohn Spray * the format that userspace expects for map<string, string>
15837cfa0313SJohn Spray *
15843b4168ddSXiubo Li * ClientSession messages with metadata are v4
1585dbd0c8bfSJohn Spray */
15863b4168ddSXiubo Li msg->hdr.version = cpu_to_le16(4);
15877cfa0313SJohn Spray msg->hdr.compat_version = cpu_to_le16(1);
1588dbd0c8bfSJohn Spray
1589dbd0c8bfSJohn Spray /* The write pointer, following the session_head structure */
1590342ce182SYan, Zheng p += sizeof(*h);
1591dbd0c8bfSJohn Spray
1592dbd0c8bfSJohn Spray /* Number of entries in the map */
1593dbd0c8bfSJohn Spray ceph_encode_32(&p, metadata_key_count);
1594dbd0c8bfSJohn Spray
1595dbd0c8bfSJohn Spray /* Two length-prefixed strings for each entry in the map */
1596d37b1d99SMarkus Elfring for (i = 0; metadata[i][0]; ++i) {
1597dbd0c8bfSJohn Spray size_t const key_len = strlen(metadata[i][0]);
1598dbd0c8bfSJohn Spray size_t const val_len = strlen(metadata[i][1]);
1599dbd0c8bfSJohn Spray
1600dbd0c8bfSJohn Spray ceph_encode_32(&p, key_len);
1601dbd0c8bfSJohn Spray memcpy(p, metadata[i][0], key_len);
1602dbd0c8bfSJohn Spray p += key_len;
1603dbd0c8bfSJohn Spray ceph_encode_32(&p, val_len);
1604dbd0c8bfSJohn Spray memcpy(p, metadata[i][1], val_len);
1605dbd0c8bfSJohn Spray p += val_len;
1606dbd0c8bfSJohn Spray }
1607dbd0c8bfSJohn Spray
1608b682c6d4SXiubo Li ret = encode_supported_features(&p, end);
1609b682c6d4SXiubo Li if (ret) {
1610b682c6d4SXiubo Li pr_err("encode_supported_features failed!\n");
1611b682c6d4SXiubo Li ceph_msg_put(msg);
1612b682c6d4SXiubo Li return ERR_PTR(ret);
1613b682c6d4SXiubo Li }
1614b682c6d4SXiubo Li
16153b4168ddSXiubo Li ret = encode_metric_spec(&p, end);
16163b4168ddSXiubo Li if (ret) {
16173b4168ddSXiubo Li pr_err("encode_metric_spec failed!\n");
16183b4168ddSXiubo Li ceph_msg_put(msg);
16193b4168ddSXiubo Li return ERR_PTR(ret);
16203b4168ddSXiubo Li }
16213b4168ddSXiubo Li
1622342ce182SYan, Zheng msg->front.iov_len = p - msg->front.iov_base;
1623342ce182SYan, Zheng msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1624342ce182SYan, Zheng
16252f2dc053SSage Weil return msg;
16262f2dc053SSage Weil }
16272f2dc053SSage Weil
16282f2dc053SSage Weil /*
16292f2dc053SSage Weil * send session open request.
16302f2dc053SSage Weil *
16312f2dc053SSage Weil * called under mdsc->mutex
16322f2dc053SSage Weil */
__open_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)16332f2dc053SSage Weil static int __open_session(struct ceph_mds_client *mdsc,
16342f2dc053SSage Weil struct ceph_mds_session *session)
16352f2dc053SSage Weil {
16362f2dc053SSage Weil struct ceph_msg *msg;
16372f2dc053SSage Weil int mstate;
16382f2dc053SSage Weil int mds = session->s_mds;
16392f2dc053SSage Weil
1640a68e564aSXiubo Li if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
1641a68e564aSXiubo Li return -EIO;
1642a68e564aSXiubo Li
16432f2dc053SSage Weil /* wait for mds to go active? */
16442f2dc053SSage Weil mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
16452f2dc053SSage Weil dout("open_session to mds%d (%s)\n", mds,
16462f2dc053SSage Weil ceph_mds_state_name(mstate));
16472f2dc053SSage Weil session->s_state = CEPH_MDS_SESSION_OPENING;
16482f2dc053SSage Weil session->s_renew_requested = jiffies;
16492f2dc053SSage Weil
16502f2dc053SSage Weil /* send connect message */
1651dbd0c8bfSJohn Spray msg = create_session_open_msg(mdsc, session->s_seq);
1652b682c6d4SXiubo Li if (IS_ERR(msg))
1653b682c6d4SXiubo Li return PTR_ERR(msg);
16542f2dc053SSage Weil ceph_con_send(&session->s_con, msg);
16552f2dc053SSage Weil return 0;
16562f2dc053SSage Weil }
16572f2dc053SSage Weil
16582f2dc053SSage Weil /*
1659ed0552a1SSage Weil * open sessions for any export targets for the given mds
1660ed0552a1SSage Weil *
1661ed0552a1SSage Weil * called under mdsc->mutex
1662ed0552a1SSage Weil */
16635d72d13cSYan, Zheng static struct ceph_mds_session *
__open_export_target_session(struct ceph_mds_client * mdsc,int target)16645d72d13cSYan, Zheng __open_export_target_session(struct ceph_mds_client *mdsc, int target)
16655d72d13cSYan, Zheng {
16665d72d13cSYan, Zheng struct ceph_mds_session *session;
1667b682c6d4SXiubo Li int ret;
16685d72d13cSYan, Zheng
16695d72d13cSYan, Zheng session = __ceph_lookup_mds_session(mdsc, target);
16705d72d13cSYan, Zheng if (!session) {
16715d72d13cSYan, Zheng session = register_session(mdsc, target);
16725d72d13cSYan, Zheng if (IS_ERR(session))
16735d72d13cSYan, Zheng return session;
16745d72d13cSYan, Zheng }
16755d72d13cSYan, Zheng if (session->s_state == CEPH_MDS_SESSION_NEW ||
1676b682c6d4SXiubo Li session->s_state == CEPH_MDS_SESSION_CLOSING) {
1677b682c6d4SXiubo Li ret = __open_session(mdsc, session);
1678b682c6d4SXiubo Li if (ret)
1679b682c6d4SXiubo Li return ERR_PTR(ret);
1680b682c6d4SXiubo Li }
16815d72d13cSYan, Zheng
16825d72d13cSYan, Zheng return session;
16835d72d13cSYan, Zheng }
16845d72d13cSYan, Zheng
16855d72d13cSYan, Zheng struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client * mdsc,int target)16865d72d13cSYan, Zheng ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
16875d72d13cSYan, Zheng {
16885d72d13cSYan, Zheng struct ceph_mds_session *session;
16895d72d13cSYan, Zheng
16905d72d13cSYan, Zheng dout("open_export_target_session to mds%d\n", target);
16915d72d13cSYan, Zheng
16925d72d13cSYan, Zheng mutex_lock(&mdsc->mutex);
16935d72d13cSYan, Zheng session = __open_export_target_session(mdsc, target);
16945d72d13cSYan, Zheng mutex_unlock(&mdsc->mutex);
16955d72d13cSYan, Zheng
16965d72d13cSYan, Zheng return session;
16975d72d13cSYan, Zheng }
16985d72d13cSYan, Zheng
__open_export_target_sessions(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)1699ed0552a1SSage Weil static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
1700ed0552a1SSage Weil struct ceph_mds_session *session)
1701ed0552a1SSage Weil {
1702ed0552a1SSage Weil struct ceph_mds_info *mi;
1703ed0552a1SSage Weil struct ceph_mds_session *ts;
1704ed0552a1SSage Weil int i, mds = session->s_mds;
1705ed0552a1SSage Weil
1706b38c9eb4SXiubo Li if (mds >= mdsc->mdsmap->possible_max_rank)
1707ed0552a1SSage Weil return;
17085d72d13cSYan, Zheng
1709ed0552a1SSage Weil mi = &mdsc->mdsmap->m_info[mds];
1710ed0552a1SSage Weil dout("open_export_target_sessions for mds%d (%d targets)\n",
1711ed0552a1SSage Weil session->s_mds, mi->num_export_targets);
1712ed0552a1SSage Weil
1713ed0552a1SSage Weil for (i = 0; i < mi->num_export_targets; i++) {
17145d72d13cSYan, Zheng ts = __open_export_target_session(mdsc, mi->export_targets[i]);
1715ed0552a1SSage Weil ceph_put_mds_session(ts);
1716ed0552a1SSage Weil }
1717ed0552a1SSage Weil }
1718ed0552a1SSage Weil
ceph_mdsc_open_export_target_sessions(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)1719154f42c2SSage Weil void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
1720154f42c2SSage Weil struct ceph_mds_session *session)
1721154f42c2SSage Weil {
1722154f42c2SSage Weil mutex_lock(&mdsc->mutex);
1723154f42c2SSage Weil __open_export_target_sessions(mdsc, session);
1724154f42c2SSage Weil mutex_unlock(&mdsc->mutex);
1725154f42c2SSage Weil }
1726154f42c2SSage Weil
/*
 * session caps
 */
17302f2dc053SSage Weil
detach_cap_releases(struct ceph_mds_session * session,struct list_head * target)1731c8a96a31SJeff Layton static void detach_cap_releases(struct ceph_mds_session *session,
1732c8a96a31SJeff Layton struct list_head *target)
17332f2dc053SSage Weil {
1734c8a96a31SJeff Layton lockdep_assert_held(&session->s_cap_lock);
1735745a8e3bSYan, Zheng
1736c8a96a31SJeff Layton list_splice_init(&session->s_cap_releases, target);
1737c8a96a31SJeff Layton session->s_num_cap_releases = 0;
1738c8a96a31SJeff Layton dout("dispose_cap_releases mds%d\n", session->s_mds);
1739c8a96a31SJeff Layton }
1740c8a96a31SJeff Layton
dispose_cap_releases(struct ceph_mds_client * mdsc,struct list_head * dispose)1741c8a96a31SJeff Layton static void dispose_cap_releases(struct ceph_mds_client *mdsc,
1742c8a96a31SJeff Layton struct list_head *dispose)
1743c8a96a31SJeff Layton {
1744c8a96a31SJeff Layton while (!list_empty(dispose)) {
1745745a8e3bSYan, Zheng struct ceph_cap *cap;
1746745a8e3bSYan, Zheng /* zero out the in-progress message */
1747c8a96a31SJeff Layton cap = list_first_entry(dispose, struct ceph_cap, session_caps);
1748745a8e3bSYan, Zheng list_del(&cap->session_caps);
1749745a8e3bSYan, Zheng ceph_put_cap(mdsc, cap);
1750745a8e3bSYan, Zheng }
17512f2dc053SSage Weil }
17522f2dc053SSage Weil
cleanup_session_requests(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)17531c841a96SYan, Zheng static void cleanup_session_requests(struct ceph_mds_client *mdsc,
17541c841a96SYan, Zheng struct ceph_mds_session *session)
17551c841a96SYan, Zheng {
17561c841a96SYan, Zheng struct ceph_mds_request *req;
17571c841a96SYan, Zheng struct rb_node *p;
17581c841a96SYan, Zheng
17591c841a96SYan, Zheng dout("cleanup_session_requests mds%d\n", session->s_mds);
17601c841a96SYan, Zheng mutex_lock(&mdsc->mutex);
17611c841a96SYan, Zheng while (!list_empty(&session->s_unsafe)) {
17621c841a96SYan, Zheng req = list_first_entry(&session->s_unsafe,
17631c841a96SYan, Zheng struct ceph_mds_request, r_unsafe_item);
17643e0708b9SYan, Zheng pr_warn_ratelimited(" dropping unsafe request %llu\n",
17653e0708b9SYan, Zheng req->r_tid);
17661bd85aa6SJeff Layton if (req->r_target_inode)
17671bd85aa6SJeff Layton mapping_set_error(req->r_target_inode->i_mapping, -EIO);
17681bd85aa6SJeff Layton if (req->r_unsafe_dir)
17691bd85aa6SJeff Layton mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
17701c841a96SYan, Zheng __unregister_request(mdsc, req);
17711c841a96SYan, Zheng }
17721c841a96SYan, Zheng /* zero r_attempts, so kick_requests() will re-send requests */
17731c841a96SYan, Zheng p = rb_first(&mdsc->request_tree);
17741c841a96SYan, Zheng while (p) {
17751c841a96SYan, Zheng req = rb_entry(p, struct ceph_mds_request, r_node);
17761c841a96SYan, Zheng p = rb_next(p);
17771c841a96SYan, Zheng if (req->r_session &&
17781c841a96SYan, Zheng req->r_session->s_mds == session->s_mds)
17791c841a96SYan, Zheng req->r_attempts = 0;
17801c841a96SYan, Zheng }
17811c841a96SYan, Zheng mutex_unlock(&mdsc->mutex);
17821c841a96SYan, Zheng }
17831c841a96SYan, Zheng
17842f2dc053SSage Weil /*
1785f818a736SSage Weil * Helper to safely iterate over all caps associated with a session, with
1786f818a736SSage Weil * special care taken to handle a racing __ceph_remove_cap().
17872f2dc053SSage Weil *
1788f818a736SSage Weil * Caller must hold session s_mutex.
17892f2dc053SSage Weil */
ceph_iterate_session_caps(struct ceph_mds_session * session,int (* cb)(struct inode *,int mds,void *),void * arg)1790f5d77269SJeff Layton int ceph_iterate_session_caps(struct ceph_mds_session *session,
1791aaf67de7SXiubo Li int (*cb)(struct inode *, int mds, void *),
1792aaf67de7SXiubo Li void *arg)
17932f2dc053SSage Weil {
17947c1332b8SSage Weil struct list_head *p;
17957c1332b8SSage Weil struct ceph_cap *cap;
17967c1332b8SSage Weil struct inode *inode, *last_inode = NULL;
17977c1332b8SSage Weil struct ceph_cap *old_cap = NULL;
17982f2dc053SSage Weil int ret;
17992f2dc053SSage Weil
18002f2dc053SSage Weil dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
18012f2dc053SSage Weil spin_lock(&session->s_cap_lock);
18027c1332b8SSage Weil p = session->s_caps.next;
18037c1332b8SSage Weil while (p != &session->s_caps) {
1804aaf67de7SXiubo Li int mds;
1805aaf67de7SXiubo Li
18067c1332b8SSage Weil cap = list_entry(p, struct ceph_cap, session_caps);
1807874c8ca1SDavid Howells inode = igrab(&cap->ci->netfs.inode);
18087c1332b8SSage Weil if (!inode) {
18097c1332b8SSage Weil p = p->next;
18102f2dc053SSage Weil continue;
18117c1332b8SSage Weil }
18127c1332b8SSage Weil session->s_cap_iterator = cap;
1813aaf67de7SXiubo Li mds = cap->mds;
18142f2dc053SSage Weil spin_unlock(&session->s_cap_lock);
18157c1332b8SSage Weil
18167c1332b8SSage Weil if (last_inode) {
181723c2c76eSJeff Layton iput(last_inode);
18187c1332b8SSage Weil last_inode = NULL;
18197c1332b8SSage Weil }
18207c1332b8SSage Weil if (old_cap) {
182137151668SYehuda Sadeh ceph_put_cap(session->s_mdsc, old_cap);
18227c1332b8SSage Weil old_cap = NULL;
18237c1332b8SSage Weil }
18247c1332b8SSage Weil
1825aaf67de7SXiubo Li ret = cb(inode, mds, arg);
18267c1332b8SSage Weil last_inode = inode;
18277c1332b8SSage Weil
18282f2dc053SSage Weil spin_lock(&session->s_cap_lock);
18297c1332b8SSage Weil p = p->next;
1830d37b1d99SMarkus Elfring if (!cap->ci) {
18317c1332b8SSage Weil dout("iterate_session_caps finishing cap %p removal\n",
18327c1332b8SSage Weil cap);
18337c1332b8SSage Weil BUG_ON(cap->session != session);
1834745a8e3bSYan, Zheng cap->session = NULL;
18357c1332b8SSage Weil list_del_init(&cap->session_caps);
18367c1332b8SSage Weil session->s_nr_caps--;
18374f1d756dSXiubo Li atomic64_dec(&session->s_mdsc->metric.total_caps);
1838e3ec8d68SYan, Zheng if (cap->queue_release)
1839e3ec8d68SYan, Zheng __ceph_queue_cap_release(session, cap);
1840e3ec8d68SYan, Zheng else
18417c1332b8SSage Weil old_cap = cap; /* put_cap it w/o locks held */
18427c1332b8SSage Weil }
18435dacf091SSage Weil if (ret < 0)
18445dacf091SSage Weil goto out;
18452f2dc053SSage Weil }
18465dacf091SSage Weil ret = 0;
18475dacf091SSage Weil out:
18487c1332b8SSage Weil session->s_cap_iterator = NULL;
18492f2dc053SSage Weil spin_unlock(&session->s_cap_lock);
18507c1332b8SSage Weil
185123c2c76eSJeff Layton iput(last_inode);
18527c1332b8SSage Weil if (old_cap)
185337151668SYehuda Sadeh ceph_put_cap(session->s_mdsc, old_cap);
18547c1332b8SSage Weil
18555dacf091SSage Weil return ret;
18562f2dc053SSage Weil }
18572f2dc053SSage Weil
remove_session_caps_cb(struct inode * inode,int mds,void * arg)1858aaf67de7SXiubo Li static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
18592f2dc053SSage Weil {
18602f2dc053SSage Weil struct ceph_inode_info *ci = ceph_inode(inode);
18616c93df5dSYan, Zheng bool invalidate = false;
1862aaf67de7SXiubo Li struct ceph_cap *cap;
1863aaf67de7SXiubo Li int iputs = 0;
18646c99f254SSage Weil
1865aaf67de7SXiubo Li spin_lock(&ci->i_ceph_lock);
1866aaf67de7SXiubo Li cap = __get_cap_for_mds(ci, mds);
1867aaf67de7SXiubo Li if (cap) {
18682f2dc053SSage Weil dout(" removing cap %p, ci is %p, inode is %p\n",
1869874c8ca1SDavid Howells cap, ci, &ci->netfs.inode);
1870aaf67de7SXiubo Li
187136e6da98SJeff Layton iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
1872aaf67de7SXiubo Li }
1873be655596SSage Weil spin_unlock(&ci->i_ceph_lock);
187477310320SYan, Zheng
1875aaf67de7SXiubo Li if (cap)
187677310320SYan, Zheng wake_up_all(&ci->i_cap_wq);
18776c93df5dSYan, Zheng if (invalidate)
18786c93df5dSYan, Zheng ceph_queue_invalidate(inode);
187936e6da98SJeff Layton while (iputs--)
1880a6d37ccdSXiubo Li iput(inode);
18812f2dc053SSage Weil return 0;
18822f2dc053SSage Weil }
18832f2dc053SSage Weil
18842f2dc053SSage Weil /*
18852f2dc053SSage Weil * caller must hold session s_mutex
18862f2dc053SSage Weil */
remove_session_caps(struct ceph_mds_session * session)18872f2dc053SSage Weil static void remove_session_caps(struct ceph_mds_session *session)
18882f2dc053SSage Weil {
18896c93df5dSYan, Zheng struct ceph_fs_client *fsc = session->s_mdsc->fsc;
18906c93df5dSYan, Zheng struct super_block *sb = fsc->sb;
1891c8a96a31SJeff Layton LIST_HEAD(dispose);
1892c8a96a31SJeff Layton
18932f2dc053SSage Weil dout("remove_session_caps on %p\n", session);
1894f5d77269SJeff Layton ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
18956f60f889SYan, Zheng
1896c8799fc4SYan, Zheng wake_up_all(&fsc->mdsc->cap_flushing_wq);
1897c8799fc4SYan, Zheng
18986f60f889SYan, Zheng spin_lock(&session->s_cap_lock);
18996f60f889SYan, Zheng if (session->s_nr_caps > 0) {
19006f60f889SYan, Zheng struct inode *inode;
19016f60f889SYan, Zheng struct ceph_cap *cap, *prev = NULL;
19026f60f889SYan, Zheng struct ceph_vino vino;
19036f60f889SYan, Zheng /*
19046f60f889SYan, Zheng * iterate_session_caps() skips inodes that are being
19056f60f889SYan, Zheng * deleted, we need to wait until deletions are complete.
19066f60f889SYan, Zheng * __wait_on_freeing_inode() is designed for the job,
19076f60f889SYan, Zheng * but it is not exported, so use lookup inode function
19086f60f889SYan, Zheng * to access it.
19096f60f889SYan, Zheng */
19106f60f889SYan, Zheng while (!list_empty(&session->s_caps)) {
19116f60f889SYan, Zheng cap = list_entry(session->s_caps.next,
19126f60f889SYan, Zheng struct ceph_cap, session_caps);
19136f60f889SYan, Zheng if (cap == prev)
19146f60f889SYan, Zheng break;
19156f60f889SYan, Zheng prev = cap;
19166f60f889SYan, Zheng vino = cap->ci->i_vino;
19176f60f889SYan, Zheng spin_unlock(&session->s_cap_lock);
19186f60f889SYan, Zheng
1919ed284c49SYan, Zheng inode = ceph_find_inode(sb, vino);
192023c2c76eSJeff Layton iput(inode);
19216f60f889SYan, Zheng
19226f60f889SYan, Zheng spin_lock(&session->s_cap_lock);
19236f60f889SYan, Zheng }
19246f60f889SYan, Zheng }
1925745a8e3bSYan, Zheng
1926745a8e3bSYan, Zheng // drop cap expires and unlock s_cap_lock
1927c8a96a31SJeff Layton detach_cap_releases(session, &dispose);
19286f60f889SYan, Zheng
19292f2dc053SSage Weil BUG_ON(session->s_nr_caps > 0);
19306c99f254SSage Weil BUG_ON(!list_empty(&session->s_cap_flushing));
1931c8a96a31SJeff Layton spin_unlock(&session->s_cap_lock);
1932c8a96a31SJeff Layton dispose_cap_releases(session->s_mdsc, &dispose);
19332f2dc053SSage Weil }
19342f2dc053SSage Weil
1935d2f8bb27SYan, Zheng enum {
1936d2f8bb27SYan, Zheng RECONNECT,
1937d2f8bb27SYan, Zheng RENEWCAPS,
1938d2f8bb27SYan, Zheng FORCE_RO,
1939d2f8bb27SYan, Zheng };
1940d2f8bb27SYan, Zheng
19412f2dc053SSage Weil /*
19422f2dc053SSage Weil * wake up any threads waiting on this session's caps. if the cap is
19432f2dc053SSage Weil * old (didn't get renewed on the client reconnect), remove it now.
19442f2dc053SSage Weil *
19452f2dc053SSage Weil * caller must hold s_mutex.
19462f2dc053SSage Weil */
wake_up_session_cb(struct inode * inode,int mds,void * arg)1947aaf67de7SXiubo Li static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
19482f2dc053SSage Weil {
19490dc2570fSSage Weil struct ceph_inode_info *ci = ceph_inode(inode);
1950d2f8bb27SYan, Zheng unsigned long ev = (unsigned long)arg;
19510dc2570fSSage Weil
1952d2f8bb27SYan, Zheng if (ev == RECONNECT) {
1953be655596SSage Weil spin_lock(&ci->i_ceph_lock);
19540dc2570fSSage Weil ci->i_wanted_max_size = 0;
19550dc2570fSSage Weil ci->i_requested_max_size = 0;
1956be655596SSage Weil spin_unlock(&ci->i_ceph_lock);
1957d2f8bb27SYan, Zheng } else if (ev == RENEWCAPS) {
1958aaf67de7SXiubo Li struct ceph_cap *cap;
1959aaf67de7SXiubo Li
1960d2f8bb27SYan, Zheng spin_lock(&ci->i_ceph_lock);
1961aaf67de7SXiubo Li cap = __get_cap_for_mds(ci, mds);
1962aaf67de7SXiubo Li /* mds did not re-issue stale cap */
1963aaf67de7SXiubo Li if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
1964d2f8bb27SYan, Zheng cap->issued = cap->implemented = CEPH_CAP_PIN;
1965d2f8bb27SYan, Zheng spin_unlock(&ci->i_ceph_lock);
1966d2f8bb27SYan, Zheng } else if (ev == FORCE_RO) {
19670dc2570fSSage Weil }
1968e5360309SYan, Zheng wake_up_all(&ci->i_cap_wq);
19692f2dc053SSage Weil return 0;
19702f2dc053SSage Weil }
19712f2dc053SSage Weil
wake_up_session_caps(struct ceph_mds_session * session,int ev)1972d2f8bb27SYan, Zheng static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
19732f2dc053SSage Weil {
19742f2dc053SSage Weil dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1975f5d77269SJeff Layton ceph_iterate_session_caps(session, wake_up_session_cb,
1976d2f8bb27SYan, Zheng (void *)(unsigned long)ev);
19772f2dc053SSage Weil }
19782f2dc053SSage Weil
19792f2dc053SSage Weil /*
19802f2dc053SSage Weil * Send periodic message to MDS renewing all currently held caps. The
19812f2dc053SSage Weil * ack will reset the expiration for all caps from this session.
19822f2dc053SSage Weil *
19832f2dc053SSage Weil * caller holds s_mutex
19842f2dc053SSage Weil */
send_renew_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)19852f2dc053SSage Weil static int send_renew_caps(struct ceph_mds_client *mdsc,
19862f2dc053SSage Weil struct ceph_mds_session *session)
19872f2dc053SSage Weil {
19882f2dc053SSage Weil struct ceph_msg *msg;
19892f2dc053SSage Weil int state;
19902f2dc053SSage Weil
19912f2dc053SSage Weil if (time_after_eq(jiffies, session->s_cap_ttl) &&
19922f2dc053SSage Weil time_after_eq(session->s_cap_ttl, session->s_renew_requested))
19932f2dc053SSage Weil pr_info("mds%d caps stale\n", session->s_mds);
1994e4cb4cb8SSage Weil session->s_renew_requested = jiffies;
19952f2dc053SSage Weil
19962f2dc053SSage Weil /* do not try to renew caps until a recovering mds has reconnected
19972f2dc053SSage Weil * with its clients. */
19982f2dc053SSage Weil state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
19992f2dc053SSage Weil if (state < CEPH_MDS_STATE_RECONNECT) {
20002f2dc053SSage Weil dout("send_renew_caps ignoring mds%d (%s)\n",
20012f2dc053SSage Weil session->s_mds, ceph_mds_state_name(state));
20022f2dc053SSage Weil return 0;
20032f2dc053SSage Weil }
20042f2dc053SSage Weil
20052f2dc053SSage Weil dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
20062f2dc053SSage Weil ceph_mds_state_name(state));
2007fba97e80SXiubo Li msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
20082f2dc053SSage Weil ++session->s_renew_seq);
2009a79832f2SSage Weil if (!msg)
2010a79832f2SSage Weil return -ENOMEM;
20112f2dc053SSage Weil ceph_con_send(&session->s_con, msg);
20122f2dc053SSage Weil return 0;
20132f2dc053SSage Weil }
20142f2dc053SSage Weil
/*
 * Send a FLUSHMSG_ACK session message carrying @seq to the MDS behind
 * @session.
 *
 * @mdsc is not used here; it is kept so the signature stays unchanged
 * for existing callers.
 *
 * Returns 0 once the message is queued on the session connection,
 * -ENOMEM if it could not be allocated.
 */
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session, u64 seq)
{
	struct ceph_msg *msg;

	/* seq is a u64, so print it unsigned */
	dout("send_flushmsg_ack to mds%d (%s) seq %llu\n",
	     session->s_mds, ceph_session_state_name(session->s_state), seq);
	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}
2028186e4f7aSYan, Zheng
2029186e4f7aSYan, Zheng
20302f2dc053SSage Weil /*
20312f2dc053SSage Weil * Note new cap ttl, and any transition from stale -> not stale (fresh?).
20320dc2570fSSage Weil *
20330dc2570fSSage Weil * Called under session->s_mutex
20342f2dc053SSage Weil */
renewed_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,int is_renew)20352f2dc053SSage Weil static void renewed_caps(struct ceph_mds_client *mdsc,
20362f2dc053SSage Weil struct ceph_mds_session *session, int is_renew)
20372f2dc053SSage Weil {
20382f2dc053SSage Weil int was_stale;
20392f2dc053SSage Weil int wake = 0;
20402f2dc053SSage Weil
20412f2dc053SSage Weil spin_lock(&session->s_cap_lock);
20421ce208a6SAlex Elder was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
20432f2dc053SSage Weil
20442f2dc053SSage Weil session->s_cap_ttl = session->s_renew_requested +
20452f2dc053SSage Weil mdsc->mdsmap->m_session_timeout*HZ;
20462f2dc053SSage Weil
20472f2dc053SSage Weil if (was_stale) {
20482f2dc053SSage Weil if (time_before(jiffies, session->s_cap_ttl)) {
20492f2dc053SSage Weil pr_info("mds%d caps renewed\n", session->s_mds);
20502f2dc053SSage Weil wake = 1;
20512f2dc053SSage Weil } else {
20522f2dc053SSage Weil pr_info("mds%d caps still stale\n", session->s_mds);
20532f2dc053SSage Weil }
20542f2dc053SSage Weil }
20552f2dc053SSage Weil dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
20562f2dc053SSage Weil session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
20572f2dc053SSage Weil time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
20582f2dc053SSage Weil spin_unlock(&session->s_cap_lock);
20592f2dc053SSage Weil
20602f2dc053SSage Weil if (wake)
2061d2f8bb27SYan, Zheng wake_up_session_caps(session, RENEWCAPS);
20622f2dc053SSage Weil }
20632f2dc053SSage Weil
20642f2dc053SSage Weil /*
20652f2dc053SSage Weil * send a session close request
20662f2dc053SSage Weil */
request_close_session(struct ceph_mds_session * session)20673e699bd8SXiubo Li static int request_close_session(struct ceph_mds_session *session)
20682f2dc053SSage Weil {
20692f2dc053SSage Weil struct ceph_msg *msg;
20702f2dc053SSage Weil
20712f2dc053SSage Weil dout("request_close_session mds%d state %s seq %lld\n",
2072a687ecafSJohn Spray session->s_mds, ceph_session_state_name(session->s_state),
20732f2dc053SSage Weil session->s_seq);
2074fba97e80SXiubo Li msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
2075fba97e80SXiubo Li session->s_seq);
2076a79832f2SSage Weil if (!msg)
2077a79832f2SSage Weil return -ENOMEM;
20782f2dc053SSage Weil ceph_con_send(&session->s_con, msg);
2079fcff415cSYan, Zheng return 1;
20802f2dc053SSage Weil }
20812f2dc053SSage Weil
20822f2dc053SSage Weil /*
20832f2dc053SSage Weil * Called with s_mutex held.
20842f2dc053SSage Weil */
__close_session(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)20852f2dc053SSage Weil static int __close_session(struct ceph_mds_client *mdsc,
20862f2dc053SSage Weil struct ceph_mds_session *session)
20872f2dc053SSage Weil {
20882f2dc053SSage Weil if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
20892f2dc053SSage Weil return 0;
20902f2dc053SSage Weil session->s_state = CEPH_MDS_SESSION_CLOSING;
20913e699bd8SXiubo Li return request_close_session(session);
20922f2dc053SSage Weil }
20932f2dc053SSage Weil
drop_negative_children(struct dentry * dentry)2094040d7860SYan, Zheng static bool drop_negative_children(struct dentry *dentry)
2095040d7860SYan, Zheng {
2096040d7860SYan, Zheng struct dentry *child;
2097040d7860SYan, Zheng bool all_negative = true;
2098040d7860SYan, Zheng
2099040d7860SYan, Zheng if (!d_is_dir(dentry))
2100040d7860SYan, Zheng goto out;
2101040d7860SYan, Zheng
2102040d7860SYan, Zheng spin_lock(&dentry->d_lock);
2103040d7860SYan, Zheng list_for_each_entry(child, &dentry->d_subdirs, d_child) {
2104040d7860SYan, Zheng if (d_really_is_positive(child)) {
2105040d7860SYan, Zheng all_negative = false;
2106040d7860SYan, Zheng break;
2107040d7860SYan, Zheng }
2108040d7860SYan, Zheng }
2109040d7860SYan, Zheng spin_unlock(&dentry->d_lock);
2110040d7860SYan, Zheng
2111040d7860SYan, Zheng if (all_negative)
2112040d7860SYan, Zheng shrink_dcache_parent(dentry);
2113040d7860SYan, Zheng out:
2114040d7860SYan, Zheng return all_negative;
2115040d7860SYan, Zheng }
2116040d7860SYan, Zheng
21172f2dc053SSage Weil /*
21182f2dc053SSage Weil * Trim old(er) caps.
21192f2dc053SSage Weil *
21202f2dc053SSage Weil * Because we can't cache an inode without one or more caps, we do
21212f2dc053SSage Weil * this indirectly: if a cap is unused, we prune its aliases, at which
21222f2dc053SSage Weil * point the inode will hopefully get dropped to.
21232f2dc053SSage Weil *
21242f2dc053SSage Weil * Yes, this is a bit sloppy. Our only real goal here is to respond to
21252f2dc053SSage Weil * memory pressure from the MDS, though, so it needn't be perfect.
21262f2dc053SSage Weil */
trim_caps_cb(struct inode * inode,int mds,void * arg)2127aaf67de7SXiubo Li static int trim_caps_cb(struct inode *inode, int mds, void *arg)
21282f2dc053SSage Weil {
21292e2023e9SXiubo Li struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
2130533a2818SJeff Layton int *remaining = arg;
21312f2dc053SSage Weil struct ceph_inode_info *ci = ceph_inode(inode);
2132979abfddSYan, Zheng int used, wanted, oissued, mine;
2133aaf67de7SXiubo Li struct ceph_cap *cap;
21342f2dc053SSage Weil
2135533a2818SJeff Layton if (*remaining <= 0)
21362f2dc053SSage Weil return -1;
21372f2dc053SSage Weil
2138be655596SSage Weil spin_lock(&ci->i_ceph_lock);
2139aaf67de7SXiubo Li cap = __get_cap_for_mds(ci, mds);
2140aaf67de7SXiubo Li if (!cap) {
2141aaf67de7SXiubo Li spin_unlock(&ci->i_ceph_lock);
2142aaf67de7SXiubo Li return 0;
2143aaf67de7SXiubo Li }
21442f2dc053SSage Weil mine = cap->issued | cap->implemented;
21452f2dc053SSage Weil used = __ceph_caps_used(ci);
2146979abfddSYan, Zheng wanted = __ceph_caps_file_wanted(ci);
21472f2dc053SSage Weil oissued = __ceph_caps_issued_other(ci, cap);
21482f2dc053SSage Weil
2149979abfddSYan, Zheng dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
21502f2dc053SSage Weil inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
2151979abfddSYan, Zheng ceph_cap_string(used), ceph_cap_string(wanted));
2152979abfddSYan, Zheng if (cap == ci->i_auth_cap) {
2153622f3e25SYan, Zheng if (ci->i_dirty_caps || ci->i_flushing_caps ||
2154622f3e25SYan, Zheng !list_empty(&ci->i_cap_snaps))
2155979abfddSYan, Zheng goto out;
2156979abfddSYan, Zheng if ((used | wanted) & CEPH_CAP_ANY_WR)
2157979abfddSYan, Zheng goto out;
215889aa5930SYan, Zheng /* Note: it's possible that i_filelock_ref becomes non-zero
215989aa5930SYan, Zheng * after dropping auth caps. It doesn't hurt because reply
216089aa5930SYan, Zheng * of lock mds request will re-add auth caps. */
216189aa5930SYan, Zheng if (atomic_read(&ci->i_filelock_ref) > 0)
216289aa5930SYan, Zheng goto out;
2163979abfddSYan, Zheng }
21645e804ac4SYan, Zheng /* The inode has cached pages, but it's no longer used.
21655e804ac4SYan, Zheng * we can safely drop it */
2166525d15e8SYan, Zheng if (S_ISREG(inode->i_mode) &&
2167525d15e8SYan, Zheng wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
21685e804ac4SYan, Zheng !(oissued & CEPH_CAP_FILE_CACHE)) {
21695e804ac4SYan, Zheng used = 0;
21705e804ac4SYan, Zheng oissued = 0;
21715e804ac4SYan, Zheng }
2172979abfddSYan, Zheng if ((used | wanted) & ~oissued & mine)
21732f2dc053SSage Weil goto out; /* we need these caps */
21742f2dc053SSage Weil
21752f2dc053SSage Weil if (oissued) {
21762f2dc053SSage Weil /* we aren't the only cap.. just remove us */
21772e2023e9SXiubo Li ceph_remove_cap(mdsc, cap, true);
2178533a2818SJeff Layton (*remaining)--;
21792f2dc053SSage Weil } else {
2180040d7860SYan, Zheng struct dentry *dentry;
21815e804ac4SYan, Zheng /* try dropping referring dentries */
2182be655596SSage Weil spin_unlock(&ci->i_ceph_lock);
2183040d7860SYan, Zheng dentry = d_find_any_alias(inode);
2184040d7860SYan, Zheng if (dentry && drop_negative_children(dentry)) {
2185040d7860SYan, Zheng int count;
2186040d7860SYan, Zheng dput(dentry);
21872f2dc053SSage Weil d_prune_aliases(inode);
2188040d7860SYan, Zheng count = atomic_read(&inode->i_count);
2189040d7860SYan, Zheng if (count == 1)
2190533a2818SJeff Layton (*remaining)--;
21912f2dc053SSage Weil dout("trim_caps_cb %p cap %p pruned, count now %d\n",
2192040d7860SYan, Zheng inode, cap, count);
2193040d7860SYan, Zheng } else {
2194040d7860SYan, Zheng dput(dentry);
2195040d7860SYan, Zheng }
21962f2dc053SSage Weil return 0;
21972f2dc053SSage Weil }
21982f2dc053SSage Weil
21992f2dc053SSage Weil out:
2200be655596SSage Weil spin_unlock(&ci->i_ceph_lock);
22012f2dc053SSage Weil return 0;
22022f2dc053SSage Weil }
22032f2dc053SSage Weil
22042f2dc053SSage Weil /*
22052f2dc053SSage Weil * Trim session cap count down to some max number.
22062f2dc053SSage Weil */
ceph_trim_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,int max_caps)2207e30ee581SZhi Zhang int ceph_trim_caps(struct ceph_mds_client *mdsc,
22082f2dc053SSage Weil struct ceph_mds_session *session,
22092f2dc053SSage Weil int max_caps)
22102f2dc053SSage Weil {
22112f2dc053SSage Weil int trim_caps = session->s_nr_caps - max_caps;
22122f2dc053SSage Weil
22132f2dc053SSage Weil dout("trim_caps mds%d start: %d / %d, trim %d\n",
22142f2dc053SSage Weil session->s_mds, session->s_nr_caps, max_caps, trim_caps);
22152f2dc053SSage Weil if (trim_caps > 0) {
2216533a2818SJeff Layton int remaining = trim_caps;
2217533a2818SJeff Layton
2218533a2818SJeff Layton ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
22192f2dc053SSage Weil dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
22202f2dc053SSage Weil session->s_mds, session->s_nr_caps, max_caps,
2221533a2818SJeff Layton trim_caps - remaining);
22222f2dc053SSage Weil }
2223a56371d9SYan, Zheng
2224e3ec8d68SYan, Zheng ceph_flush_cap_releases(mdsc, session);
22252f2dc053SSage Weil return 0;
22262f2dc053SSage Weil }
22272f2dc053SSage Weil
/*
 * Check whether cap flushes up to and including @want_flush_tid are
 * done.
 *
 * Only the first entry of mdsc->cap_flush_list is examined, under
 * cap_dirty_lock. Returns 0 while that entry's tid is still
 * <= @want_flush_tid, 1 otherwise (including when the list is empty).
 */
static int check_caps_flush(struct ceph_mds_client *mdsc,
			    u64 want_flush_tid)
{
	struct ceph_cap_flush *oldest;
	int done = 1;

	spin_lock(&mdsc->cap_dirty_lock);
	if (list_empty(&mdsc->cap_flush_list))
		goto unlock;

	oldest = list_first_entry(&mdsc->cap_flush_list,
				  struct ceph_cap_flush, g_list);
	if (oldest->tid <= want_flush_tid) {
		dout("check_caps_flush still flushing tid %llu <= %llu\n",
		     oldest->tid, want_flush_tid);
		done = 0;
	}
unlock:
	spin_unlock(&mdsc->cap_dirty_lock);
	return done;
}
2247d3383a8eSYan, Zheng
22482f2dc053SSage Weil /*
22492f2dc053SSage Weil * flush all dirty inode data to disk.
22502f2dc053SSage Weil *
22518310b089SYan, Zheng * returns true if we've flushed through want_flush_tid
22522f2dc053SSage Weil */
wait_caps_flush(struct ceph_mds_client * mdsc,u64 want_flush_tid)2253affbc19aSYan, Zheng static void wait_caps_flush(struct ceph_mds_client *mdsc,
22540e294387SYan, Zheng u64 want_flush_tid)
22552f2dc053SSage Weil {
22560e294387SYan, Zheng dout("check_caps_flush want %llu\n", want_flush_tid);
22578310b089SYan, Zheng
22588310b089SYan, Zheng wait_event(mdsc->cap_flushing_wq,
22598310b089SYan, Zheng check_caps_flush(mdsc, want_flush_tid));
22608310b089SYan, Zheng
22618310b089SYan, Zheng dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
22622f2dc053SSage Weil }
22632f2dc053SSage Weil
22642f2dc053SSage Weil /*
22652f2dc053SSage Weil * called under s_mutex
22662f2dc053SSage Weil */
ceph_send_cap_releases(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2267e3ec8d68SYan, Zheng static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
22682f2dc053SSage Weil struct ceph_mds_session *session)
22692f2dc053SSage Weil {
2270745a8e3bSYan, Zheng struct ceph_msg *msg = NULL;
2271745a8e3bSYan, Zheng struct ceph_mds_cap_release *head;
2272745a8e3bSYan, Zheng struct ceph_mds_cap_item *item;
227392475f05SJeff Layton struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
2274745a8e3bSYan, Zheng struct ceph_cap *cap;
2275745a8e3bSYan, Zheng LIST_HEAD(tmp_list);
2276745a8e3bSYan, Zheng int num_cap_releases;
227792475f05SJeff Layton __le32 barrier, *cap_barrier;
227892475f05SJeff Layton
227992475f05SJeff Layton down_read(&osdc->lock);
228092475f05SJeff Layton barrier = cpu_to_le32(osdc->epoch_barrier);
228192475f05SJeff Layton up_read(&osdc->lock);
22822f2dc053SSage Weil
22832f2dc053SSage Weil spin_lock(&session->s_cap_lock);
2284745a8e3bSYan, Zheng again:
2285745a8e3bSYan, Zheng list_splice_init(&session->s_cap_releases, &tmp_list);
2286745a8e3bSYan, Zheng num_cap_releases = session->s_num_cap_releases;
2287745a8e3bSYan, Zheng session->s_num_cap_releases = 0;
22882f2dc053SSage Weil spin_unlock(&session->s_cap_lock);
2289745a8e3bSYan, Zheng
2290745a8e3bSYan, Zheng while (!list_empty(&tmp_list)) {
2291745a8e3bSYan, Zheng if (!msg) {
2292745a8e3bSYan, Zheng msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
229309cbfeafSKirill A. Shutemov PAGE_SIZE, GFP_NOFS, false);
2294745a8e3bSYan, Zheng if (!msg)
2295745a8e3bSYan, Zheng goto out_err;
2296745a8e3bSYan, Zheng head = msg->front.iov_base;
2297745a8e3bSYan, Zheng head->num = cpu_to_le32(0);
2298745a8e3bSYan, Zheng msg->front.iov_len = sizeof(*head);
229992475f05SJeff Layton
230092475f05SJeff Layton msg->hdr.version = cpu_to_le16(2);
230192475f05SJeff Layton msg->hdr.compat_version = cpu_to_le16(1);
2302745a8e3bSYan, Zheng }
230392475f05SJeff Layton
2304745a8e3bSYan, Zheng cap = list_first_entry(&tmp_list, struct ceph_cap,
2305745a8e3bSYan, Zheng session_caps);
2306745a8e3bSYan, Zheng list_del(&cap->session_caps);
2307745a8e3bSYan, Zheng num_cap_releases--;
2308745a8e3bSYan, Zheng
2309745a8e3bSYan, Zheng head = msg->front.iov_base;
23104198aba4SJeff Layton put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
23114198aba4SJeff Layton &head->num);
2312745a8e3bSYan, Zheng item = msg->front.iov_base + msg->front.iov_len;
2313745a8e3bSYan, Zheng item->ino = cpu_to_le64(cap->cap_ino);
2314745a8e3bSYan, Zheng item->cap_id = cpu_to_le64(cap->cap_id);
2315745a8e3bSYan, Zheng item->migrate_seq = cpu_to_le32(cap->mseq);
2316745a8e3bSYan, Zheng item->seq = cpu_to_le32(cap->issue_seq);
2317745a8e3bSYan, Zheng msg->front.iov_len += sizeof(*item);
2318745a8e3bSYan, Zheng
2319745a8e3bSYan, Zheng ceph_put_cap(mdsc, cap);
2320745a8e3bSYan, Zheng
2321745a8e3bSYan, Zheng if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
232292475f05SJeff Layton // Append cap_barrier field
232392475f05SJeff Layton cap_barrier = msg->front.iov_base + msg->front.iov_len;
232492475f05SJeff Layton *cap_barrier = barrier;
232592475f05SJeff Layton msg->front.iov_len += sizeof(*cap_barrier);
232692475f05SJeff Layton
23272f2dc053SSage Weil msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
23282f2dc053SSage Weil dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
23292f2dc053SSage Weil ceph_con_send(&session->s_con, msg);
2330745a8e3bSYan, Zheng msg = NULL;
2331745a8e3bSYan, Zheng }
2332745a8e3bSYan, Zheng }
2333745a8e3bSYan, Zheng
2334745a8e3bSYan, Zheng BUG_ON(num_cap_releases != 0);
2335745a8e3bSYan, Zheng
23360f8605f2SSage Weil spin_lock(&session->s_cap_lock);
2337745a8e3bSYan, Zheng if (!list_empty(&session->s_cap_releases))
2338745a8e3bSYan, Zheng goto again;
23392f2dc053SSage Weil spin_unlock(&session->s_cap_lock);
2340745a8e3bSYan, Zheng
2341745a8e3bSYan, Zheng if (msg) {
234292475f05SJeff Layton // Append cap_barrier field
234392475f05SJeff Layton cap_barrier = msg->front.iov_base + msg->front.iov_len;
234492475f05SJeff Layton *cap_barrier = barrier;
234592475f05SJeff Layton msg->front.iov_len += sizeof(*cap_barrier);
234692475f05SJeff Layton
2347745a8e3bSYan, Zheng msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
2348745a8e3bSYan, Zheng dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
2349745a8e3bSYan, Zheng ceph_con_send(&session->s_con, msg);
23502f2dc053SSage Weil }
2351745a8e3bSYan, Zheng return;
2352745a8e3bSYan, Zheng out_err:
2353745a8e3bSYan, Zheng pr_err("send_cap_releases mds%d, failed to allocate message\n",
2354745a8e3bSYan, Zheng session->s_mds);
2355745a8e3bSYan, Zheng spin_lock(&session->s_cap_lock);
2356745a8e3bSYan, Zheng list_splice(&tmp_list, &session->s_cap_releases);
2357745a8e3bSYan, Zheng session->s_num_cap_releases += num_cap_releases;
2358745a8e3bSYan, Zheng spin_unlock(&session->s_cap_lock);
2359e01a5946SSage Weil }
2360e01a5946SSage Weil
ceph_cap_release_work(struct work_struct * work)2361e3ec8d68SYan, Zheng static void ceph_cap_release_work(struct work_struct *work)
2362e3ec8d68SYan, Zheng {
2363e3ec8d68SYan, Zheng struct ceph_mds_session *session =
2364e3ec8d68SYan, Zheng container_of(work, struct ceph_mds_session, s_cap_release_work);
2365e3ec8d68SYan, Zheng
2366e3ec8d68SYan, Zheng mutex_lock(&session->s_mutex);
2367e3ec8d68SYan, Zheng if (session->s_state == CEPH_MDS_SESSION_OPEN ||
2368e3ec8d68SYan, Zheng session->s_state == CEPH_MDS_SESSION_HUNG)
2369e3ec8d68SYan, Zheng ceph_send_cap_releases(session->s_mdsc, session);
2370e3ec8d68SYan, Zheng mutex_unlock(&session->s_mutex);
2371e3ec8d68SYan, Zheng ceph_put_mds_session(session);
2372e3ec8d68SYan, Zheng }
2373e3ec8d68SYan, Zheng
ceph_flush_cap_releases(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2374e3ec8d68SYan, Zheng void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
2375e3ec8d68SYan, Zheng struct ceph_mds_session *session)
2376e3ec8d68SYan, Zheng {
2377e3ec8d68SYan, Zheng if (mdsc->stopping)
2378e3ec8d68SYan, Zheng return;
2379e3ec8d68SYan, Zheng
23805b3248c6SXiubo Li ceph_get_mds_session(session);
2381e3ec8d68SYan, Zheng if (queue_work(mdsc->fsc->cap_wq,
2382e3ec8d68SYan, Zheng &session->s_cap_release_work)) {
2383e3ec8d68SYan, Zheng dout("cap release work queued\n");
2384e3ec8d68SYan, Zheng } else {
2385e3ec8d68SYan, Zheng ceph_put_mds_session(session);
2386e3ec8d68SYan, Zheng dout("failed to queue cap release work\n");
2387e3ec8d68SYan, Zheng }
2388e3ec8d68SYan, Zheng }
2389e3ec8d68SYan, Zheng
2390e3ec8d68SYan, Zheng /*
2391e3ec8d68SYan, Zheng * caller holds session->s_cap_lock
2392e3ec8d68SYan, Zheng */
__ceph_queue_cap_release(struct ceph_mds_session * session,struct ceph_cap * cap)2393e3ec8d68SYan, Zheng void __ceph_queue_cap_release(struct ceph_mds_session *session,
2394e3ec8d68SYan, Zheng struct ceph_cap *cap)
2395e3ec8d68SYan, Zheng {
2396e3ec8d68SYan, Zheng list_add_tail(&cap->session_caps, &session->s_cap_releases);
2397e3ec8d68SYan, Zheng session->s_num_cap_releases++;
2398e3ec8d68SYan, Zheng
2399e3ec8d68SYan, Zheng if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
2400e3ec8d68SYan, Zheng ceph_flush_cap_releases(session->s_mdsc, session);
2401e3ec8d68SYan, Zheng }
2402e3ec8d68SYan, Zheng
ceph_cap_reclaim_work(struct work_struct * work)240337c4efc1SYan, Zheng static void ceph_cap_reclaim_work(struct work_struct *work)
240437c4efc1SYan, Zheng {
240537c4efc1SYan, Zheng struct ceph_mds_client *mdsc =
240637c4efc1SYan, Zheng container_of(work, struct ceph_mds_client, cap_reclaim_work);
240737c4efc1SYan, Zheng int ret = ceph_trim_dentries(mdsc);
240837c4efc1SYan, Zheng if (ret == -EAGAIN)
240937c4efc1SYan, Zheng ceph_queue_cap_reclaim_work(mdsc);
241037c4efc1SYan, Zheng }
241137c4efc1SYan, Zheng
ceph_queue_cap_reclaim_work(struct ceph_mds_client * mdsc)241237c4efc1SYan, Zheng void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
241337c4efc1SYan, Zheng {
241437c4efc1SYan, Zheng if (mdsc->stopping)
241537c4efc1SYan, Zheng return;
241637c4efc1SYan, Zheng
241737c4efc1SYan, Zheng if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
241837c4efc1SYan, Zheng dout("caps reclaim work queued\n");
241937c4efc1SYan, Zheng } else {
242037c4efc1SYan, Zheng dout("failed to queue caps release work\n");
242137c4efc1SYan, Zheng }
242237c4efc1SYan, Zheng }
242337c4efc1SYan, Zheng
ceph_reclaim_caps_nr(struct ceph_mds_client * mdsc,int nr)2424fe33032dSYan, Zheng void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
2425fe33032dSYan, Zheng {
2426fe33032dSYan, Zheng int val;
2427fe33032dSYan, Zheng if (!nr)
2428fe33032dSYan, Zheng return;
2429fe33032dSYan, Zheng val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
2430bba1560bSXiubo Li if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
2431fe33032dSYan, Zheng atomic_set(&mdsc->cap_reclaim_pending, 0);
2432fe33032dSYan, Zheng ceph_queue_cap_reclaim_work(mdsc);
2433fe33032dSYan, Zheng }
2434fe33032dSYan, Zheng }
2435fe33032dSYan, Zheng
24362f2dc053SSage Weil /*
24372f2dc053SSage Weil * requests
24382f2dc053SSage Weil */
24392f2dc053SSage Weil
ceph_alloc_readdir_reply_buffer(struct ceph_mds_request * req,struct inode * dir)244054008399SYan, Zheng int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
244154008399SYan, Zheng struct inode *dir)
244254008399SYan, Zheng {
244354008399SYan, Zheng struct ceph_inode_info *ci = ceph_inode(dir);
244454008399SYan, Zheng struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
244554008399SYan, Zheng struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
24462a5beea3SYan, Zheng size_t size = sizeof(struct ceph_mds_reply_dir_entry);
2447ad8c28a9SJeff Layton unsigned int num_entries;
2448ad8c28a9SJeff Layton int order;
244954008399SYan, Zheng
245054008399SYan, Zheng spin_lock(&ci->i_ceph_lock);
245154008399SYan, Zheng num_entries = ci->i_files + ci->i_subdirs;
245254008399SYan, Zheng spin_unlock(&ci->i_ceph_lock);
2453ad8c28a9SJeff Layton num_entries = max(num_entries, 1U);
245454008399SYan, Zheng num_entries = min(num_entries, opt->max_readdir);
245554008399SYan, Zheng
245654008399SYan, Zheng order = get_order(size * num_entries);
245754008399SYan, Zheng while (order >= 0) {
24582a5beea3SYan, Zheng rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
24592941bf53SXiubo Li __GFP_NOWARN |
24602941bf53SXiubo Li __GFP_ZERO,
246154008399SYan, Zheng order);
24622a5beea3SYan, Zheng if (rinfo->dir_entries)
246354008399SYan, Zheng break;
246454008399SYan, Zheng order--;
246554008399SYan, Zheng }
24662a5beea3SYan, Zheng if (!rinfo->dir_entries)
246754008399SYan, Zheng return -ENOMEM;
246854008399SYan, Zheng
246954008399SYan, Zheng num_entries = (PAGE_SIZE << order) / size;
247054008399SYan, Zheng num_entries = min(num_entries, opt->max_readdir);
247154008399SYan, Zheng
247254008399SYan, Zheng rinfo->dir_buf_size = PAGE_SIZE << order;
247354008399SYan, Zheng req->r_num_caps = num_entries + 1;
247454008399SYan, Zheng req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
247554008399SYan, Zheng req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
247654008399SYan, Zheng return 0;
247754008399SYan, Zheng }
247854008399SYan, Zheng
24792f2dc053SSage Weil /*
24802f2dc053SSage Weil * Create an mds request.
24812f2dc053SSage Weil */
24822f2dc053SSage Weil struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client * mdsc,int op,int mode)24832f2dc053SSage Weil ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
24842f2dc053SSage Weil {
2485058daab7SJeff Layton struct ceph_mds_request *req;
24862f2dc053SSage Weil
2487058daab7SJeff Layton req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
24882f2dc053SSage Weil if (!req)
24892f2dc053SSage Weil return ERR_PTR(-ENOMEM);
24902f2dc053SSage Weil
2491b4556396SSage Weil mutex_init(&req->r_fill_mutex);
249237151668SYehuda Sadeh req->r_mdsc = mdsc;
24932f2dc053SSage Weil req->r_started = jiffies;
249470c94820SXiubo Li req->r_start_latency = ktime_get();
24952f2dc053SSage Weil req->r_resend_mds = -1;
24962f2dc053SSage Weil INIT_LIST_HEAD(&req->r_unsafe_dir_item);
249768cd5b4bSYan, Zheng INIT_LIST_HEAD(&req->r_unsafe_target_item);
24982f2dc053SSage Weil req->r_fmode = -1;
24996eb06c46SXiubo Li req->r_feature_needed = -1;
2500153c8e6bSSage Weil kref_init(&req->r_kref);
2501fcd00b68SIlya Dryomov RB_CLEAR_NODE(&req->r_node);
25022f2dc053SSage Weil INIT_LIST_HEAD(&req->r_wait);
25032f2dc053SSage Weil init_completion(&req->r_completion);
25042f2dc053SSage Weil init_completion(&req->r_safe_completion);
25052f2dc053SSage Weil INIT_LIST_HEAD(&req->r_unsafe_item);
25062f2dc053SSage Weil
2507668c9a61SDeepa Dinamani ktime_get_coarse_real_ts64(&req->r_stamp);
2508b8e69066SSage Weil
25092f2dc053SSage Weil req->r_op = op;
25102f2dc053SSage Weil req->r_direct_mode = mode;
25112f2dc053SSage Weil return req;
25122f2dc053SSage Weil }
25132f2dc053SSage Weil
25142f2dc053SSage Weil /*
251544ca18f2SSage Weil * return oldest (lowest) request, tid in request tree, 0 if none.
25162f2dc053SSage Weil *
25172f2dc053SSage Weil * called under mdsc->mutex.
25182f2dc053SSage Weil */
__get_oldest_req(struct ceph_mds_client * mdsc)251944ca18f2SSage Weil static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
252044ca18f2SSage Weil {
252144ca18f2SSage Weil if (RB_EMPTY_ROOT(&mdsc->request_tree))
252244ca18f2SSage Weil return NULL;
252344ca18f2SSage Weil return rb_entry(rb_first(&mdsc->request_tree),
252444ca18f2SSage Weil struct ceph_mds_request, r_node);
252544ca18f2SSage Weil }
252644ca18f2SSage Weil
__get_oldest_tid(struct ceph_mds_client * mdsc)2527e8a7b8b1SYan, Zheng static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
25282f2dc053SSage Weil {
2529e8a7b8b1SYan, Zheng return mdsc->oldest_tid;
25302f2dc053SSage Weil }
25312f2dc053SSage Weil
253224865e75SJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
get_fscrypt_altname(const struct ceph_mds_request * req,u32 * plen)253324865e75SJeff Layton static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
253424865e75SJeff Layton {
253524865e75SJeff Layton struct inode *dir = req->r_parent;
253624865e75SJeff Layton struct dentry *dentry = req->r_dentry;
253724865e75SJeff Layton u8 *cryptbuf = NULL;
253824865e75SJeff Layton u32 len = 0;
253924865e75SJeff Layton int ret = 0;
254024865e75SJeff Layton
254124865e75SJeff Layton /* only encode if we have parent and dentry */
254224865e75SJeff Layton if (!dir || !dentry)
254324865e75SJeff Layton goto success;
254424865e75SJeff Layton
254524865e75SJeff Layton /* No-op unless this is encrypted */
254624865e75SJeff Layton if (!IS_ENCRYPTED(dir))
254724865e75SJeff Layton goto success;
254824865e75SJeff Layton
254914e034a6SLuís Henriques ret = ceph_fscrypt_prepare_readdir(dir);
255014e034a6SLuís Henriques if (ret < 0)
255124865e75SJeff Layton return ERR_PTR(ret);
255224865e75SJeff Layton
255324865e75SJeff Layton /* No key? Just ignore it. */
255424865e75SJeff Layton if (!fscrypt_has_encryption_key(dir))
255524865e75SJeff Layton goto success;
255624865e75SJeff Layton
255724865e75SJeff Layton if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX,
255824865e75SJeff Layton &len)) {
255924865e75SJeff Layton WARN_ON_ONCE(1);
256024865e75SJeff Layton return ERR_PTR(-ENAMETOOLONG);
256124865e75SJeff Layton }
256224865e75SJeff Layton
256324865e75SJeff Layton /* No need to append altname if name is short enough */
256424865e75SJeff Layton if (len <= CEPH_NOHASH_NAME_MAX) {
256524865e75SJeff Layton len = 0;
256624865e75SJeff Layton goto success;
256724865e75SJeff Layton }
256824865e75SJeff Layton
256924865e75SJeff Layton cryptbuf = kmalloc(len, GFP_KERNEL);
257024865e75SJeff Layton if (!cryptbuf)
257124865e75SJeff Layton return ERR_PTR(-ENOMEM);
257224865e75SJeff Layton
257324865e75SJeff Layton ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len);
257424865e75SJeff Layton if (ret) {
257524865e75SJeff Layton kfree(cryptbuf);
257624865e75SJeff Layton return ERR_PTR(ret);
257724865e75SJeff Layton }
257824865e75SJeff Layton success:
257924865e75SJeff Layton *plen = len;
258024865e75SJeff Layton return cryptbuf;
258124865e75SJeff Layton }
258224865e75SJeff Layton #else
get_fscrypt_altname(const struct ceph_mds_request * req,u32 * plen)258324865e75SJeff Layton static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
258424865e75SJeff Layton {
258524865e75SJeff Layton *plen = 0;
258624865e75SJeff Layton return NULL;
258724865e75SJeff Layton }
258824865e75SJeff Layton #endif
258924865e75SJeff Layton
25903fd945a7SJeff Layton /**
25913fd945a7SJeff Layton * ceph_mdsc_build_path - build a path string to a given dentry
25922e2023e9SXiubo Li * @mdsc: mds client
25933fd945a7SJeff Layton * @dentry: dentry to which path should be built
25943fd945a7SJeff Layton * @plen: returned length of string
25953fd945a7SJeff Layton * @pbase: returned base inode number
25963fd945a7SJeff Layton * @for_wire: is this path going to be sent to the MDS?
25972f2dc053SSage Weil *
25983fd945a7SJeff Layton * Build a string that represents the path to the dentry. This is mostly called
25993fd945a7SJeff Layton * for two different purposes:
26003fd945a7SJeff Layton *
26013fd945a7SJeff Layton * 1) we need to build a path string to send to the MDS (for_wire == true)
26023fd945a7SJeff Layton * 2) we need a path string for local presentation (e.g. debugfs)
26033fd945a7SJeff Layton * (for_wire == false)
26043fd945a7SJeff Layton *
26053fd945a7SJeff Layton * The path is built in reverse, starting with the dentry. Walk back up toward
26063fd945a7SJeff Layton * the root, building the path until the first non-snapped inode is reached
26073fd945a7SJeff Layton * (for_wire) or the root inode is reached (!for_wire).
26082f2dc053SSage Weil *
26092f2dc053SSage Weil * Encode hidden .snap dirs as a double /, i.e.
26102f2dc053SSage Weil * foo/.snap/bar -> foo//bar
26112f2dc053SSage Weil */
ceph_mdsc_build_path(struct ceph_mds_client * mdsc,struct dentry * dentry,int * plen,u64 * pbase,int for_wire)26122e2023e9SXiubo Li char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
26132e2023e9SXiubo Li int *plen, u64 *pbase, int for_wire)
26142f2dc053SSage Weil {
26154c793d4cSJeff Layton struct dentry *cur;
26164c793d4cSJeff Layton struct inode *inode;
26172f2dc053SSage Weil char *path;
2618f77f21bbSJeff Layton int pos;
26191b71fe2eSAl Viro unsigned seq;
262069a10fb3SJeff Layton u64 base;
26212f2dc053SSage Weil
2622d37b1d99SMarkus Elfring if (!dentry)
26232f2dc053SSage Weil return ERR_PTR(-EINVAL);
26242f2dc053SSage Weil
2625f77f21bbSJeff Layton path = __getname();
2626d37b1d99SMarkus Elfring if (!path)
26272f2dc053SSage Weil return ERR_PTR(-ENOMEM);
2628f77f21bbSJeff Layton retry:
2629f77f21bbSJeff Layton pos = PATH_MAX - 1;
2630f77f21bbSJeff Layton path[pos] = '\0';
2631f77f21bbSJeff Layton
2632f77f21bbSJeff Layton seq = read_seqbegin(&rename_lock);
26334c793d4cSJeff Layton cur = dget(dentry);
2634f77f21bbSJeff Layton for (;;) {
26353fd945a7SJeff Layton struct dentry *parent;
26362f2dc053SSage Weil
26374c793d4cSJeff Layton spin_lock(&cur->d_lock);
26384c793d4cSJeff Layton inode = d_inode(cur);
26392f2dc053SSage Weil if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
2640104648adSSage Weil dout("build_path path+%d: %p SNAPDIR\n",
26414c793d4cSJeff Layton pos, cur);
26423fd945a7SJeff Layton spin_unlock(&cur->d_lock);
26433fd945a7SJeff Layton parent = dget_parent(cur);
26443fd945a7SJeff Layton } else if (for_wire && inode && dentry != cur &&
26452f2dc053SSage Weil ceph_snap(inode) == CEPH_NOSNAP) {
26464c793d4cSJeff Layton spin_unlock(&cur->d_lock);
2647d6b8bd67SJeff Layton pos++; /* get rid of any prepended '/' */
26482f2dc053SSage Weil break;
26493fd945a7SJeff Layton } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {
26504c793d4cSJeff Layton pos -= cur->d_name.len;
26511b71fe2eSAl Viro if (pos < 0) {
26524c793d4cSJeff Layton spin_unlock(&cur->d_lock);
26532f2dc053SSage Weil break;
26541b71fe2eSAl Viro }
26554c793d4cSJeff Layton memcpy(path + pos, cur->d_name.name, cur->d_name.len);
26563fd945a7SJeff Layton spin_unlock(&cur->d_lock);
26573fd945a7SJeff Layton parent = dget_parent(cur);
26583fd945a7SJeff Layton } else {
26593fd945a7SJeff Layton int len, ret;
26603fd945a7SJeff Layton char buf[NAME_MAX];
26613fd945a7SJeff Layton
26623fd945a7SJeff Layton /*
26633fd945a7SJeff Layton * Proactively copy name into buf, in case we need to
26643fd945a7SJeff Layton * present it as-is.
26653fd945a7SJeff Layton */
26663fd945a7SJeff Layton memcpy(buf, cur->d_name.name, cur->d_name.len);
26673fd945a7SJeff Layton len = cur->d_name.len;
26683fd945a7SJeff Layton spin_unlock(&cur->d_lock);
26693fd945a7SJeff Layton parent = dget_parent(cur);
26703fd945a7SJeff Layton
267114e034a6SLuís Henriques ret = ceph_fscrypt_prepare_readdir(d_inode(parent));
26723fd945a7SJeff Layton if (ret < 0) {
26733fd945a7SJeff Layton dput(parent);
26743fd945a7SJeff Layton dput(cur);
26753fd945a7SJeff Layton return ERR_PTR(ret);
26762f2dc053SSage Weil }
26773fd945a7SJeff Layton
26783fd945a7SJeff Layton if (fscrypt_has_encryption_key(d_inode(parent))) {
26793fd945a7SJeff Layton len = ceph_encode_encrypted_fname(d_inode(parent),
26803fd945a7SJeff Layton cur, buf);
26813fd945a7SJeff Layton if (len < 0) {
26823fd945a7SJeff Layton dput(parent);
26833fd945a7SJeff Layton dput(cur);
26843fd945a7SJeff Layton return ERR_PTR(len);
26853fd945a7SJeff Layton }
26863fd945a7SJeff Layton }
26873fd945a7SJeff Layton pos -= len;
26883fd945a7SJeff Layton if (pos < 0) {
26893fd945a7SJeff Layton dput(parent);
26903fd945a7SJeff Layton break;
26913fd945a7SJeff Layton }
26923fd945a7SJeff Layton memcpy(path + pos, buf, len);
26933fd945a7SJeff Layton }
26943fd945a7SJeff Layton dput(cur);
26953fd945a7SJeff Layton cur = parent;
2696f77f21bbSJeff Layton
2697f77f21bbSJeff Layton /* Are we at the root? */
26984c793d4cSJeff Layton if (IS_ROOT(cur))
2699f77f21bbSJeff Layton break;
2700f77f21bbSJeff Layton
2701f77f21bbSJeff Layton /* Are we out of buffer? */
2702f77f21bbSJeff Layton if (--pos < 0)
2703f77f21bbSJeff Layton break;
2704f77f21bbSJeff Layton
2705f77f21bbSJeff Layton path[pos] = '/';
27062f2dc053SSage Weil }
27074c793d4cSJeff Layton inode = d_inode(cur);
27084c793d4cSJeff Layton base = inode ? ceph_ino(inode) : 0;
27094c793d4cSJeff Layton dput(cur);
2710f5946bccSJeff Layton
2711f5946bccSJeff Layton if (read_seqretry(&rename_lock, seq))
2712f5946bccSJeff Layton goto retry;
2713f5946bccSJeff Layton
2714f5946bccSJeff Layton if (pos < 0) {
2715f5946bccSJeff Layton /*
2716*c47ed911SMax Kellermann * The path is longer than PATH_MAX and this function
2717*c47ed911SMax Kellermann * cannot ever succeed. Creating paths that long is
2718*c47ed911SMax Kellermann * possible with Ceph, but Linux cannot use them.
2719f5946bccSJeff Layton */
2720*c47ed911SMax Kellermann return ERR_PTR(-ENAMETOOLONG);
27212f2dc053SSage Weil }
27222f2dc053SSage Weil
272369a10fb3SJeff Layton *pbase = base;
2724f77f21bbSJeff Layton *plen = PATH_MAX - 1 - pos;
2725104648adSSage Weil dout("build_path on %p %d built %llx '%.*s'\n",
2726f77f21bbSJeff Layton dentry, d_count(dentry), base, *plen, path + pos);
2727f77f21bbSJeff Layton return path + pos;
27282f2dc053SSage Weil }
27292f2dc053SSage Weil
/*
 * Describe @dentry as an (ino, path) pair.
 *
 * When the parent directory (@dir, or the dentry's current parent if @dir is
 * NULL) is known, @parent_locked is set, the directory is not a snapshot
 * (CEPH_NOSNAP) and is not encrypted, the result is simply the directory's
 * inode number plus the dentry's own name; *@pfreepath is left untouched in
 * that case.  Otherwise a full path is built with ceph_mdsc_build_path() and
 * *@pfreepath is set to true.
 *
 * Returns 0 on success or the negative error carried by the ERR_PTR from
 * ceph_mdsc_build_path().
 */
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
			     struct inode *dir, const char **ppath, int *ppathlen,
			     u64 *pino, bool *pfreepath, bool parent_locked)
{
	bool name_only;
	char *full;

	rcu_read_lock();
	if (!dir)
		dir = d_inode_rcu(dentry->d_parent);
	name_only = dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
		    !IS_ENCRYPTED(dir);
	if (name_only)
		*pino = ceph_ino(dir);
	rcu_read_unlock();

	if (name_only) {
		*ppath = dentry->d_name.name;
		*ppathlen = dentry->d_name.len;
		return 0;
	}

	full = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
	if (IS_ERR(full))
		return PTR_ERR(full);

	*ppath = full;
	*pfreepath = true;
	return 0;
}
27552f2dc053SSage Weil
/*
 * Describe @inode as an (ino, path) pair.
 *
 * A CEPH_NOSNAP inode is identified by its inode number alone, with an empty
 * path (*@ppath and *@pfreepath are not written).  For any other inode an
 * alias dentry is looked up and a for-wire path is built from it; in that
 * case *@pfreepath is set to true.
 *
 * Returns 0 on success or the negative error from ceph_mdsc_build_path().
 */
static int build_inode_path(struct inode *inode,
			    const char **ppath, int *ppathlen, u64 *pino,
			    bool *pfreepath)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct dentry *alias;
	char *built;

	if (ceph_snap(inode) != CEPH_NOSNAP) {
		alias = d_find_alias(inode);
		built = ceph_mdsc_build_path(mdsc, alias, ppathlen, pino, 1);
		dput(alias);
		if (IS_ERR(built))
			return PTR_ERR(built);

		*ppath = built;
		*pfreepath = true;
		return 0;
	}

	*pino = ceph_ino(inode);
	*ppathlen = 0;
	return 0;
}
27782f2dc053SSage Weil
27792f2dc053SSage Weil /*
27802f2dc053SSage Weil * request arguments may be specified via an inode *, a dentry *, or
27812f2dc053SSage Weil * an explicit ino+path.
27822f2dc053SSage Weil */
set_request_path_attr(struct ceph_mds_client * mdsc,struct inode * rinode,struct dentry * rdentry,struct inode * rdiri,const char * rpath,u64 rino,const char ** ppath,int * pathlen,u64 * ino,bool * freepath,bool parent_locked)27832e2023e9SXiubo Li static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
27842e2023e9SXiubo Li struct dentry *rdentry, struct inode *rdiri,
27852e2023e9SXiubo Li const char *rpath, u64 rino, const char **ppath,
27862e2023e9SXiubo Li int *pathlen, u64 *ino, bool *freepath,
27872e2023e9SXiubo Li bool parent_locked)
27882f2dc053SSage Weil {
27892f2dc053SSage Weil int r = 0;
27902f2dc053SSage Weil
27912f2dc053SSage Weil if (rinode) {
27922f2dc053SSage Weil r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
27932f2dc053SSage Weil dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
27942f2dc053SSage Weil ceph_snap(rinode));
27952f2dc053SSage Weil } else if (rdentry) {
27962e2023e9SXiubo Li r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
27971bcb3440SJeff Layton freepath, parent_locked);
27982f2dc053SSage Weil dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
27992f2dc053SSage Weil *ppath);
2800795858dbSSage Weil } else if (rpath || rino) {
28012f2dc053SSage Weil *ino = rino;
28022f2dc053SSage Weil *ppath = rpath;
2803b000056aSDavid Zafman *pathlen = rpath ? strlen(rpath) : 0;
28042f2dc053SSage Weil dout(" path %.*s\n", *pathlen, rpath);
28052f2dc053SSage Weil }
28062f2dc053SSage Weil
28072f2dc053SSage Weil return r;
28082f2dc053SSage Weil }
28092f2dc053SSage Weil
encode_mclientrequest_tail(void ** p,const struct ceph_mds_request * req)28102d332d5bSJeff Layton static void encode_mclientrequest_tail(void **p,
281160267ba3SIlya Dryomov const struct ceph_mds_request *req)
281260267ba3SIlya Dryomov {
281360267ba3SIlya Dryomov struct ceph_timespec ts;
281460267ba3SIlya Dryomov int i;
281560267ba3SIlya Dryomov
281660267ba3SIlya Dryomov ceph_encode_timespec64(&ts, &req->r_stamp);
281760267ba3SIlya Dryomov ceph_encode_copy(p, &ts, sizeof(ts));
281860267ba3SIlya Dryomov
281924865e75SJeff Layton /* v4: gid_list */
282060267ba3SIlya Dryomov ceph_encode_32(p, req->r_cred->group_info->ngroups);
282160267ba3SIlya Dryomov for (i = 0; i < req->r_cred->group_info->ngroups; i++)
282260267ba3SIlya Dryomov ceph_encode_64(p, from_kgid(&init_user_ns,
282360267ba3SIlya Dryomov req->r_cred->group_info->gid[i]));
28242d332d5bSJeff Layton
282524865e75SJeff Layton /* v5: altname */
282624865e75SJeff Layton ceph_encode_32(p, req->r_altname_len);
282724865e75SJeff Layton ceph_encode_copy(p, req->r_altname, req->r_altname_len);
28282d332d5bSJeff Layton
28292d332d5bSJeff Layton /* v6: fscrypt_auth and fscrypt_file */
28302d332d5bSJeff Layton if (req->r_fscrypt_auth) {
28312d332d5bSJeff Layton u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
28322d332d5bSJeff Layton
28332d332d5bSJeff Layton ceph_encode_32(p, authlen);
28342d332d5bSJeff Layton ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
28352d332d5bSJeff Layton } else {
28362d332d5bSJeff Layton ceph_encode_32(p, 0);
28372d332d5bSJeff Layton }
283816be62fcSJeff Layton if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {
283916be62fcSJeff Layton ceph_encode_32(p, sizeof(__le64));
284016be62fcSJeff Layton ceph_encode_64(p, req->r_fscrypt_file);
284116be62fcSJeff Layton } else {
284216be62fcSJeff Layton ceph_encode_32(p, 0);
284316be62fcSJeff Layton }
284460267ba3SIlya Dryomov }
284560267ba3SIlya Dryomov
2846ce0d5bd3SXiubo Li static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void * p,u64 features)2847ce0d5bd3SXiubo Li find_legacy_request_head(void *p, u64 features)
2848ce0d5bd3SXiubo Li {
2849ce0d5bd3SXiubo Li bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
2850ce0d5bd3SXiubo Li struct ceph_mds_request_head_old *ohead;
2851ce0d5bd3SXiubo Li
2852ce0d5bd3SXiubo Li if (legacy)
2853ce0d5bd3SXiubo Li return (struct ceph_mds_request_head_legacy *)p;
2854ce0d5bd3SXiubo Li ohead = (struct ceph_mds_request_head_old *)p;
2855ce0d5bd3SXiubo Li return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
2856ce0d5bd3SXiubo Li }
2857ce0d5bd3SXiubo Li
28582f2dc053SSage Weil /*
28592f2dc053SSage Weil * called under mdsc->mutex
28602f2dc053SSage Weil */
create_request_message(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)28614f1ddb1eSJeff Layton static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28622f2dc053SSage Weil struct ceph_mds_request *req,
28634f1ddb1eSJeff Layton bool drop_cap_releases)
28642f2dc053SSage Weil {
28654f1ddb1eSJeff Layton int mds = session->s_mds;
28664f1ddb1eSJeff Layton struct ceph_mds_client *mdsc = session->s_mdsc;
28672f2dc053SSage Weil struct ceph_msg *msg;
2868ce0d5bd3SXiubo Li struct ceph_mds_request_head_legacy *lhead;
28692f2dc053SSage Weil const char *path1 = NULL;
28702f2dc053SSage Weil const char *path2 = NULL;
28712f2dc053SSage Weil u64 ino1 = 0, ino2 = 0;
28722f2dc053SSage Weil int pathlen1 = 0, pathlen2 = 0;
28731bcb3440SJeff Layton bool freepath1 = false, freepath2 = false;
2874a5ffd7b6SXiubo Li struct dentry *old_dentry = NULL;
287560267ba3SIlya Dryomov int len;
28762f2dc053SSage Weil u16 releases;
28772f2dc053SSage Weil void *p, *end;
28782f2dc053SSage Weil int ret;
28794f1ddb1eSJeff Layton bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
2880ce0d5bd3SXiubo Li bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
2881ce0d5bd3SXiubo Li &session->s_features);
28822f2dc053SSage Weil
28832e2023e9SXiubo Li ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
28843dd69aabSJeff Layton req->r_parent, req->r_path1, req->r_ino1.ino,
28851bcb3440SJeff Layton &path1, &pathlen1, &ino1, &freepath1,
28861bcb3440SJeff Layton test_bit(CEPH_MDS_R_PARENT_LOCKED,
28871bcb3440SJeff Layton &req->r_req_flags));
28882f2dc053SSage Weil if (ret < 0) {
28892f2dc053SSage Weil msg = ERR_PTR(ret);
28902f2dc053SSage Weil goto out;
28912f2dc053SSage Weil }
28922f2dc053SSage Weil
28931bcb3440SJeff Layton /* If r_old_dentry is set, then assume that its parent is locked */
2894a5ffd7b6SXiubo Li if (req->r_old_dentry &&
2895a5ffd7b6SXiubo Li !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
2896a5ffd7b6SXiubo Li old_dentry = req->r_old_dentry;
28972e2023e9SXiubo Li ret = set_request_path_attr(mdsc, NULL, old_dentry,
2898fd36a717SJeff Layton req->r_old_dentry_dir,
28992f2dc053SSage Weil req->r_path2, req->r_ino2.ino,
29001bcb3440SJeff Layton &path2, &pathlen2, &ino2, &freepath2, true);
29012f2dc053SSage Weil if (ret < 0) {
29022f2dc053SSage Weil msg = ERR_PTR(ret);
29032f2dc053SSage Weil goto out_free1;
29042f2dc053SSage Weil }
29052f2dc053SSage Weil
290624865e75SJeff Layton req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
290724865e75SJeff Layton if (IS_ERR(req->r_altname)) {
290824865e75SJeff Layton msg = ERR_CAST(req->r_altname);
290924865e75SJeff Layton req->r_altname = NULL;
291024865e75SJeff Layton goto out_free2;
291124865e75SJeff Layton }
291224865e75SJeff Layton
2913ce0d5bd3SXiubo Li /*
2914ce0d5bd3SXiubo Li * For old cephs without supporting the 32bit retry/fwd feature
2915ce0d5bd3SXiubo Li * it will copy the raw memories directly when decoding the
2916ce0d5bd3SXiubo Li * requests. While new cephs will decode the head depending the
2917ce0d5bd3SXiubo Li * version member, so we need to make sure it will be compatible
2918ce0d5bd3SXiubo Li * with them both.
2919ce0d5bd3SXiubo Li */
2920ce0d5bd3SXiubo Li if (legacy)
2921ce0d5bd3SXiubo Li len = sizeof(struct ceph_mds_request_head_legacy);
2922ce0d5bd3SXiubo Li else if (old_version)
2923ce0d5bd3SXiubo Li len = sizeof(struct ceph_mds_request_head_old);
2924ce0d5bd3SXiubo Li else
2925ce0d5bd3SXiubo Li len = sizeof(struct ceph_mds_request_head);
29262f2dc053SSage Weil
29272d332d5bSJeff Layton /* filepaths */
29282d332d5bSJeff Layton len += 2 * (1 + sizeof(u32) + sizeof(u64));
29292d332d5bSJeff Layton len += pathlen1 + pathlen2;
29302d332d5bSJeff Layton
29312d332d5bSJeff Layton /* cap releases */
29322f2dc053SSage Weil len += sizeof(struct ceph_mds_request_release) *
29332f2dc053SSage Weil (!!req->r_inode_drop + !!req->r_dentry_drop +
29342f2dc053SSage Weil !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
29354f1ddb1eSJeff Layton
29362f2dc053SSage Weil if (req->r_dentry_drop)
2937c1dfc277SJeff Layton len += pathlen1;
29382f2dc053SSage Weil if (req->r_old_dentry_drop)
2939c1dfc277SJeff Layton len += pathlen2;
29402f2dc053SSage Weil
29412d332d5bSJeff Layton /* MClientRequest tail */
29422d332d5bSJeff Layton
29432d332d5bSJeff Layton /* req->r_stamp */
29442d332d5bSJeff Layton len += sizeof(struct ceph_timespec);
29452d332d5bSJeff Layton
29462d332d5bSJeff Layton /* gid list */
29472d332d5bSJeff Layton len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
29482d332d5bSJeff Layton
29492d332d5bSJeff Layton /* alternate name */
295024865e75SJeff Layton len += sizeof(u32) + req->r_altname_len;
29512d332d5bSJeff Layton
29522d332d5bSJeff Layton /* fscrypt_auth */
29532d332d5bSJeff Layton len += sizeof(u32); // fscrypt_auth
29542d332d5bSJeff Layton if (req->r_fscrypt_auth)
29552d332d5bSJeff Layton len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
29562d332d5bSJeff Layton
29572d332d5bSJeff Layton /* fscrypt_file */
29582d332d5bSJeff Layton len += sizeof(u32);
295916be62fcSJeff Layton if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
296016be62fcSJeff Layton len += sizeof(__le64);
29612d332d5bSJeff Layton
29620d9c1ab3SIlya Dryomov msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
2963a79832f2SSage Weil if (!msg) {
2964a79832f2SSage Weil msg = ERR_PTR(-ENOMEM);
29652f2dc053SSage Weil goto out_free2;
2966a79832f2SSage Weil }
29672f2dc053SSage Weil
29686df058c0SSage Weil msg->hdr.tid = cpu_to_le64(req->r_tid);
29696df058c0SSage Weil
2970ce0d5bd3SXiubo Li lhead = find_legacy_request_head(msg->front.iov_base,
2971ce0d5bd3SXiubo Li session->s_con.peer_features);
2972ce0d5bd3SXiubo Li
29734f1ddb1eSJeff Layton /*
2974ce0d5bd3SXiubo Li * The ceph_mds_request_head_legacy didn't contain a version field, and
29754f1ddb1eSJeff Layton * one was added when we moved the message version from 3->4.
29764f1ddb1eSJeff Layton */
29774f1ddb1eSJeff Layton if (legacy) {
29784f1ddb1eSJeff Layton msg->hdr.version = cpu_to_le16(3);
2979ce0d5bd3SXiubo Li p = msg->front.iov_base + sizeof(*lhead);
2980ce0d5bd3SXiubo Li } else if (old_version) {
2981ce0d5bd3SXiubo Li struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
2982ce0d5bd3SXiubo Li
2983ce0d5bd3SXiubo Li msg->hdr.version = cpu_to_le16(4);
2984ce0d5bd3SXiubo Li ohead->version = cpu_to_le16(1);
2985ce0d5bd3SXiubo Li p = msg->front.iov_base + sizeof(*ohead);
29864f1ddb1eSJeff Layton } else {
2987ce0d5bd3SXiubo Li struct ceph_mds_request_head *nhead = msg->front.iov_base;
29884f1ddb1eSJeff Layton
29892d332d5bSJeff Layton msg->hdr.version = cpu_to_le16(6);
2990ce0d5bd3SXiubo Li nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
2991ce0d5bd3SXiubo Li p = msg->front.iov_base + sizeof(*nhead);
29924f1ddb1eSJeff Layton }
29934f1ddb1eSJeff Layton
29942f2dc053SSage Weil end = msg->front.iov_base + msg->front.iov_len;
29952f2dc053SSage Weil
2996ce0d5bd3SXiubo Li lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
2997ce0d5bd3SXiubo Li lhead->op = cpu_to_le32(req->r_op);
2998ce0d5bd3SXiubo Li lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
29997fe0cdebSJeff Layton req->r_cred->fsuid));
3000ce0d5bd3SXiubo Li lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
30017fe0cdebSJeff Layton req->r_cred->fsgid));
3002ce0d5bd3SXiubo Li lhead->ino = cpu_to_le64(req->r_deleg_ino);
3003ce0d5bd3SXiubo Li lhead->args = req->r_args;
30042f2dc053SSage Weil
30052f2dc053SSage Weil ceph_encode_filepath(&p, end, ino1, path1);
30062f2dc053SSage Weil ceph_encode_filepath(&p, end, ino2, path2);
30072f2dc053SSage Weil
3008e979cf50SSage Weil /* make note of release offset, in case we need to replay */
3009e979cf50SSage Weil req->r_request_release_offset = p - msg->front.iov_base;
3010e979cf50SSage Weil
30112f2dc053SSage Weil /* cap releases */
30122f2dc053SSage Weil releases = 0;
30132f2dc053SSage Weil if (req->r_inode_drop)
30142f2dc053SSage Weil releases += ceph_encode_inode_release(&p,
30152b0143b5SDavid Howells req->r_inode ? req->r_inode : d_inode(req->r_dentry),
3016719a2514SYan, Zheng mds, req->r_inode_drop, req->r_inode_unless,
3017719a2514SYan, Zheng req->r_op == CEPH_MDS_OP_READDIR);
30183fd945a7SJeff Layton if (req->r_dentry_drop) {
30193fd945a7SJeff Layton ret = ceph_encode_dentry_release(&p, req->r_dentry,
30203dd69aabSJeff Layton req->r_parent, mds, req->r_dentry_drop,
3021ca6c8ae0SJeff Layton req->r_dentry_unless);
30223fd945a7SJeff Layton if (ret < 0)
30233fd945a7SJeff Layton goto out_err;
30243fd945a7SJeff Layton releases += ret;
30253fd945a7SJeff Layton }
30263fd945a7SJeff Layton if (req->r_old_dentry_drop) {
30273fd945a7SJeff Layton ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
3028ca6c8ae0SJeff Layton req->r_old_dentry_dir, mds,
3029ca6c8ae0SJeff Layton req->r_old_dentry_drop,
3030ca6c8ae0SJeff Layton req->r_old_dentry_unless);
30313fd945a7SJeff Layton if (ret < 0)
30323fd945a7SJeff Layton goto out_err;
30333fd945a7SJeff Layton releases += ret;
30343fd945a7SJeff Layton }
30352f2dc053SSage Weil if (req->r_old_inode_drop)
30362f2dc053SSage Weil releases += ceph_encode_inode_release(&p,
30372b0143b5SDavid Howells d_inode(req->r_old_dentry),
30382f2dc053SSage Weil mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
30396e6f0923SYan, Zheng
30406e6f0923SYan, Zheng if (drop_cap_releases) {
30416e6f0923SYan, Zheng releases = 0;
30426e6f0923SYan, Zheng p = msg->front.iov_base + req->r_request_release_offset;
30436e6f0923SYan, Zheng }
30446e6f0923SYan, Zheng
3045ce0d5bd3SXiubo Li lhead->num_releases = cpu_to_le16(releases);
30462f2dc053SSage Weil
30472d332d5bSJeff Layton encode_mclientrequest_tail(&p, req);
30484f1ddb1eSJeff Layton
3049b682c6d4SXiubo Li if (WARN_ON_ONCE(p > end)) {
3050b682c6d4SXiubo Li ceph_msg_put(msg);
3051b682c6d4SXiubo Li msg = ERR_PTR(-ERANGE);
3052b682c6d4SXiubo Li goto out_free2;
3053b682c6d4SXiubo Li }
3054b682c6d4SXiubo Li
30552f2dc053SSage Weil msg->front.iov_len = p - msg->front.iov_base;
30562f2dc053SSage Weil msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
30572f2dc053SSage Weil
305825e6bae3SYan, Zheng if (req->r_pagelist) {
305925e6bae3SYan, Zheng struct ceph_pagelist *pagelist = req->r_pagelist;
306025e6bae3SYan, Zheng ceph_msg_data_add_pagelist(msg, pagelist);
306125e6bae3SYan, Zheng msg->hdr.data_len = cpu_to_le32(pagelist->length);
306225e6bae3SYan, Zheng } else {
306325e6bae3SYan, Zheng msg->hdr.data_len = 0;
3064ebf18f47SAlex Elder }
306502afca6cSAlex Elder
30662f2dc053SSage Weil msg->hdr.data_off = cpu_to_le16(0);
30672f2dc053SSage Weil
30682f2dc053SSage Weil out_free2:
30692f2dc053SSage Weil if (freepath2)
3070f77f21bbSJeff Layton ceph_mdsc_free_path((char *)path2, pathlen2);
30712f2dc053SSage Weil out_free1:
30722f2dc053SSage Weil if (freepath1)
3073f77f21bbSJeff Layton ceph_mdsc_free_path((char *)path1, pathlen1);
30742f2dc053SSage Weil out:
30752f2dc053SSage Weil return msg;
30763fd945a7SJeff Layton out_err:
30773fd945a7SJeff Layton ceph_msg_put(msg);
30783fd945a7SJeff Layton msg = ERR_PTR(ret);
30793fd945a7SJeff Layton goto out_free2;
30802f2dc053SSage Weil }
30812f2dc053SSage Weil
30822f2dc053SSage Weil /*
30832f2dc053SSage Weil * called under mdsc->mutex if error, under no mutex if
30842f2dc053SSage Weil * success.
30852f2dc053SSage Weil */
complete_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)30862f2dc053SSage Weil static void complete_request(struct ceph_mds_client *mdsc,
30872f2dc053SSage Weil struct ceph_mds_request *req)
30882f2dc053SSage Weil {
308970c94820SXiubo Li req->r_end_latency = ktime_get();
309070c94820SXiubo Li
30912f2dc053SSage Weil if (req->r_callback)
30922f2dc053SSage Weil req->r_callback(mdsc, req);
309303066f23SYehuda Sadeh complete_all(&req->r_completion);
30942f2dc053SSage Weil }
30952f2dc053SSage Weil
30962f2dc053SSage Weil /*
30972f2dc053SSage Weil * called under mdsc->mutex
30982f2dc053SSage Weil */
__prepare_send_request(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)3099396bd62cSJeff Layton static int __prepare_send_request(struct ceph_mds_session *session,
31002f2dc053SSage Weil struct ceph_mds_request *req,
3101396bd62cSJeff Layton bool drop_cap_releases)
31022f2dc053SSage Weil {
3103396bd62cSJeff Layton int mds = session->s_mds;
3104396bd62cSJeff Layton struct ceph_mds_client *mdsc = session->s_mdsc;
3105ce0d5bd3SXiubo Li struct ceph_mds_request_head_legacy *lhead;
3106ce0d5bd3SXiubo Li struct ceph_mds_request_head *nhead;
31072f2dc053SSage Weil struct ceph_msg *msg;
3108ce0d5bd3SXiubo Li int flags = 0, old_max_retry;
3109ce0d5bd3SXiubo Li bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
3110ce0d5bd3SXiubo Li &session->s_features);
3111546a5d61SXiubo Li
3112546a5d61SXiubo Li /*
3113ce0d5bd3SXiubo Li * Avoid inifinite retrying after overflow. The client will
3114ce0d5bd3SXiubo Li * increase the retry count and if the MDS is old version,
3115ce0d5bd3SXiubo Li * so we limit to retry at most 256 times.
3116546a5d61SXiubo Li */
3117ce0d5bd3SXiubo Li if (req->r_attempts) {
3118ce0d5bd3SXiubo Li old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
3119ce0d5bd3SXiubo Li num_retry);
3120ce0d5bd3SXiubo Li old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
3121ce0d5bd3SXiubo Li if ((old_version && req->r_attempts >= old_max_retry) ||
3122ce0d5bd3SXiubo Li ((uint32_t)req->r_attempts >= U32_MAX)) {
3123546a5d61SXiubo Li pr_warn_ratelimited("%s request tid %llu seq overflow\n",
3124546a5d61SXiubo Li __func__, req->r_tid);
3125546a5d61SXiubo Li return -EMULTIHOP;
3126546a5d61SXiubo Li }
3127ce0d5bd3SXiubo Li }
31282f2dc053SSage Weil
31292f2dc053SSage Weil req->r_attempts++;
3130e55b71f8SGreg Farnum if (req->r_inode) {
3131e55b71f8SGreg Farnum struct ceph_cap *cap =
3132e55b71f8SGreg Farnum ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
3133e55b71f8SGreg Farnum
3134e55b71f8SGreg Farnum if (cap)
3135e55b71f8SGreg Farnum req->r_sent_on_mseq = cap->mseq;
3136e55b71f8SGreg Farnum else
3137e55b71f8SGreg Farnum req->r_sent_on_mseq = -1;
3138e55b71f8SGreg Farnum }
3139546a5d61SXiubo Li dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
31402f2dc053SSage Weil req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
31412f2dc053SSage Weil
3142bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
3143c5c9a0bfSYan, Zheng void *p;
31444f1ddb1eSJeff Layton
314501a92f17SSage Weil /*
314601a92f17SSage Weil * Replay. Do not regenerate message (and rebuild
314701a92f17SSage Weil * paths, etc.); just use the original message.
314801a92f17SSage Weil * Rebuilding paths will break for renames because
314901a92f17SSage Weil * d_move mangles the src name.
315001a92f17SSage Weil */
315101a92f17SSage Weil msg = req->r_request;
3152ce0d5bd3SXiubo Li lhead = find_legacy_request_head(msg->front.iov_base,
31534f1ddb1eSJeff Layton session->s_con.peer_features);
315401a92f17SSage Weil
3155ce0d5bd3SXiubo Li flags = le32_to_cpu(lhead->flags);
315601a92f17SSage Weil flags |= CEPH_MDS_FLAG_REPLAY;
3157ce0d5bd3SXiubo Li lhead->flags = cpu_to_le32(flags);
315801a92f17SSage Weil
315901a92f17SSage Weil if (req->r_target_inode)
3160ce0d5bd3SXiubo Li lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
316101a92f17SSage Weil
3162ce0d5bd3SXiubo Li lhead->num_retry = req->r_attempts - 1;
3163ce0d5bd3SXiubo Li if (!old_version) {
3164ce0d5bd3SXiubo Li nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3165ce0d5bd3SXiubo Li nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3166ce0d5bd3SXiubo Li }
3167e979cf50SSage Weil
3168e979cf50SSage Weil /* remove cap/dentry releases from message */
3169ce0d5bd3SXiubo Li lhead->num_releases = 0;
3170c5c9a0bfSYan, Zheng
3171c5c9a0bfSYan, Zheng p = msg->front.iov_base + req->r_request_release_offset;
31722d332d5bSJeff Layton encode_mclientrequest_tail(&p, req);
3173c5c9a0bfSYan, Zheng
3174c5c9a0bfSYan, Zheng msg->front.iov_len = p - msg->front.iov_base;
3175c5c9a0bfSYan, Zheng msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
317601a92f17SSage Weil return 0;
317701a92f17SSage Weil }
317801a92f17SSage Weil
31792f2dc053SSage Weil if (req->r_request) {
31802f2dc053SSage Weil ceph_msg_put(req->r_request);
31812f2dc053SSage Weil req->r_request = NULL;
31822f2dc053SSage Weil }
31834f1ddb1eSJeff Layton msg = create_request_message(session, req, drop_cap_releases);
31842f2dc053SSage Weil if (IS_ERR(msg)) {
3185e1518c7cSSage Weil req->r_err = PTR_ERR(msg);
3186a79832f2SSage Weil return PTR_ERR(msg);
31872f2dc053SSage Weil }
31882f2dc053SSage Weil req->r_request = msg;
31892f2dc053SSage Weil
3190ce0d5bd3SXiubo Li lhead = find_legacy_request_head(msg->front.iov_base,
31914f1ddb1eSJeff Layton session->s_con.peer_features);
3192ce0d5bd3SXiubo Li lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
3193bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
31942f2dc053SSage Weil flags |= CEPH_MDS_FLAG_REPLAY;
31953bb48b41SJeff Layton if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
31963bb48b41SJeff Layton flags |= CEPH_MDS_FLAG_ASYNC;
31973dd69aabSJeff Layton if (req->r_parent)
31982f2dc053SSage Weil flags |= CEPH_MDS_FLAG_WANT_DENTRY;
3199ce0d5bd3SXiubo Li lhead->flags = cpu_to_le32(flags);
3200ce0d5bd3SXiubo Li lhead->num_fwd = req->r_num_fwd;
3201ce0d5bd3SXiubo Li lhead->num_retry = req->r_attempts - 1;
3202ce0d5bd3SXiubo Li if (!old_version) {
3203ce0d5bd3SXiubo Li nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3204ce0d5bd3SXiubo Li nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
3205ce0d5bd3SXiubo Li nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3206ce0d5bd3SXiubo Li }
32072f2dc053SSage Weil
32083dd69aabSJeff Layton dout(" r_parent = %p\n", req->r_parent);
32092f2dc053SSage Weil return 0;
32102f2dc053SSage Weil }
32112f2dc053SSage Weil
32122f2dc053SSage Weil /*
32139cf54563SXiubo Li * called under mdsc->mutex
32149cf54563SXiubo Li */
__send_request(struct ceph_mds_session * session,struct ceph_mds_request * req,bool drop_cap_releases)3215396bd62cSJeff Layton static int __send_request(struct ceph_mds_session *session,
32169cf54563SXiubo Li struct ceph_mds_request *req,
32179cf54563SXiubo Li bool drop_cap_releases)
32189cf54563SXiubo Li {
32199cf54563SXiubo Li int err;
32209cf54563SXiubo Li
3221396bd62cSJeff Layton err = __prepare_send_request(session, req, drop_cap_releases);
32229cf54563SXiubo Li if (!err) {
32239cf54563SXiubo Li ceph_msg_get(req->r_request);
32249cf54563SXiubo Li ceph_con_send(&session->s_con, req->r_request);
32259cf54563SXiubo Li }
32269cf54563SXiubo Li
32279cf54563SXiubo Li return err;
32289cf54563SXiubo Li }
32299cf54563SXiubo Li
32309cf54563SXiubo Li /*
32312f2dc053SSage Weil * send request, or put it on the appropriate wait list.
32322f2dc053SSage Weil */
__do_request(struct ceph_mds_client * mdsc,struct ceph_mds_request * req)3233d5548492SChengguang Xu static void __do_request(struct ceph_mds_client *mdsc,
32342f2dc053SSage Weil struct ceph_mds_request *req)
32352f2dc053SSage Weil {
32362f2dc053SSage Weil struct ceph_mds_session *session = NULL;
32372f2dc053SSage Weil int mds = -1;
323848fec5d0SYan, Zheng int err = 0;
3239c4853e97SXiubo Li bool random;
32402f2dc053SSage Weil
3241bc2de10dSJeff Layton if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
3242bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
3243eb1b8af3SYan, Zheng __unregister_request(mdsc, req);
3244d5548492SChengguang Xu return;
3245eb1b8af3SYan, Zheng }
32462f2dc053SSage Weil
3247a68e564aSXiubo Li if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
3248a68e564aSXiubo Li dout("do_request metadata corrupted\n");
3249a68e564aSXiubo Li err = -EIO;
3250a68e564aSXiubo Li goto finish;
3251a68e564aSXiubo Li }
32522f2dc053SSage Weil if (req->r_timeout &&
32532f2dc053SSage Weil time_after_eq(jiffies, req->r_started + req->r_timeout)) {
32542f2dc053SSage Weil dout("do_request timed out\n");
32558ccf7fccSXiubo Li err = -ETIMEDOUT;
32562f2dc053SSage Weil goto finish;
32572f2dc053SSage Weil }
325852953d55SSeraphime Kirkovski if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
325948fec5d0SYan, Zheng dout("do_request forced umount\n");
326048fec5d0SYan, Zheng err = -EIO;
326148fec5d0SYan, Zheng goto finish;
326248fec5d0SYan, Zheng }
326352953d55SSeraphime Kirkovski if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
3264e9e427f0SYan, Zheng if (mdsc->mdsmap_err) {
3265e9e427f0SYan, Zheng err = mdsc->mdsmap_err;
3266e9e427f0SYan, Zheng dout("do_request mdsmap err %d\n", err);
3267e9e427f0SYan, Zheng goto finish;
3268e9e427f0SYan, Zheng }
3269cc8e8342SYan, Zheng if (mdsc->mdsmap->m_epoch == 0) {
3270cc8e8342SYan, Zheng dout("do_request no mdsmap, waiting for map\n");
3271cc8e8342SYan, Zheng list_add(&req->r_wait, &mdsc->waiting_for_map);
3272d5548492SChengguang Xu return;
3273cc8e8342SYan, Zheng }
3274e9e427f0SYan, Zheng if (!(mdsc->fsc->mount_options->flags &
3275e9e427f0SYan, Zheng CEPH_MOUNT_OPT_MOUNTWAIT) &&
3276e9e427f0SYan, Zheng !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
327797820058SXiubo Li err = -EHOSTUNREACH;
3278e9e427f0SYan, Zheng goto finish;
3279e9e427f0SYan, Zheng }
3280e9e427f0SYan, Zheng }
32812f2dc053SSage Weil
3282dc69e2e9SSage Weil put_request_session(req);
3283dc69e2e9SSage Weil
3284c4853e97SXiubo Li mds = __choose_mds(mdsc, req, &random);
32852f2dc053SSage Weil if (mds < 0 ||
32862f2dc053SSage Weil ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
32873bb48b41SJeff Layton if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
32883bb48b41SJeff Layton err = -EJUKEBOX;
32893bb48b41SJeff Layton goto finish;
32903bb48b41SJeff Layton }
32912f2dc053SSage Weil dout("do_request no mds or not active, waiting for map\n");
32922f2dc053SSage Weil list_add(&req->r_wait, &mdsc->waiting_for_map);
3293d5548492SChengguang Xu return;
32942f2dc053SSage Weil }
32952f2dc053SSage Weil
32962f2dc053SSage Weil /* get, open session */
32972f2dc053SSage Weil session = __ceph_lookup_mds_session(mdsc, mds);
32989c423956SSage Weil if (!session) {
32992f2dc053SSage Weil session = register_session(mdsc, mds);
33009c423956SSage Weil if (IS_ERR(session)) {
33019c423956SSage Weil err = PTR_ERR(session);
33029c423956SSage Weil goto finish;
33039c423956SSage Weil }
33049c423956SSage Weil }
33055b3248c6SXiubo Li req->r_session = ceph_get_mds_session(session);
3306dc69e2e9SSage Weil
33072f2dc053SSage Weil dout("do_request mds%d session %p state %s\n", mds, session,
3308a687ecafSJohn Spray ceph_session_state_name(session->s_state));
33096eb06c46SXiubo Li
33106eb06c46SXiubo Li /*
33116eb06c46SXiubo Li * The old ceph will crash the MDSs when see unknown OPs
33126eb06c46SXiubo Li */
33136eb06c46SXiubo Li if (req->r_feature_needed > 0 &&
33146eb06c46SXiubo Li !test_bit(req->r_feature_needed, &session->s_features)) {
33156eb06c46SXiubo Li err = -EOPNOTSUPP;
33166eb06c46SXiubo Li goto out_session;
33176eb06c46SXiubo Li }
33186eb06c46SXiubo Li
33192f2dc053SSage Weil if (session->s_state != CEPH_MDS_SESSION_OPEN &&
33202f2dc053SSage Weil session->s_state != CEPH_MDS_SESSION_HUNG) {
33213bb48b41SJeff Layton /*
33223bb48b41SJeff Layton * We cannot queue async requests since the caps and delegated
33233bb48b41SJeff Layton * inodes are bound to the session. Just return -EJUKEBOX and
33243bb48b41SJeff Layton * let the caller retry a sync request in that case.
33253bb48b41SJeff Layton */
33263bb48b41SJeff Layton if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
33273bb48b41SJeff Layton err = -EJUKEBOX;
33283bb48b41SJeff Layton goto out_session;
33293bb48b41SJeff Layton }
33304ae3713fSJeff Layton
33314ae3713fSJeff Layton /*
33324ae3713fSJeff Layton * If the session has been REJECTED, then return a hard error,
33334ae3713fSJeff Layton * unless it's a CLEANRECOVER mount, in which case we'll queue
33344ae3713fSJeff Layton * it to the mdsc queue.
33354ae3713fSJeff Layton */
33364ae3713fSJeff Layton if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
33374ae3713fSJeff Layton if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
33384ae3713fSJeff Layton list_add(&req->r_wait, &mdsc->waiting_for_map);
33394ae3713fSJeff Layton else
33404ae3713fSJeff Layton err = -EACCES;
33414ae3713fSJeff Layton goto out_session;
33424ae3713fSJeff Layton }
33434ae3713fSJeff Layton
33442f2dc053SSage Weil if (session->s_state == CEPH_MDS_SESSION_NEW ||
3345c4853e97SXiubo Li session->s_state == CEPH_MDS_SESSION_CLOSING) {
3346b682c6d4SXiubo Li err = __open_session(mdsc, session);
3347b682c6d4SXiubo Li if (err)
3348b682c6d4SXiubo Li goto out_session;
3349c4853e97SXiubo Li /* retry the same mds later */
3350c4853e97SXiubo Li if (random)
3351c4853e97SXiubo Li req->r_resend_mds = mds;
3352c4853e97SXiubo Li }
33532f2dc053SSage Weil list_add(&req->r_wait, &session->s_waiting);
33542f2dc053SSage Weil goto out_session;
33552f2dc053SSage Weil }
33562f2dc053SSage Weil
33572f2dc053SSage Weil /* send request */
33582f2dc053SSage Weil req->r_resend_mds = -1; /* forget any previous mds hint */
33592f2dc053SSage Weil
33602f2dc053SSage Weil if (req->r_request_started == 0) /* note request start time */
33612f2dc053SSage Weil req->r_request_started = jiffies;
33622f2dc053SSage Weil
336300061645SXiubo Li /*
336400061645SXiubo Li * For async create we will choose the auth MDS of frag in parent
336500061645SXiubo Li * directory to send the request and ususally this works fine, but
336600061645SXiubo Li * if the migrated the dirtory to another MDS before it could handle
336700061645SXiubo Li * it the request will be forwarded.
336800061645SXiubo Li *
336900061645SXiubo Li * And then the auth cap will be changed.
337000061645SXiubo Li */
337100061645SXiubo Li if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
337200061645SXiubo Li struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
337300061645SXiubo Li struct ceph_inode_info *ci;
337400061645SXiubo Li struct ceph_cap *cap;
337500061645SXiubo Li
337600061645SXiubo Li /*
337700061645SXiubo Li * The request maybe handled very fast and the new inode
337800061645SXiubo Li * hasn't been linked to the dentry yet. We need to wait
337900061645SXiubo Li * for the ceph_finish_async_create(), which shouldn't be
338000061645SXiubo Li * stuck too long or fail in thoery, to finish when forwarding
338100061645SXiubo Li * the request.
338200061645SXiubo Li */
338300061645SXiubo Li if (!d_inode(req->r_dentry)) {
338400061645SXiubo Li err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
338500061645SXiubo Li TASK_KILLABLE);
338600061645SXiubo Li if (err) {
338700061645SXiubo Li mutex_lock(&req->r_fill_mutex);
338800061645SXiubo Li set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
338900061645SXiubo Li mutex_unlock(&req->r_fill_mutex);
339000061645SXiubo Li goto out_session;
339100061645SXiubo Li }
339200061645SXiubo Li }
339300061645SXiubo Li
339400061645SXiubo Li ci = ceph_inode(d_inode(req->r_dentry));
339500061645SXiubo Li
339600061645SXiubo Li spin_lock(&ci->i_ceph_lock);
339700061645SXiubo Li cap = ci->i_auth_cap;
339800061645SXiubo Li if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
339900061645SXiubo Li dout("do_request session changed for auth cap %d -> %d\n",
340000061645SXiubo Li cap->session->s_mds, session->s_mds);
340100061645SXiubo Li
340200061645SXiubo Li /* Remove the auth cap from old session */
340300061645SXiubo Li spin_lock(&cap->session->s_cap_lock);
340400061645SXiubo Li cap->session->s_nr_caps--;
340500061645SXiubo Li list_del_init(&cap->session_caps);
340600061645SXiubo Li spin_unlock(&cap->session->s_cap_lock);
340700061645SXiubo Li
340800061645SXiubo Li /* Add the auth cap to the new session */
340900061645SXiubo Li cap->mds = mds;
341000061645SXiubo Li cap->session = session;
341100061645SXiubo Li spin_lock(&session->s_cap_lock);
341200061645SXiubo Li session->s_nr_caps++;
341300061645SXiubo Li list_add_tail(&cap->session_caps, &session->s_caps);
341400061645SXiubo Li spin_unlock(&session->s_cap_lock);
341500061645SXiubo Li
341600061645SXiubo Li change_auth_cap_ses(ci, session);
341700061645SXiubo Li }
341800061645SXiubo Li spin_unlock(&ci->i_ceph_lock);
341900061645SXiubo Li }
342000061645SXiubo Li
3421396bd62cSJeff Layton err = __send_request(session, req, false);
34222f2dc053SSage Weil
34232f2dc053SSage Weil out_session:
34242f2dc053SSage Weil ceph_put_mds_session(session);
34252f2dc053SSage Weil finish:
342648fec5d0SYan, Zheng if (err) {
342748fec5d0SYan, Zheng dout("__do_request early error %d\n", err);
3428e1518c7cSSage Weil req->r_err = err;
34292f2dc053SSage Weil complete_request(mdsc, req);
343048fec5d0SYan, Zheng __unregister_request(mdsc, req);
343148fec5d0SYan, Zheng }
3432d5548492SChengguang Xu return;
34332f2dc053SSage Weil }
34342f2dc053SSage Weil
34352f2dc053SSage Weil /*
34362f2dc053SSage Weil * called under mdsc->mutex
34372f2dc053SSage Weil */
__wake_requests(struct ceph_mds_client * mdsc,struct list_head * head)34382f2dc053SSage Weil static void __wake_requests(struct ceph_mds_client *mdsc,
34392f2dc053SSage Weil struct list_head *head)
34402f2dc053SSage Weil {
3441ed75ec2cSYan, Zheng struct ceph_mds_request *req;
3442ed75ec2cSYan, Zheng LIST_HEAD(tmp_list);
34432f2dc053SSage Weil
3444ed75ec2cSYan, Zheng list_splice_init(head, &tmp_list);
3445ed75ec2cSYan, Zheng
3446ed75ec2cSYan, Zheng while (!list_empty(&tmp_list)) {
3447ed75ec2cSYan, Zheng req = list_entry(tmp_list.next,
3448ed75ec2cSYan, Zheng struct ceph_mds_request, r_wait);
34492f2dc053SSage Weil list_del_init(&req->r_wait);
34507971bd92SSage Weil dout(" wake request %p tid %llu\n", req, req->r_tid);
34512f2dc053SSage Weil __do_request(mdsc, req);
34522f2dc053SSage Weil }
34532f2dc053SSage Weil }
34542f2dc053SSage Weil
34552f2dc053SSage Weil /*
34562f2dc053SSage Weil * Wake up threads with requests pending for @mds, so that they can
345729790f26SSage Weil * resubmit their requests to a possibly different mds.
34582f2dc053SSage Weil */
kick_requests(struct ceph_mds_client * mdsc,int mds)345929790f26SSage Weil static void kick_requests(struct ceph_mds_client *mdsc, int mds)
34602f2dc053SSage Weil {
346144ca18f2SSage Weil struct ceph_mds_request *req;
3462282c1052SYan, Zheng struct rb_node *p = rb_first(&mdsc->request_tree);
34632f2dc053SSage Weil
34642f2dc053SSage Weil dout("kick_requests mds%d\n", mds);
3465282c1052SYan, Zheng while (p) {
346644ca18f2SSage Weil req = rb_entry(p, struct ceph_mds_request, r_node);
3467282c1052SYan, Zheng p = rb_next(p);
3468bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
34692f2dc053SSage Weil continue;
34703de22be6SYan, Zheng if (req->r_attempts > 0)
34713de22be6SYan, Zheng continue; /* only new requests */
347244ca18f2SSage Weil if (req->r_session &&
347344ca18f2SSage Weil req->r_session->s_mds == mds) {
347444ca18f2SSage Weil dout(" kicking tid %llu\n", req->r_tid);
347503974e81SYan, Zheng list_del_init(&req->r_wait);
347644ca18f2SSage Weil __do_request(mdsc, req);
34772f2dc053SSage Weil }
34782f2dc053SSage Weil }
34792f2dc053SSage Weil }
34802f2dc053SSage Weil
ceph_mdsc_submit_request(struct ceph_mds_client * mdsc,struct inode * dir,struct ceph_mds_request * req)348186bda539SJeff Layton int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
34822f2dc053SSage Weil struct ceph_mds_request *req)
34832f2dc053SSage Weil {
3484891f3f5aSJeff Layton int err = 0;
348586bda539SJeff Layton
348686bda539SJeff Layton /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
348786bda539SJeff Layton if (req->r_inode)
348886bda539SJeff Layton ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
34899c1c2b35SJeff Layton if (req->r_parent) {
3490719a2514SYan, Zheng struct ceph_inode_info *ci = ceph_inode(req->r_parent);
3491719a2514SYan, Zheng int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
3492719a2514SYan, Zheng CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
3493719a2514SYan, Zheng spin_lock(&ci->i_ceph_lock);
3494719a2514SYan, Zheng ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
3495719a2514SYan, Zheng __ceph_touch_fmode(ci, mdsc, fmode);
3496719a2514SYan, Zheng spin_unlock(&ci->i_ceph_lock);
34979c1c2b35SJeff Layton }
349886bda539SJeff Layton if (req->r_old_dentry_dir)
349986bda539SJeff Layton ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
350086bda539SJeff Layton CEPH_CAP_PIN);
350186bda539SJeff Layton
3502891f3f5aSJeff Layton if (req->r_inode) {
3503891f3f5aSJeff Layton err = ceph_wait_on_async_create(req->r_inode);
3504891f3f5aSJeff Layton if (err) {
3505891f3f5aSJeff Layton dout("%s: wait for async create returned: %d\n",
3506891f3f5aSJeff Layton __func__, err);
3507891f3f5aSJeff Layton return err;
3508891f3f5aSJeff Layton }
3509891f3f5aSJeff Layton }
3510891f3f5aSJeff Layton
3511891f3f5aSJeff Layton if (!err && req->r_old_inode) {
3512891f3f5aSJeff Layton err = ceph_wait_on_async_create(req->r_old_inode);
3513891f3f5aSJeff Layton if (err) {
3514891f3f5aSJeff Layton dout("%s: wait for async create returned: %d\n",
3515891f3f5aSJeff Layton __func__, err);
3516891f3f5aSJeff Layton return err;
3517891f3f5aSJeff Layton }
3518891f3f5aSJeff Layton }
3519891f3f5aSJeff Layton
352086bda539SJeff Layton dout("submit_request on %p for inode %p\n", req, dir);
35212f2dc053SSage Weil mutex_lock(&mdsc->mutex);
352286bda539SJeff Layton __register_request(mdsc, req, dir);
35232f2dc053SSage Weil __do_request(mdsc, req);
352486bda539SJeff Layton err = req->r_err;
35252f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
352686bda539SJeff Layton return err;
35272f2dc053SSage Weil }
35282f2dc053SSage Weil
/*
 * Wait for a submitted request to complete and return its outcome.
 *
 * If @wait_func is given it does the waiting; otherwise we do a killable
 * wait on req->r_completion, bounded by req->r_timeout.
 *
 * Return value:
 *  - a reply arrived (GOT_RESULT): the result field of the reply head;
 *  - the wait failed first: the wait error (the request is marked
 *    aborted and, for write ops with a parent, the directory state is
 *    invalidated);
 *  - otherwise: req->r_err.
 */
int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
			   struct ceph_mds_request *req,
			   ceph_mds_request_wait_callback_t wait_func)
{
	int err;

	/* wait */
	dout("do_request waiting\n");
	if (wait_func) {
		err = wait_func(mdsc, req);
	} else {
		long timeleft = wait_for_completion_killable_timeout(
					&req->r_completion,
					ceph_timeout_jiffies(req->r_timeout));

		if (timeleft < 0)
			err = timeleft;		/* killed */
		else if (timeleft == 0)
			err = -ETIMEDOUT;	/* timed out */
		else
			err = 0;
	}
	dout("do_request waited, got %d\n", err);

	mutex_lock(&mdsc->mutex);

	/* only abort if we didn't race with a real reply */
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		err = le32_to_cpu(req->r_reply_info.head->result);
	} else if (err < 0) {
		dout("aborted request %lld with %d\n", req->r_tid, err);

		/*
		 * Take r_fill_mutex so that we are not running concurrently
		 * with ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE))
			ceph_invalidate_dir_request(req);
	} else {
		err = req->r_err;
	}

	mutex_unlock(&mdsc->mutex);
	return err;
}
35798340f22cSJeff Layton
35808340f22cSJeff Layton /*
35818340f22cSJeff Layton * Synchrously perform an mds request. Take care of all of the
35828340f22cSJeff Layton * session setup, forwarding, retry details.
35838340f22cSJeff Layton */
ceph_mdsc_do_request(struct ceph_mds_client * mdsc,struct inode * dir,struct ceph_mds_request * req)35848340f22cSJeff Layton int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
35858340f22cSJeff Layton struct inode *dir,
35868340f22cSJeff Layton struct ceph_mds_request *req)
35878340f22cSJeff Layton {
35888340f22cSJeff Layton int err;
35898340f22cSJeff Layton
35908340f22cSJeff Layton dout("do_request on %p\n", req);
35918340f22cSJeff Layton
35928340f22cSJeff Layton /* issue */
35938340f22cSJeff Layton err = ceph_mdsc_submit_request(mdsc, dir, req);
35948340f22cSJeff Layton if (!err)
35959eaa7b79SJeff Layton err = ceph_mdsc_wait_request(mdsc, req, NULL);
35962f2dc053SSage Weil dout("do_request %p done, result %d\n", req, err);
35972f2dc053SSage Weil return err;
35982f2dc053SSage Weil }
35992f2dc053SSage Weil
36002f2dc053SSage Weil /*
36012f276c51SYan, Zheng * Invalidate dir's completeness, dentry lease state on an aborted MDS
3602167c9e35SSage Weil * namespace request.
3603167c9e35SSage Weil */
ceph_invalidate_dir_request(struct ceph_mds_request * req)3604167c9e35SSage Weil void ceph_invalidate_dir_request(struct ceph_mds_request *req)
3605167c9e35SSage Weil {
36068d8f371cSYan, Zheng struct inode *dir = req->r_parent;
36078d8f371cSYan, Zheng struct inode *old_dir = req->r_old_dentry_dir;
3608167c9e35SSage Weil
36098d8f371cSYan, Zheng dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
3610167c9e35SSage Weil
36118d8f371cSYan, Zheng ceph_dir_clear_complete(dir);
36128d8f371cSYan, Zheng if (old_dir)
36138d8f371cSYan, Zheng ceph_dir_clear_complete(old_dir);
3614167c9e35SSage Weil if (req->r_dentry)
3615167c9e35SSage Weil ceph_invalidate_dentry_lease(req->r_dentry);
3616167c9e35SSage Weil if (req->r_old_dentry)
3617167c9e35SSage Weil ceph_invalidate_dentry_lease(req->r_old_dentry);
3618167c9e35SSage Weil }
3619167c9e35SSage Weil
3620167c9e35SSage Weil /*
36212f2dc053SSage Weil * Handle mds reply.
36222f2dc053SSage Weil *
36232f2dc053SSage Weil * We take the session mutex and parse and process the reply immediately.
36242f2dc053SSage Weil * This preserves the logical ordering of replies, capabilities, etc., sent
36252f2dc053SSage Weil * by the MDS as they are applied to our local cache.
36262f2dc053SSage Weil */
handle_reply(struct ceph_mds_session * session,struct ceph_msg * msg)36272f2dc053SSage Weil static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
36282f2dc053SSage Weil {
36292f2dc053SSage Weil struct ceph_mds_client *mdsc = session->s_mdsc;
36302f2dc053SSage Weil struct ceph_mds_request *req;
36312f2dc053SSage Weil struct ceph_mds_reply_head *head = msg->front.iov_base;
36322f2dc053SSage Weil struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
3633982d6011SYan, Zheng struct ceph_snap_realm *realm;
36342f2dc053SSage Weil u64 tid;
36352f2dc053SSage Weil int err, result;
36362600d2ddSSage Weil int mds = session->s_mds;
3637a68e564aSXiubo Li bool close_sessions = false;
36382f2dc053SSage Weil
36392f2dc053SSage Weil if (msg->front.iov_len < sizeof(*head)) {
36402f2dc053SSage Weil pr_err("mdsc_handle_reply got corrupt (short) reply\n");
36419ec7cab1SSage Weil ceph_msg_dump(msg);
36422f2dc053SSage Weil return;
36432f2dc053SSage Weil }
36442f2dc053SSage Weil
36452f2dc053SSage Weil /* get request, session */
36466df058c0SSage Weil tid = le64_to_cpu(msg->hdr.tid);
36472f2dc053SSage Weil mutex_lock(&mdsc->mutex);
3648fcd00b68SIlya Dryomov req = lookup_get_request(mdsc, tid);
36492f2dc053SSage Weil if (!req) {
36502f2dc053SSage Weil dout("handle_reply on unknown tid %llu\n", tid);
36512f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
36522f2dc053SSage Weil return;
36532f2dc053SSage Weil }
36542f2dc053SSage Weil dout("handle_reply %p\n", req);
36552f2dc053SSage Weil
36562f2dc053SSage Weil /* correct session? */
3657d96d6049SSage Weil if (req->r_session != session) {
36582f2dc053SSage Weil pr_err("mdsc_handle_reply got %llu on session mds%d"
36592f2dc053SSage Weil " not mds%d\n", tid, session->s_mds,
36602f2dc053SSage Weil req->r_session ? req->r_session->s_mds : -1);
36612f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
36622f2dc053SSage Weil goto out;
36632f2dc053SSage Weil }
36642f2dc053SSage Weil
36652f2dc053SSage Weil /* dup? */
3666bc2de10dSJeff Layton if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
3667bc2de10dSJeff Layton (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
3668f3ae1b97SFabian Frederick pr_warn("got a dup %s reply on %llu from mds%d\n",
36692f2dc053SSage Weil head->safe ? "safe" : "unsafe", tid, mds);
36702f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
36712f2dc053SSage Weil goto out;
36722f2dc053SSage Weil }
3673bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
3674f3ae1b97SFabian Frederick pr_warn("got unsafe after safe on %llu from mds%d\n",
367585792d0dSSage Weil tid, mds);
367685792d0dSSage Weil mutex_unlock(&mdsc->mutex);
367785792d0dSSage Weil goto out;
367885792d0dSSage Weil }
36792f2dc053SSage Weil
36802f2dc053SSage Weil result = le32_to_cpu(head->result);
36812f2dc053SSage Weil
36822f2dc053SSage Weil if (head->safe) {
3683bc2de10dSJeff Layton set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
36842f2dc053SSage Weil __unregister_request(mdsc, req);
36852f2dc053SSage Weil
368607edc057SXiubo Li /* last request during umount? */
368707edc057SXiubo Li if (mdsc->stopping && !__get_oldest_req(mdsc))
368807edc057SXiubo Li complete_all(&mdsc->safe_umount_waiters);
368907edc057SXiubo Li
3690bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
36912f2dc053SSage Weil /*
36922f2dc053SSage Weil * We already handled the unsafe response, now do the
36932f2dc053SSage Weil * cleanup. No need to examine the response; the MDS
36942f2dc053SSage Weil * doesn't include any result info in the safe
36952f2dc053SSage Weil * response. And even if it did, there is nothing
36962f2dc053SSage Weil * useful we could do with a revised return value.
36972f2dc053SSage Weil */
36982f2dc053SSage Weil dout("got safe reply %llu, mds%d\n", tid, mds);
36992f2dc053SSage Weil
37002f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
37012f2dc053SSage Weil goto out;
37022f2dc053SSage Weil }
3703e1518c7cSSage Weil } else {
3704bc2de10dSJeff Layton set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
37052f2dc053SSage Weil list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
37062f2dc053SSage Weil }
37072f2dc053SSage Weil
37082f2dc053SSage Weil dout("handle_reply tid %lld result %d\n", tid, result);
3709b37fe1f9SYan, Zheng if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
37103859af9eSXiubo Li err = parse_reply_info(session, msg, req, (u64)-1);
3711b37fe1f9SYan, Zheng else
37123859af9eSXiubo Li err = parse_reply_info(session, msg, req,
37133859af9eSXiubo Li session->s_con.peer_features);
37142f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
37152f2dc053SSage Weil
3716bca9fc14SJeff Layton /* Must find target inode outside of mutexes to avoid deadlocks */
37173859af9eSXiubo Li rinfo = &req->r_reply_info;
3718bca9fc14SJeff Layton if ((err >= 0) && rinfo->head->is_target) {
3719ec9595c0SJeff Layton struct inode *in = xchg(&req->r_new_inode, NULL);
3720bca9fc14SJeff Layton struct ceph_vino tvino = {
3721bca9fc14SJeff Layton .ino = le64_to_cpu(rinfo->targeti.in->ino),
3722bca9fc14SJeff Layton .snap = le64_to_cpu(rinfo->targeti.in->snapid)
3723bca9fc14SJeff Layton };
3724bca9fc14SJeff Layton
3725ec9595c0SJeff Layton /*
3726ec9595c0SJeff Layton * If we ended up opening an existing inode, discard
3727ec9595c0SJeff Layton * r_new_inode
3728ec9595c0SJeff Layton */
3729ec9595c0SJeff Layton if (req->r_op == CEPH_MDS_OP_CREATE &&
3730ec9595c0SJeff Layton !req->r_reply_info.has_create_ino) {
3731ec9595c0SJeff Layton /* This should never happen on an async create */
3732ec9595c0SJeff Layton WARN_ON_ONCE(req->r_deleg_ino);
3733ec9595c0SJeff Layton iput(in);
3734ec9595c0SJeff Layton in = NULL;
3735ec9595c0SJeff Layton }
3736ec9595c0SJeff Layton
3737ec9595c0SJeff Layton in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
3738bca9fc14SJeff Layton if (IS_ERR(in)) {
3739bca9fc14SJeff Layton err = PTR_ERR(in);
3740bca9fc14SJeff Layton mutex_lock(&session->s_mutex);
3741bca9fc14SJeff Layton goto out_err;
3742bca9fc14SJeff Layton }
3743bca9fc14SJeff Layton req->r_target_inode = in;
3744bca9fc14SJeff Layton }
3745bca9fc14SJeff Layton
37462f2dc053SSage Weil mutex_lock(&session->s_mutex);
37472f2dc053SSage Weil if (err < 0) {
374825933abdSHerb Shiu pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
37499ec7cab1SSage Weil ceph_msg_dump(msg);
37502f2dc053SSage Weil goto out_err;
37512f2dc053SSage Weil }
37522f2dc053SSage Weil
37532f2dc053SSage Weil /* snap trace */
3754982d6011SYan, Zheng realm = NULL;
37552f2dc053SSage Weil if (rinfo->snapblob_len) {
37562f2dc053SSage Weil down_write(&mdsc->snap_rwsem);
3757a68e564aSXiubo Li err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
37582f2dc053SSage Weil rinfo->snapblob + rinfo->snapblob_len,
3759982d6011SYan, Zheng le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
3760982d6011SYan, Zheng &realm);
3761a68e564aSXiubo Li if (err) {
3762a68e564aSXiubo Li up_write(&mdsc->snap_rwsem);
3763a68e564aSXiubo Li close_sessions = true;
3764a68e564aSXiubo Li if (err == -EIO)
3765a68e564aSXiubo Li ceph_msg_dump(msg);
3766a68e564aSXiubo Li goto out_err;
3767a68e564aSXiubo Li }
37682f2dc053SSage Weil downgrade_write(&mdsc->snap_rwsem);
37692f2dc053SSage Weil } else {
37702f2dc053SSage Weil down_read(&mdsc->snap_rwsem);
37712f2dc053SSage Weil }
37722f2dc053SSage Weil
37732f2dc053SSage Weil /* insert trace into our cache */
3774b4556396SSage Weil mutex_lock(&req->r_fill_mutex);
3775315f2408SYan, Zheng current->journal_info = req;
3776f5a03b08SJeff Layton err = ceph_fill_trace(mdsc->fsc->sb, req);
37772f2dc053SSage Weil if (err == 0) {
37786e8575faSSam Lang if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
377981c6aea5SYan, Zheng req->r_op == CEPH_MDS_OP_LSSNAP))
3780af9ffa6dSXiubo Li err = ceph_readdir_prepopulate(req, req->r_session);
37812f2dc053SSage Weil }
3782315f2408SYan, Zheng current->journal_info = NULL;
3783b4556396SSage Weil mutex_unlock(&req->r_fill_mutex);
37842f2dc053SSage Weil
37852f2dc053SSage Weil up_read(&mdsc->snap_rwsem);
3786982d6011SYan, Zheng if (realm)
3787982d6011SYan, Zheng ceph_put_snap_realm(mdsc, realm);
378868cd5b4bSYan, Zheng
3789fe33032dSYan, Zheng if (err == 0) {
3790fe33032dSYan, Zheng if (req->r_target_inode &&
3791bc2de10dSJeff Layton test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
3792fe33032dSYan, Zheng struct ceph_inode_info *ci =
3793fe33032dSYan, Zheng ceph_inode(req->r_target_inode);
379468cd5b4bSYan, Zheng spin_lock(&ci->i_unsafe_lock);
3795fe33032dSYan, Zheng list_add_tail(&req->r_unsafe_target_item,
3796fe33032dSYan, Zheng &ci->i_unsafe_iops);
379768cd5b4bSYan, Zheng spin_unlock(&ci->i_unsafe_lock);
379868cd5b4bSYan, Zheng }
3799fe33032dSYan, Zheng
3800fe33032dSYan, Zheng ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
3801fe33032dSYan, Zheng }
38022f2dc053SSage Weil out_err:
3803e1518c7cSSage Weil mutex_lock(&mdsc->mutex);
3804bc2de10dSJeff Layton if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
38052f2dc053SSage Weil if (err) {
38062f2dc053SSage Weil req->r_err = err;
38072f2dc053SSage Weil } else {
38085fdb1389SJianpeng Ma req->r_reply = ceph_msg_get(msg);
3809bc2de10dSJeff Layton set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
38102f2dc053SSage Weil }
3811e1518c7cSSage Weil } else {
3812e1518c7cSSage Weil dout("reply arrived after request %lld was aborted\n", tid);
3813e1518c7cSSage Weil }
3814e1518c7cSSage Weil mutex_unlock(&mdsc->mutex);
38152f2dc053SSage Weil
38162f2dc053SSage Weil mutex_unlock(&session->s_mutex);
38172f2dc053SSage Weil
38182f2dc053SSage Weil /* kick calling process */
38192f2dc053SSage Weil complete_request(mdsc, req);
382070c94820SXiubo Li
38218ae99ae2SXiubo Li ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
382270c94820SXiubo Li req->r_end_latency, err);
38232f2dc053SSage Weil out:
38242f2dc053SSage Weil ceph_mdsc_put_request(req);
3825a68e564aSXiubo Li
3826a68e564aSXiubo Li /* Defer closing the sessions after s_mutex lock being released */
3827a68e564aSXiubo Li if (close_sessions)
3828a68e564aSXiubo Li ceph_mdsc_close_sessions(mdsc);
38292f2dc053SSage Weil return;
38302f2dc053SSage Weil }
38312f2dc053SSage Weil
38322f2dc053SSage Weil
38332f2dc053SSage Weil
38342f2dc053SSage Weil /*
38352f2dc053SSage Weil * handle mds notification that our request has been forwarded.
38362f2dc053SSage Weil */
handle_forward(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,struct ceph_msg * msg)38372600d2ddSSage Weil static void handle_forward(struct ceph_mds_client *mdsc,
38382600d2ddSSage Weil struct ceph_mds_session *session,
38392600d2ddSSage Weil struct ceph_msg *msg)
38402f2dc053SSage Weil {
38412f2dc053SSage Weil struct ceph_mds_request *req;
3842a1ea787cSSage Weil u64 tid = le64_to_cpu(msg->hdr.tid);
38432f2dc053SSage Weil u32 next_mds;
38442f2dc053SSage Weil u32 fwd_seq;
38452f2dc053SSage Weil int err = -EINVAL;
38462f2dc053SSage Weil void *p = msg->front.iov_base;
38472f2dc053SSage Weil void *end = p + msg->front.iov_len;
38481980b1bfSXiubo Li bool aborted = false;
38492f2dc053SSage Weil
3850a1ea787cSSage Weil ceph_decode_need(&p, end, 2*sizeof(u32), bad);
3851c89136eaSSage Weil next_mds = ceph_decode_32(&p);
3852c89136eaSSage Weil fwd_seq = ceph_decode_32(&p);
38532f2dc053SSage Weil
38542f2dc053SSage Weil mutex_lock(&mdsc->mutex);
3855fcd00b68SIlya Dryomov req = lookup_get_request(mdsc, tid);
38562f2dc053SSage Weil if (!req) {
38571980b1bfSXiubo Li mutex_unlock(&mdsc->mutex);
38582a8e5e36SSage Weil dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
38591980b1bfSXiubo Li return; /* dup reply? */
38602f2dc053SSage Weil }
38612f2dc053SSage Weil
3862bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
38632a8e5e36SSage Weil dout("forward tid %llu aborted, unregistering\n", tid);
38642a8e5e36SSage Weil __unregister_request(mdsc, req);
3865ce0d5bd3SXiubo Li } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
38661980b1bfSXiubo Li /*
3867ce0d5bd3SXiubo Li * Avoid inifinite retrying after overflow.
38681980b1bfSXiubo Li *
3869ce0d5bd3SXiubo Li * The MDS will increase the fwd count and in client side
3870ce0d5bd3SXiubo Li * if the num_fwd is less than the one saved in request
3871ce0d5bd3SXiubo Li * that means the MDS is an old version and overflowed of
3872ce0d5bd3SXiubo Li * 8 bits.
38731980b1bfSXiubo Li */
38741980b1bfSXiubo Li mutex_lock(&req->r_fill_mutex);
38751980b1bfSXiubo Li req->r_err = -EMULTIHOP;
38761980b1bfSXiubo Li set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
38771980b1bfSXiubo Li mutex_unlock(&req->r_fill_mutex);
38781980b1bfSXiubo Li aborted = true;
3879ce0d5bd3SXiubo Li pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
38802f2dc053SSage Weil } else {
38812f2dc053SSage Weil /* resend. forward race not possible; mds would drop */
38822a8e5e36SSage Weil dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
38832a8e5e36SSage Weil BUG_ON(req->r_err);
3884bc2de10dSJeff Layton BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
38853de22be6SYan, Zheng req->r_attempts = 0;
38862f2dc053SSage Weil req->r_num_fwd = fwd_seq;
38872f2dc053SSage Weil req->r_resend_mds = next_mds;
38882f2dc053SSage Weil put_request_session(req);
38892f2dc053SSage Weil __do_request(mdsc, req);
38902f2dc053SSage Weil }
38912f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
38921980b1bfSXiubo Li
38931980b1bfSXiubo Li /* kick calling process */
38941980b1bfSXiubo Li if (aborted)
38951980b1bfSXiubo Li complete_request(mdsc, req);
38961980b1bfSXiubo Li ceph_mdsc_put_request(req);
38972f2dc053SSage Weil return;
38982f2dc053SSage Weil
38992f2dc053SSage Weil bad:
39002f2dc053SSage Weil pr_err("mdsc_handle_forward decode error err=%d\n", err);
39018b0da5c5SXiubo Li ceph_msg_dump(msg);
39022f2dc053SSage Weil }
39032f2dc053SSage Weil
/*
 * Walk the map<string,string> of session metadata starting at *p and
 * advance *p past it.
 *
 * *blocklisted is set to true when the value stored under the
 * "error_string" key contains "blacklisted".  It is never cleared here,
 * so the caller must initialise it.
 *
 * Returns 0 on success, or -1 if the map runs past @end.
 */
static int __decode_session_metadata(void **p, void *end,
				     bool *blocklisted)
{
	/* map<string,string> */
	u32 n;

	ceph_decode_32_safe(p, end, n, bad);
	while (n-- > 0) {
		bool err_str;
		u32 len;

		/* key */
		ceph_decode_32_safe(p, end, len, bad);
		ceph_decode_need(p, end, len, bad);
		/*
		 * Keys on the wire are length-prefixed, not NUL terminated.
		 * Require the length to match as well: strncmp() bounded
		 * only by len would also accept an empty key or any prefix
		 * of "error_string".
		 */
		err_str = len == sizeof("error_string") - 1 &&
			  !strncmp(*p, "error_string", len);
		*p += len;

		/* value */
		ceph_decode_32_safe(p, end, len, bad);
		ceph_decode_need(p, end, len, bad);
		/*
		 * Match "blocklisted (blacklisted)" from newer MDSes,
		 * or "blacklisted" from older MDSes.
		 */
		if (err_str && strnstr(*p, "blacklisted", len))
			*blocklisted = true;
		*p += len;
	}
	return 0;
bad:
	return -1;
}
393184bf3950SYan, Zheng
39322f2dc053SSage Weil /*
39332f2dc053SSage Weil * handle a mds session control message
39342f2dc053SSage Weil */
handle_session(struct ceph_mds_session * session,struct ceph_msg * msg)39352f2dc053SSage Weil static void handle_session(struct ceph_mds_session *session,
39362f2dc053SSage Weil struct ceph_msg *msg)
39372f2dc053SSage Weil {
39382f2dc053SSage Weil struct ceph_mds_client *mdsc = session->s_mdsc;
393984bf3950SYan, Zheng int mds = session->s_mds;
394084bf3950SYan, Zheng int msg_version = le16_to_cpu(msg->hdr.version);
394184bf3950SYan, Zheng void *p = msg->front.iov_base;
394284bf3950SYan, Zheng void *end = p + msg->front.iov_len;
394384bf3950SYan, Zheng struct ceph_mds_session_head *h;
39442f2dc053SSage Weil u32 op;
39450fa82633SJeff Layton u64 seq, features = 0;
39462f2dc053SSage Weil int wake = 0;
39470b98acd6SIlya Dryomov bool blocklisted = false;
39482f2dc053SSage Weil
39492f2dc053SSage Weil /* decode */
395084bf3950SYan, Zheng ceph_decode_need(&p, end, sizeof(*h), bad);
395184bf3950SYan, Zheng h = p;
395284bf3950SYan, Zheng p += sizeof(*h);
395384bf3950SYan, Zheng
39542f2dc053SSage Weil op = le32_to_cpu(h->op);
39552f2dc053SSage Weil seq = le64_to_cpu(h->seq);
39562f2dc053SSage Weil
395784bf3950SYan, Zheng if (msg_version >= 3) {
395884bf3950SYan, Zheng u32 len;
3959e1c9788cSKotresh HR /* version >= 2 and < 5, decode metadata, skip otherwise
3960e1c9788cSKotresh HR * as it's handled via flags.
3961e1c9788cSKotresh HR */
3962e1c9788cSKotresh HR if (msg_version >= 5)
3963e1c9788cSKotresh HR ceph_decode_skip_map(&p, end, string, string, bad);
3964e1c9788cSKotresh HR else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
396584bf3950SYan, Zheng goto bad;
3966e1c9788cSKotresh HR
396784bf3950SYan, Zheng /* version >= 3, feature bits */
396884bf3950SYan, Zheng ceph_decode_32_safe(&p, end, len, bad);
396902e37571SJeff Layton if (len) {
39700fa82633SJeff Layton ceph_decode_64_safe(&p, end, features, bad);
39710fa82633SJeff Layton p += len - sizeof(features);
397284bf3950SYan, Zheng }
397302e37571SJeff Layton }
397484bf3950SYan, Zheng
3975e1c9788cSKotresh HR if (msg_version >= 5) {
3976ea16567fSLuís Henriques u32 flags, len;
3977ea16567fSLuís Henriques
3978ea16567fSLuís Henriques /* version >= 4 */
3979ea16567fSLuís Henriques ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
3980ea16567fSLuís Henriques ceph_decode_32_safe(&p, end, len, bad); /* len */
3981ea16567fSLuís Henriques ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
3982ea16567fSLuís Henriques
3983e1c9788cSKotresh HR /* version >= 5, flags */
3984e1c9788cSKotresh HR ceph_decode_32_safe(&p, end, flags, bad);
3985e1c9788cSKotresh HR if (flags & CEPH_SESSION_BLOCKLISTED) {
3986e1c9788cSKotresh HR pr_warn("mds%d session blocklisted\n", session->s_mds);
3987e1c9788cSKotresh HR blocklisted = true;
3988e1c9788cSKotresh HR }
3989e1c9788cSKotresh HR }
3990e1c9788cSKotresh HR
39912f2dc053SSage Weil mutex_lock(&mdsc->mutex);
39920a07fc8cSYan, Zheng if (op == CEPH_SESSION_CLOSE) {
39935b3248c6SXiubo Li ceph_get_mds_session(session);
39942600d2ddSSage Weil __unregister_session(mdsc, session);
39950a07fc8cSYan, Zheng }
39962f2dc053SSage Weil /* FIXME: this ttl calculation is generous */
39972f2dc053SSage Weil session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
39982f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
39992f2dc053SSage Weil
40002f2dc053SSage Weil mutex_lock(&session->s_mutex);
40012f2dc053SSage Weil
40022f2dc053SSage Weil dout("handle_session mds%d %s %p state %s seq %llu\n",
40032f2dc053SSage Weil mds, ceph_session_op_name(op), session,
4004a687ecafSJohn Spray ceph_session_state_name(session->s_state), seq);
40052f2dc053SSage Weil
40062f2dc053SSage Weil if (session->s_state == CEPH_MDS_SESSION_HUNG) {
40072f2dc053SSage Weil session->s_state = CEPH_MDS_SESSION_OPEN;
40082f2dc053SSage Weil pr_info("mds%d came back\n", session->s_mds);
40092f2dc053SSage Weil }
40102f2dc053SSage Weil
40112f2dc053SSage Weil switch (op) {
40122f2dc053SSage Weil case CEPH_SESSION_OPEN:
401329790f26SSage Weil if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
401429790f26SSage Weil pr_info("mds%d reconnect success\n", session->s_mds);
4015300e42a2SXiubo Li
4016987219b3SVenky Shankar session->s_features = features;
4017300e42a2SXiubo Li if (session->s_state == CEPH_MDS_SESSION_OPEN) {
4018300e42a2SXiubo Li pr_notice("mds%d is already opened\n", session->s_mds);
4019300e42a2SXiubo Li } else {
40202f2dc053SSage Weil session->s_state = CEPH_MDS_SESSION_OPEN;
40212f2dc053SSage Weil renewed_caps(mdsc, session, 0);
4022300e42a2SXiubo Li if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
4023300e42a2SXiubo Li &session->s_features))
402418f473b3SXiubo Li metric_schedule_delayed(&mdsc->metric);
4025300e42a2SXiubo Li }
4026300e42a2SXiubo Li
4027300e42a2SXiubo Li /*
4028300e42a2SXiubo Li * The connection maybe broken and the session in client
4029300e42a2SXiubo Li * side has been reinitialized, need to update the seq
4030300e42a2SXiubo Li * anyway.
4031300e42a2SXiubo Li */
4032300e42a2SXiubo Li if (!session->s_seq && seq)
4033300e42a2SXiubo Li session->s_seq = seq;
4034300e42a2SXiubo Li
40352f2dc053SSage Weil wake = 1;
40362f2dc053SSage Weil if (mdsc->stopping)
40372f2dc053SSage Weil __close_session(mdsc, session);
40382f2dc053SSage Weil break;
40392f2dc053SSage Weil
40402f2dc053SSage Weil case CEPH_SESSION_RENEWCAPS:
40412f2dc053SSage Weil if (session->s_renew_seq == seq)
40422f2dc053SSage Weil renewed_caps(mdsc, session, 1);
40432f2dc053SSage Weil break;
40442f2dc053SSage Weil
40452f2dc053SSage Weil case CEPH_SESSION_CLOSE:
404629790f26SSage Weil if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
404729790f26SSage Weil pr_info("mds%d reconnect denied\n", session->s_mds);
40484d681c2fSXiubo Li session->s_state = CEPH_MDS_SESSION_CLOSED;
40491c841a96SYan, Zheng cleanup_session_requests(mdsc, session);
40502f2dc053SSage Weil remove_session_caps(session);
4051656e4382SYan, Zheng wake = 2; /* for good measure */
4052f3c60c59SSage Weil wake_up_all(&mdsc->session_close_wq);
40532f2dc053SSage Weil break;
40542f2dc053SSage Weil
40552f2dc053SSage Weil case CEPH_SESSION_STALE:
40562f2dc053SSage Weil pr_info("mds%d caps went stale, renewing\n",
40572f2dc053SSage Weil session->s_mds);
405852d60f8eSJeff Layton atomic_inc(&session->s_cap_gen);
40591ce208a6SAlex Elder session->s_cap_ttl = jiffies - 1;
40602f2dc053SSage Weil send_renew_caps(mdsc, session);
40612f2dc053SSage Weil break;
40622f2dc053SSage Weil
40632f2dc053SSage Weil case CEPH_SESSION_RECALL_STATE:
4064e30ee581SZhi Zhang ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
40652f2dc053SSage Weil break;
40662f2dc053SSage Weil
4067186e4f7aSYan, Zheng case CEPH_SESSION_FLUSHMSG:
4068e7d84c6aSXiubo Li /* flush cap releases */
4069e7d84c6aSXiubo Li spin_lock(&session->s_cap_lock);
4070e7d84c6aSXiubo Li if (session->s_num_cap_releases)
4071e7d84c6aSXiubo Li ceph_flush_cap_releases(mdsc, session);
4072e7d84c6aSXiubo Li spin_unlock(&session->s_cap_lock);
4073e7d84c6aSXiubo Li
4074186e4f7aSYan, Zheng send_flushmsg_ack(mdsc, session, seq);
4075186e4f7aSYan, Zheng break;
4076186e4f7aSYan, Zheng
407703f4fcb0SYan, Zheng case CEPH_SESSION_FORCE_RO:
407803f4fcb0SYan, Zheng dout("force_session_readonly %p\n", session);
407903f4fcb0SYan, Zheng spin_lock(&session->s_cap_lock);
408003f4fcb0SYan, Zheng session->s_readonly = true;
408103f4fcb0SYan, Zheng spin_unlock(&session->s_cap_lock);
4082d2f8bb27SYan, Zheng wake_up_session_caps(session, FORCE_RO);
408303f4fcb0SYan, Zheng break;
408403f4fcb0SYan, Zheng
4085fcff415cSYan, Zheng case CEPH_SESSION_REJECT:
4086fcff415cSYan, Zheng WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
4087fcff415cSYan, Zheng pr_info("mds%d rejected session\n", session->s_mds);
4088fcff415cSYan, Zheng session->s_state = CEPH_MDS_SESSION_REJECTED;
4089fcff415cSYan, Zheng cleanup_session_requests(mdsc, session);
4090fcff415cSYan, Zheng remove_session_caps(session);
40910b98acd6SIlya Dryomov if (blocklisted)
40920b98acd6SIlya Dryomov mdsc->fsc->blocklisted = true;
4093fcff415cSYan, Zheng wake = 2; /* for good measure */
4094fcff415cSYan, Zheng break;
4095fcff415cSYan, Zheng
40962f2dc053SSage Weil default:
40972f2dc053SSage Weil pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
40982f2dc053SSage Weil WARN_ON(1);
40992f2dc053SSage Weil }
41002f2dc053SSage Weil
41012f2dc053SSage Weil mutex_unlock(&session->s_mutex);
41022f2dc053SSage Weil if (wake) {
41032f2dc053SSage Weil mutex_lock(&mdsc->mutex);
41042f2dc053SSage Weil __wake_requests(mdsc, &session->s_waiting);
4105656e4382SYan, Zheng if (wake == 2)
4106656e4382SYan, Zheng kick_requests(mdsc, mds);
41072f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
41082f2dc053SSage Weil }
41090a07fc8cSYan, Zheng if (op == CEPH_SESSION_CLOSE)
41100a07fc8cSYan, Zheng ceph_put_mds_session(session);
41112f2dc053SSage Weil return;
41122f2dc053SSage Weil
41132f2dc053SSage Weil bad:
41142f2dc053SSage Weil pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
41152f2dc053SSage Weil (int)msg->front.iov_len);
41169ec7cab1SSage Weil ceph_msg_dump(msg);
41172f2dc053SSage Weil return;
41182f2dc053SSage Weil }
41192f2dc053SSage Weil
ceph_mdsc_release_dir_caps(struct ceph_mds_request * req)4120a25949b9SJeff Layton void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
4121a25949b9SJeff Layton {
4122a25949b9SJeff Layton int dcaps;
4123a25949b9SJeff Layton
4124a25949b9SJeff Layton dcaps = xchg(&req->r_dir_caps, 0);
4125a25949b9SJeff Layton if (dcaps) {
4126a25949b9SJeff Layton dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
4127a25949b9SJeff Layton ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
4128a25949b9SJeff Layton }
4129a25949b9SJeff Layton }
4130a25949b9SJeff Layton
ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request * req)4131e64f44a8SXiubo Li void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
4132e64f44a8SXiubo Li {
4133e64f44a8SXiubo Li int dcaps;
4134e64f44a8SXiubo Li
4135e64f44a8SXiubo Li dcaps = xchg(&req->r_dir_caps, 0);
4136e64f44a8SXiubo Li if (dcaps) {
4137e64f44a8SXiubo Li dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
4138e64f44a8SXiubo Li ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
4139e64f44a8SXiubo Li dcaps);
4140e64f44a8SXiubo Li }
4141e64f44a8SXiubo Li }
4142e64f44a8SXiubo Li
41432f2dc053SSage Weil /*
41442f2dc053SSage Weil * called under session->mutex.
41452f2dc053SSage Weil */
replay_unsafe_requests(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)41462f2dc053SSage Weil static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
41472f2dc053SSage Weil struct ceph_mds_session *session)
41482f2dc053SSage Weil {
41492f2dc053SSage Weil struct ceph_mds_request *req, *nreq;
41503de22be6SYan, Zheng struct rb_node *p;
41512f2dc053SSage Weil
41522f2dc053SSage Weil dout("replay_unsafe_requests mds%d\n", session->s_mds);
41532f2dc053SSage Weil
41542f2dc053SSage Weil mutex_lock(&mdsc->mutex);
41559cf54563SXiubo Li list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
4156396bd62cSJeff Layton __send_request(session, req, true);
41573de22be6SYan, Zheng
41583de22be6SYan, Zheng /*
41593de22be6SYan, Zheng * also re-send old requests when MDS enters reconnect stage. So that MDS
41603de22be6SYan, Zheng * can process completed request in clientreplay stage.
41613de22be6SYan, Zheng */
41623de22be6SYan, Zheng p = rb_first(&mdsc->request_tree);
41633de22be6SYan, Zheng while (p) {
41643de22be6SYan, Zheng req = rb_entry(p, struct ceph_mds_request, r_node);
41653de22be6SYan, Zheng p = rb_next(p);
4166bc2de10dSJeff Layton if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
41673de22be6SYan, Zheng continue;
41683de22be6SYan, Zheng if (req->r_attempts == 0)
41693de22be6SYan, Zheng continue; /* only old requests */
4170a25949b9SJeff Layton if (!req->r_session)
4171a25949b9SJeff Layton continue;
4172a25949b9SJeff Layton if (req->r_session->s_mds != session->s_mds)
4173a25949b9SJeff Layton continue;
4174a25949b9SJeff Layton
4175e64f44a8SXiubo Li ceph_mdsc_release_dir_caps_no_check(req);
4176a25949b9SJeff Layton
4177396bd62cSJeff Layton __send_request(session, req, true);
41783de22be6SYan, Zheng }
41792f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
41802f2dc053SSage Weil }
41812f2dc053SSage Weil
send_reconnect_partial(struct ceph_reconnect_state * recon_state)418281c5a148SYan, Zheng static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
418381c5a148SYan, Zheng {
418481c5a148SYan, Zheng struct ceph_msg *reply;
418581c5a148SYan, Zheng struct ceph_pagelist *_pagelist;
418681c5a148SYan, Zheng struct page *page;
418781c5a148SYan, Zheng __le32 *addr;
418881c5a148SYan, Zheng int err = -ENOMEM;
418981c5a148SYan, Zheng
419081c5a148SYan, Zheng if (!recon_state->allow_multi)
419181c5a148SYan, Zheng return -ENOSPC;
419281c5a148SYan, Zheng
419381c5a148SYan, Zheng /* can't handle message that contains both caps and realm */
419481c5a148SYan, Zheng BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
419581c5a148SYan, Zheng
419681c5a148SYan, Zheng /* pre-allocate new pagelist */
419781c5a148SYan, Zheng _pagelist = ceph_pagelist_alloc(GFP_NOFS);
419881c5a148SYan, Zheng if (!_pagelist)
419981c5a148SYan, Zheng return -ENOMEM;
420081c5a148SYan, Zheng
420181c5a148SYan, Zheng reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
420281c5a148SYan, Zheng if (!reply)
420381c5a148SYan, Zheng goto fail_msg;
420481c5a148SYan, Zheng
420581c5a148SYan, Zheng /* placeholder for nr_caps */
420681c5a148SYan, Zheng err = ceph_pagelist_encode_32(_pagelist, 0);
420781c5a148SYan, Zheng if (err < 0)
420881c5a148SYan, Zheng goto fail;
420981c5a148SYan, Zheng
421081c5a148SYan, Zheng if (recon_state->nr_caps) {
421181c5a148SYan, Zheng /* currently encoding caps */
421281c5a148SYan, Zheng err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
421381c5a148SYan, Zheng if (err)
421481c5a148SYan, Zheng goto fail;
421581c5a148SYan, Zheng } else {
421681c5a148SYan, Zheng /* placeholder for nr_realms (currently encoding relams) */
421781c5a148SYan, Zheng err = ceph_pagelist_encode_32(_pagelist, 0);
421881c5a148SYan, Zheng if (err < 0)
421981c5a148SYan, Zheng goto fail;
422081c5a148SYan, Zheng }
422181c5a148SYan, Zheng
422281c5a148SYan, Zheng err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
422381c5a148SYan, Zheng if (err)
422481c5a148SYan, Zheng goto fail;
422581c5a148SYan, Zheng
422681c5a148SYan, Zheng page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
422781c5a148SYan, Zheng addr = kmap_atomic(page);
422881c5a148SYan, Zheng if (recon_state->nr_caps) {
422981c5a148SYan, Zheng /* currently encoding caps */
423081c5a148SYan, Zheng *addr = cpu_to_le32(recon_state->nr_caps);
423181c5a148SYan, Zheng } else {
423281c5a148SYan, Zheng /* currently encoding relams */
423381c5a148SYan, Zheng *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
423481c5a148SYan, Zheng }
423581c5a148SYan, Zheng kunmap_atomic(addr);
423681c5a148SYan, Zheng
423781c5a148SYan, Zheng reply->hdr.version = cpu_to_le16(5);
423881c5a148SYan, Zheng reply->hdr.compat_version = cpu_to_le16(4);
423981c5a148SYan, Zheng
424081c5a148SYan, Zheng reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
424181c5a148SYan, Zheng ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
424281c5a148SYan, Zheng
424381c5a148SYan, Zheng ceph_con_send(&recon_state->session->s_con, reply);
424481c5a148SYan, Zheng ceph_pagelist_release(recon_state->pagelist);
424581c5a148SYan, Zheng
424681c5a148SYan, Zheng recon_state->pagelist = _pagelist;
424781c5a148SYan, Zheng recon_state->nr_caps = 0;
424881c5a148SYan, Zheng recon_state->nr_realms = 0;
424981c5a148SYan, Zheng recon_state->msg_version = 5;
425081c5a148SYan, Zheng return 0;
425181c5a148SYan, Zheng fail:
425281c5a148SYan, Zheng ceph_msg_put(reply);
425381c5a148SYan, Zheng fail_msg:
425481c5a148SYan, Zheng ceph_pagelist_release(_pagelist);
425581c5a148SYan, Zheng return err;
425681c5a148SYan, Zheng }
425781c5a148SYan, Zheng
d_find_primary(struct inode * inode)4258a33f6432SYan, Zheng static struct dentry* d_find_primary(struct inode *inode)
4259a33f6432SYan, Zheng {
4260a33f6432SYan, Zheng struct dentry *alias, *dn = NULL;
4261a33f6432SYan, Zheng
4262a33f6432SYan, Zheng if (hlist_empty(&inode->i_dentry))
4263a33f6432SYan, Zheng return NULL;
4264a33f6432SYan, Zheng
4265a33f6432SYan, Zheng spin_lock(&inode->i_lock);
4266a33f6432SYan, Zheng if (hlist_empty(&inode->i_dentry))
4267a33f6432SYan, Zheng goto out_unlock;
4268a33f6432SYan, Zheng
4269a33f6432SYan, Zheng if (S_ISDIR(inode->i_mode)) {
4270a33f6432SYan, Zheng alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
4271a33f6432SYan, Zheng if (!IS_ROOT(alias))
4272a33f6432SYan, Zheng dn = dget(alias);
4273a33f6432SYan, Zheng goto out_unlock;
4274a33f6432SYan, Zheng }
4275a33f6432SYan, Zheng
4276a33f6432SYan, Zheng hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
4277a33f6432SYan, Zheng spin_lock(&alias->d_lock);
4278a33f6432SYan, Zheng if (!d_unhashed(alias) &&
4279a33f6432SYan, Zheng (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
4280a33f6432SYan, Zheng dn = dget_dlock(alias);
4281a33f6432SYan, Zheng }
4282a33f6432SYan, Zheng spin_unlock(&alias->d_lock);
4283a33f6432SYan, Zheng if (dn)
4284a33f6432SYan, Zheng break;
4285a33f6432SYan, Zheng }
4286a33f6432SYan, Zheng out_unlock:
4287a33f6432SYan, Zheng spin_unlock(&inode->i_lock);
4288a33f6432SYan, Zheng return dn;
4289a33f6432SYan, Zheng }
4290a33f6432SYan, Zheng
42912f2dc053SSage Weil /*
42922f2dc053SSage Weil * Encode information about a cap for a reconnect with the MDS.
42932f2dc053SSage Weil */
reconnect_caps_cb(struct inode * inode,int mds,void * arg)4294aaf67de7SXiubo Li static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
42952f2dc053SSage Weil {
42962e2023e9SXiubo Li struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
429720cb34aeSSage Weil union {
429820cb34aeSSage Weil struct ceph_mds_cap_reconnect v2;
429920cb34aeSSage Weil struct ceph_mds_cap_reconnect_v1 v1;
430020cb34aeSSage Weil } rec;
4301aaf67de7SXiubo Li struct ceph_inode_info *ci = ceph_inode(inode);
430220cb34aeSSage Weil struct ceph_reconnect_state *recon_state = arg;
430320cb34aeSSage Weil struct ceph_pagelist *pagelist = recon_state->pagelist;
4304a33f6432SYan, Zheng struct dentry *dentry;
4305aaf67de7SXiubo Li struct ceph_cap *cap;
4306a33f6432SYan, Zheng char *path;
43079aaa7eb0SXiubo Li int pathlen = 0, err;
4308a33f6432SYan, Zheng u64 pathbase;
43093469ed0dSYan, Zheng u64 snap_follows;
43102f2dc053SSage Weil
4311a33f6432SYan, Zheng dentry = d_find_primary(inode);
4312a33f6432SYan, Zheng if (dentry) {
4313a33f6432SYan, Zheng /* set pathbase to parent dir when msg_version >= 2 */
43142e2023e9SXiubo Li path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
4315a33f6432SYan, Zheng recon_state->msg_version >= 2);
4316a33f6432SYan, Zheng dput(dentry);
4317a33f6432SYan, Zheng if (IS_ERR(path)) {
4318a33f6432SYan, Zheng err = PTR_ERR(path);
4319a33f6432SYan, Zheng goto out_err;
4320a33f6432SYan, Zheng }
4321a33f6432SYan, Zheng } else {
4322a33f6432SYan, Zheng path = NULL;
4323a33f6432SYan, Zheng pathbase = 0;
4324a33f6432SYan, Zheng }
4325a33f6432SYan, Zheng
4326be655596SSage Weil spin_lock(&ci->i_ceph_lock);
4327aaf67de7SXiubo Li cap = __get_cap_for_mds(ci, mds);
4328aaf67de7SXiubo Li if (!cap) {
4329aaf67de7SXiubo Li spin_unlock(&ci->i_ceph_lock);
43309aaa7eb0SXiubo Li err = 0;
4331aaf67de7SXiubo Li goto out_err;
4332aaf67de7SXiubo Li }
4333aaf67de7SXiubo Li dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
4334aaf67de7SXiubo Li inode, ceph_vinop(inode), cap, cap->cap_id,
4335aaf67de7SXiubo Li ceph_cap_string(cap->issued));
4336aaf67de7SXiubo Li
43372f2dc053SSage Weil cap->seq = 0; /* reset cap seq */
43382f2dc053SSage Weil cap->issue_seq = 0; /* and issue_seq */
4339667ca05cSYan, Zheng cap->mseq = 0; /* and migrate_seq */
434052d60f8eSJeff Layton cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
434120cb34aeSSage Weil
4342a25949b9SJeff Layton /* These are lost when the session goes away */
4343785892feSJeff Layton if (S_ISDIR(inode->i_mode)) {
4344785892feSJeff Layton if (cap->issued & CEPH_CAP_DIR_CREATE) {
4345785892feSJeff Layton ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
4346785892feSJeff Layton memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
4347785892feSJeff Layton }
4348a25949b9SJeff Layton cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
4349785892feSJeff Layton }
4350a25949b9SJeff Layton
4351121f22a1SYan, Zheng if (recon_state->msg_version >= 2) {
435220cb34aeSSage Weil rec.v2.cap_id = cpu_to_le64(cap->cap_id);
435320cb34aeSSage Weil rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
435420cb34aeSSage Weil rec.v2.issued = cpu_to_le32(cap->issued);
435520cb34aeSSage Weil rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
4356a33f6432SYan, Zheng rec.v2.pathbase = cpu_to_le64(pathbase);
4357ec1dff25SJeff Layton rec.v2.flock_len = (__force __le32)
4358ec1dff25SJeff Layton ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
435920cb34aeSSage Weil } else {
436020cb34aeSSage Weil rec.v1.cap_id = cpu_to_le64(cap->cap_id);
436120cb34aeSSage Weil rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
436220cb34aeSSage Weil rec.v1.issued = cpu_to_le32(cap->issued);
43632d6795fbSJeff Layton rec.v1.size = cpu_to_le64(i_size_read(inode));
43649bbeab41SArnd Bergmann ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
43659bbeab41SArnd Bergmann ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
436620cb34aeSSage Weil rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
4367a33f6432SYan, Zheng rec.v1.pathbase = cpu_to_le64(pathbase);
436820cb34aeSSage Weil }
43693469ed0dSYan, Zheng
43703469ed0dSYan, Zheng if (list_empty(&ci->i_cap_snaps)) {
437192776fd2SYan, Zheng snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
43723469ed0dSYan, Zheng } else {
43733469ed0dSYan, Zheng struct ceph_cap_snap *capsnap =
43743469ed0dSYan, Zheng list_first_entry(&ci->i_cap_snaps,
43753469ed0dSYan, Zheng struct ceph_cap_snap, ci_item);
43763469ed0dSYan, Zheng snap_follows = capsnap->follows;
437793cea5beSSage Weil }
4378be655596SSage Weil spin_unlock(&ci->i_ceph_lock);
43792f2dc053SSage Weil
4380121f22a1SYan, Zheng if (recon_state->msg_version >= 2) {
438140819f6fSGreg Farnum int num_fcntl_locks, num_flock_locks;
43824deb14a2SYan, Zheng struct ceph_filelock *flocks = NULL;
438381c5a148SYan, Zheng size_t struct_len, total_len = sizeof(u64);
4384121f22a1SYan, Zheng u8 struct_v = 0;
438540819f6fSGreg Farnum
438639be95e9SJim Schutt encode_again:
4387b3f8d68fSYan, Zheng if (rec.v2.flock_len) {
438839be95e9SJim Schutt ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
4389b3f8d68fSYan, Zheng } else {
4390b3f8d68fSYan, Zheng num_fcntl_locks = 0;
4391b3f8d68fSYan, Zheng num_flock_locks = 0;
4392b3f8d68fSYan, Zheng }
43934deb14a2SYan, Zheng if (num_fcntl_locks + num_flock_locks > 0) {
43946da2ec56SKees Cook flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
43956da2ec56SKees Cook sizeof(struct ceph_filelock),
43966da2ec56SKees Cook GFP_NOFS);
439739be95e9SJim Schutt if (!flocks) {
439839be95e9SJim Schutt err = -ENOMEM;
43995ccedf1cSYan, Zheng goto out_err;
440039be95e9SJim Schutt }
440139be95e9SJim Schutt err = ceph_encode_locks_to_buffer(inode, flocks,
440240819f6fSGreg Farnum num_fcntl_locks,
440340819f6fSGreg Farnum num_flock_locks);
440439be95e9SJim Schutt if (err) {
440539be95e9SJim Schutt kfree(flocks);
44064deb14a2SYan, Zheng flocks = NULL;
440739be95e9SJim Schutt if (err == -ENOSPC)
440839be95e9SJim Schutt goto encode_again;
44095ccedf1cSYan, Zheng goto out_err;
4410fca4451aSGreg Farnum }
44114deb14a2SYan, Zheng } else {
44124deb14a2SYan, Zheng kfree(flocks);
44134deb14a2SYan, Zheng flocks = NULL;
44144deb14a2SYan, Zheng }
4415121f22a1SYan, Zheng
4416121f22a1SYan, Zheng if (recon_state->msg_version >= 3) {
4417121f22a1SYan, Zheng /* version, compat_version and struct_len */
441881c5a148SYan, Zheng total_len += 2 * sizeof(u8) + sizeof(u32);
44193469ed0dSYan, Zheng struct_v = 2;
4420121f22a1SYan, Zheng }
442139be95e9SJim Schutt /*
442239be95e9SJim Schutt * number of encoded locks is stable, so copy to pagelist
442339be95e9SJim Schutt */
4424121f22a1SYan, Zheng struct_len = 2 * sizeof(u32) +
442539be95e9SJim Schutt (num_fcntl_locks + num_flock_locks) *
4426121f22a1SYan, Zheng sizeof(struct ceph_filelock);
4427121f22a1SYan, Zheng rec.v2.flock_len = cpu_to_le32(struct_len);
4428121f22a1SYan, Zheng
4429a33f6432SYan, Zheng struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
4430121f22a1SYan, Zheng
44313469ed0dSYan, Zheng if (struct_v >= 2)
44323469ed0dSYan, Zheng struct_len += sizeof(u64); /* snap_follows */
44333469ed0dSYan, Zheng
4434121f22a1SYan, Zheng total_len += struct_len;
443581c5a148SYan, Zheng
443681c5a148SYan, Zheng if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
443781c5a148SYan, Zheng err = send_reconnect_partial(recon_state);
443881c5a148SYan, Zheng if (err)
443981c5a148SYan, Zheng goto out_freeflocks;
444081c5a148SYan, Zheng pagelist = recon_state->pagelist;
44415ccedf1cSYan, Zheng }
4442121f22a1SYan, Zheng
444381c5a148SYan, Zheng err = ceph_pagelist_reserve(pagelist, total_len);
444481c5a148SYan, Zheng if (err)
444581c5a148SYan, Zheng goto out_freeflocks;
444681c5a148SYan, Zheng
444781c5a148SYan, Zheng ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
4448121f22a1SYan, Zheng if (recon_state->msg_version >= 3) {
4449121f22a1SYan, Zheng ceph_pagelist_encode_8(pagelist, struct_v);
4450121f22a1SYan, Zheng ceph_pagelist_encode_8(pagelist, 1);
4451121f22a1SYan, Zheng ceph_pagelist_encode_32(pagelist, struct_len);
4452121f22a1SYan, Zheng }
4453a33f6432SYan, Zheng ceph_pagelist_encode_string(pagelist, path, pathlen);
4454121f22a1SYan, Zheng ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
4455121f22a1SYan, Zheng ceph_locks_to_pagelist(flocks, pagelist,
44565ccedf1cSYan, Zheng num_fcntl_locks, num_flock_locks);
44573469ed0dSYan, Zheng if (struct_v >= 2)
44583469ed0dSYan, Zheng ceph_pagelist_encode_64(pagelist, snap_follows);
445981c5a148SYan, Zheng out_freeflocks:
446039be95e9SJim Schutt kfree(flocks);
44613612abbdSSage Weil } else {
44625ccedf1cSYan, Zheng err = ceph_pagelist_reserve(pagelist,
446381c5a148SYan, Zheng sizeof(u64) + sizeof(u32) +
446481c5a148SYan, Zheng pathlen + sizeof(rec.v1));
4465a33f6432SYan, Zheng if (err)
4466a33f6432SYan, Zheng goto out_err;
44675ccedf1cSYan, Zheng
446881c5a148SYan, Zheng ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
4469121f22a1SYan, Zheng ceph_pagelist_encode_string(pagelist, path, pathlen);
4470121f22a1SYan, Zheng ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
447140819f6fSGreg Farnum }
447244c99757SYan, Zheng
44735ccedf1cSYan, Zheng out_err:
4474a33f6432SYan, Zheng ceph_mdsc_free_path(path, pathlen);
4475a33f6432SYan, Zheng if (!err)
447681c5a148SYan, Zheng recon_state->nr_caps++;
447781c5a148SYan, Zheng return err;
447881c5a148SYan, Zheng }
447981c5a148SYan, Zheng
encode_snap_realms(struct ceph_mds_client * mdsc,struct ceph_reconnect_state * recon_state)448081c5a148SYan, Zheng static int encode_snap_realms(struct ceph_mds_client *mdsc,
448181c5a148SYan, Zheng struct ceph_reconnect_state *recon_state)
448281c5a148SYan, Zheng {
448381c5a148SYan, Zheng struct rb_node *p;
448481c5a148SYan, Zheng struct ceph_pagelist *pagelist = recon_state->pagelist;
448581c5a148SYan, Zheng int err = 0;
448681c5a148SYan, Zheng
448781c5a148SYan, Zheng if (recon_state->msg_version >= 4) {
448881c5a148SYan, Zheng err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
448981c5a148SYan, Zheng if (err < 0)
449081c5a148SYan, Zheng goto fail;
449181c5a148SYan, Zheng }
449281c5a148SYan, Zheng
449381c5a148SYan, Zheng /*
449481c5a148SYan, Zheng * snaprealms. we provide mds with the ino, seq (version), and
449581c5a148SYan, Zheng * parent for all of our realms. If the mds has any newer info,
449681c5a148SYan, Zheng * it will tell us.
449781c5a148SYan, Zheng */
449881c5a148SYan, Zheng for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
449981c5a148SYan, Zheng struct ceph_snap_realm *realm =
450081c5a148SYan, Zheng rb_entry(p, struct ceph_snap_realm, node);
450181c5a148SYan, Zheng struct ceph_mds_snaprealm_reconnect sr_rec;
450281c5a148SYan, Zheng
450381c5a148SYan, Zheng if (recon_state->msg_version >= 4) {
450481c5a148SYan, Zheng size_t need = sizeof(u8) * 2 + sizeof(u32) +
450581c5a148SYan, Zheng sizeof(sr_rec);
450681c5a148SYan, Zheng
450781c5a148SYan, Zheng if (pagelist->length + need > RECONNECT_MAX_SIZE) {
450881c5a148SYan, Zheng err = send_reconnect_partial(recon_state);
450981c5a148SYan, Zheng if (err)
451081c5a148SYan, Zheng goto fail;
451181c5a148SYan, Zheng pagelist = recon_state->pagelist;
451281c5a148SYan, Zheng }
451381c5a148SYan, Zheng
451481c5a148SYan, Zheng err = ceph_pagelist_reserve(pagelist, need);
451581c5a148SYan, Zheng if (err)
451681c5a148SYan, Zheng goto fail;
451781c5a148SYan, Zheng
451881c5a148SYan, Zheng ceph_pagelist_encode_8(pagelist, 1);
451981c5a148SYan, Zheng ceph_pagelist_encode_8(pagelist, 1);
452081c5a148SYan, Zheng ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
452181c5a148SYan, Zheng }
452281c5a148SYan, Zheng
452381c5a148SYan, Zheng dout(" adding snap realm %llx seq %lld parent %llx\n",
452481c5a148SYan, Zheng realm->ino, realm->seq, realm->parent_ino);
452581c5a148SYan, Zheng sr_rec.ino = cpu_to_le64(realm->ino);
452681c5a148SYan, Zheng sr_rec.seq = cpu_to_le64(realm->seq);
452781c5a148SYan, Zheng sr_rec.parent = cpu_to_le64(realm->parent_ino);
452881c5a148SYan, Zheng
452981c5a148SYan, Zheng err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
453081c5a148SYan, Zheng if (err)
453181c5a148SYan, Zheng goto fail;
453281c5a148SYan, Zheng
453381c5a148SYan, Zheng recon_state->nr_realms++;
453481c5a148SYan, Zheng }
453581c5a148SYan, Zheng fail:
453693cea5beSSage Weil return err;
45372f2dc053SSage Weil }
45382f2dc053SSage Weil
45392f2dc053SSage Weil
45402f2dc053SSage Weil /*
45412f2dc053SSage Weil * If an MDS fails and recovers, clients need to reconnect in order to
45422f2dc053SSage Weil * reestablish shared state. This includes all caps issued through
45432f2dc053SSage Weil * this session _and_ the snap_realm hierarchy. Because it's not
45442f2dc053SSage Weil * clear which snap realms the mds cares about, we send everything we
45452f2dc053SSage Weil * know about.. that ensures we'll then get any new info the
45462f2dc053SSage Weil * recovering MDS might have.
45472f2dc053SSage Weil *
45482f2dc053SSage Weil * This is a relatively heavyweight operation, but it's rare.
45492f2dc053SSage Weil */
send_mds_reconnect(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)455034b6c855SSage Weil static void send_mds_reconnect(struct ceph_mds_client *mdsc,
455134b6c855SSage Weil struct ceph_mds_session *session)
45522f2dc053SSage Weil {
45532f2dc053SSage Weil struct ceph_msg *reply;
455434b6c855SSage Weil int mds = session->s_mds;
45559abf82b8SSage Weil int err = -ENOMEM;
455681c5a148SYan, Zheng struct ceph_reconnect_state recon_state = {
455781c5a148SYan, Zheng .session = session,
455881c5a148SYan, Zheng };
4559c8a96a31SJeff Layton LIST_HEAD(dispose);
45602f2dc053SSage Weil
456134b6c855SSage Weil pr_info("mds%d reconnect start\n", mds);
45622f2dc053SSage Weil
456381c5a148SYan, Zheng recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
456481c5a148SYan, Zheng if (!recon_state.pagelist)
456593cea5beSSage Weil goto fail_nopagelist;
456693cea5beSSage Weil
45670d9c1ab3SIlya Dryomov reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
4568a79832f2SSage Weil if (!reply)
456993cea5beSSage Weil goto fail_nomsg;
457093cea5beSSage Weil
4571d4846487SJeff Layton xa_destroy(&session->s_delegated_inos);
4572d4846487SJeff Layton
45732f2dc053SSage Weil mutex_lock(&session->s_mutex);
45742f2dc053SSage Weil session->s_state = CEPH_MDS_SESSION_RECONNECTING;
45752f2dc053SSage Weil session->s_seq = 0;
45762f2dc053SSage Weil
45772f2dc053SSage Weil dout("session %p state %s\n", session,
4578a687ecafSJohn Spray ceph_session_state_name(session->s_state));
45792f2dc053SSage Weil
458052d60f8eSJeff Layton atomic_inc(&session->s_cap_gen);
458199a9c273SYan, Zheng
458299a9c273SYan, Zheng spin_lock(&session->s_cap_lock);
458303f4fcb0SYan, Zheng /* don't know if session is readonly */
458403f4fcb0SYan, Zheng session->s_readonly = 0;
458599a9c273SYan, Zheng /*
458699a9c273SYan, Zheng * notify __ceph_remove_cap() that we are composing cap reconnect.
458799a9c273SYan, Zheng * If a cap get released before being added to the cap reconnect,
458899a9c273SYan, Zheng * __ceph_remove_cap() should skip queuing cap release.
458999a9c273SYan, Zheng */
459099a9c273SYan, Zheng session->s_cap_reconnect = 1;
4591e01a5946SSage Weil /* drop old cap expires; we're about to reestablish that state */
4592c8a96a31SJeff Layton detach_cap_releases(session, &dispose);
4593c8a96a31SJeff Layton spin_unlock(&session->s_cap_lock);
4594c8a96a31SJeff Layton dispose_cap_releases(mdsc, &dispose);
4595e01a5946SSage Weil
45965d23371fSYan, Zheng /* trim unused caps to reduce MDS's cache rejoin time */
4597c0bd50e2SYan, Zheng if (mdsc->fsc->sb->s_root)
45985d23371fSYan, Zheng shrink_dcache_parent(mdsc->fsc->sb->s_root);
45995d23371fSYan, Zheng
46005d23371fSYan, Zheng ceph_con_close(&session->s_con);
46015d23371fSYan, Zheng ceph_con_open(&session->s_con,
46025d23371fSYan, Zheng CEPH_ENTITY_TYPE_MDS, mds,
46035d23371fSYan, Zheng ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
46045d23371fSYan, Zheng
46055d23371fSYan, Zheng /* replay unsafe requests */
46065d23371fSYan, Zheng replay_unsafe_requests(mdsc, session);
46075d23371fSYan, Zheng
460881c5a148SYan, Zheng ceph_early_kick_flushing_caps(mdsc, session);
460981c5a148SYan, Zheng
46105d23371fSYan, Zheng down_read(&mdsc->snap_rwsem);
46115d23371fSYan, Zheng
461281c5a148SYan, Zheng /* placeholder for nr_caps */
461381c5a148SYan, Zheng err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
461493cea5beSSage Weil if (err)
461593cea5beSSage Weil goto fail;
461620cb34aeSSage Weil
461781c5a148SYan, Zheng if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
4618121f22a1SYan, Zheng recon_state.msg_version = 3;
461981c5a148SYan, Zheng recon_state.allow_multi = true;
462081c5a148SYan, Zheng } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
462181c5a148SYan, Zheng recon_state.msg_version = 3;
462281c5a148SYan, Zheng } else {
462323c625ceSIlya Dryomov recon_state.msg_version = 2;
462481c5a148SYan, Zheng }
462581c5a148SYan, Zheng /* trsaverse this session's caps */
4626a25949b9SJeff Layton err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
46272f2dc053SSage Weil
462899a9c273SYan, Zheng spin_lock(&session->s_cap_lock);
462999a9c273SYan, Zheng session->s_cap_reconnect = 0;
463099a9c273SYan, Zheng spin_unlock(&session->s_cap_lock);
463199a9c273SYan, Zheng
463281c5a148SYan, Zheng if (err < 0)
463381c5a148SYan, Zheng goto fail;
46342f2dc053SSage Weil
463581c5a148SYan, Zheng /* check if all realms can be encoded into current message */
463681c5a148SYan, Zheng if (mdsc->num_snap_realms) {
463781c5a148SYan, Zheng size_t total_len =
463881c5a148SYan, Zheng recon_state.pagelist->length +
463981c5a148SYan, Zheng mdsc->num_snap_realms *
464081c5a148SYan, Zheng sizeof(struct ceph_mds_snaprealm_reconnect);
464181c5a148SYan, Zheng if (recon_state.msg_version >= 4) {
464281c5a148SYan, Zheng /* number of realms */
464381c5a148SYan, Zheng total_len += sizeof(u32);
464481c5a148SYan, Zheng /* version, compat_version and struct_len */
464581c5a148SYan, Zheng total_len += mdsc->num_snap_realms *
464681c5a148SYan, Zheng (2 * sizeof(u8) + sizeof(u32));
464781c5a148SYan, Zheng }
464881c5a148SYan, Zheng if (total_len > RECONNECT_MAX_SIZE) {
464981c5a148SYan, Zheng if (!recon_state.allow_multi) {
465081c5a148SYan, Zheng err = -ENOSPC;
465181c5a148SYan, Zheng goto fail;
465281c5a148SYan, Zheng }
465381c5a148SYan, Zheng if (recon_state.nr_caps) {
465481c5a148SYan, Zheng err = send_reconnect_partial(&recon_state);
465593cea5beSSage Weil if (err)
465693cea5beSSage Weil goto fail;
46572f2dc053SSage Weil }
465881c5a148SYan, Zheng recon_state.msg_version = 5;
465981c5a148SYan, Zheng }
466081c5a148SYan, Zheng }
46612f2dc053SSage Weil
466281c5a148SYan, Zheng err = encode_snap_realms(mdsc, &recon_state);
466381c5a148SYan, Zheng if (err < 0)
466481c5a148SYan, Zheng goto fail;
466544c99757SYan, Zheng
466681c5a148SYan, Zheng if (recon_state.msg_version >= 5) {
466781c5a148SYan, Zheng err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
466881c5a148SYan, Zheng if (err < 0)
466981c5a148SYan, Zheng goto fail;
467081c5a148SYan, Zheng }
467181c5a148SYan, Zheng
467281c5a148SYan, Zheng if (recon_state.nr_caps || recon_state.nr_realms) {
467381c5a148SYan, Zheng struct page *page =
467481c5a148SYan, Zheng list_first_entry(&recon_state.pagelist->head,
467544c99757SYan, Zheng struct page, lru);
467644c99757SYan, Zheng __le32 *addr = kmap_atomic(page);
467781c5a148SYan, Zheng if (recon_state.nr_caps) {
467881c5a148SYan, Zheng WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
467944c99757SYan, Zheng *addr = cpu_to_le32(recon_state.nr_caps);
468081c5a148SYan, Zheng } else if (recon_state.msg_version >= 4) {
468181c5a148SYan, Zheng *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
468281c5a148SYan, Zheng }
468344c99757SYan, Zheng kunmap_atomic(addr);
468444c99757SYan, Zheng }
468544c99757SYan, Zheng
468681c5a148SYan, Zheng reply->hdr.version = cpu_to_le16(recon_state.msg_version);
468781c5a148SYan, Zheng if (recon_state.msg_version >= 4)
468881c5a148SYan, Zheng reply->hdr.compat_version = cpu_to_le16(4);
4689e548e9b9SYan, Zheng
469081c5a148SYan, Zheng reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
469181c5a148SYan, Zheng ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
4692e548e9b9SYan, Zheng
46932f2dc053SSage Weil ceph_con_send(&session->s_con, reply);
46942f2dc053SSage Weil
46952f2dc053SSage Weil mutex_unlock(&session->s_mutex);
46969abf82b8SSage Weil
46979abf82b8SSage Weil mutex_lock(&mdsc->mutex);
46989abf82b8SSage Weil __wake_requests(mdsc, &session->s_waiting);
46999abf82b8SSage Weil mutex_unlock(&mdsc->mutex);
47009abf82b8SSage Weil
47019abf82b8SSage Weil up_read(&mdsc->snap_rwsem);
470281c5a148SYan, Zheng ceph_pagelist_release(recon_state.pagelist);
47032f2dc053SSage Weil return;
47042f2dc053SSage Weil
470593cea5beSSage Weil fail:
47062f2dc053SSage Weil ceph_msg_put(reply);
47079abf82b8SSage Weil up_read(&mdsc->snap_rwsem);
47089abf82b8SSage Weil mutex_unlock(&session->s_mutex);
470993cea5beSSage Weil fail_nomsg:
471081c5a148SYan, Zheng ceph_pagelist_release(recon_state.pagelist);
471193cea5beSSage Weil fail_nopagelist:
47129abf82b8SSage Weil pr_err("error %d preparing reconnect for mds%d\n", err, mds);
47139abf82b8SSage Weil return;
47142f2dc053SSage Weil }
47152f2dc053SSage Weil
47162f2dc053SSage Weil
47172f2dc053SSage Weil /*
47182f2dc053SSage Weil * compare old and new mdsmaps, kicking requests
47192f2dc053SSage Weil * and closing out old connections as necessary
47202f2dc053SSage Weil *
47212f2dc053SSage Weil * called under mdsc->mutex.
47222f2dc053SSage Weil */
check_new_map(struct ceph_mds_client * mdsc,struct ceph_mdsmap * newmap,struct ceph_mdsmap * oldmap)47232f2dc053SSage Weil static void check_new_map(struct ceph_mds_client *mdsc,
47242f2dc053SSage Weil struct ceph_mdsmap *newmap,
47252f2dc053SSage Weil struct ceph_mdsmap *oldmap)
47262f2dc053SSage Weil {
4727d517b398SXiubo Li int i, j, err;
47282f2dc053SSage Weil int oldstate, newstate;
47292f2dc053SSage Weil struct ceph_mds_session *s;
4730d517b398SXiubo Li unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
47312f2dc053SSage Weil
47322f2dc053SSage Weil dout("check_new_map new %u old %u\n",
47332f2dc053SSage Weil newmap->m_epoch, oldmap->m_epoch);
47342f2dc053SSage Weil
4735d517b398SXiubo Li if (newmap->m_info) {
4736d517b398SXiubo Li for (i = 0; i < newmap->possible_max_rank; i++) {
4737d517b398SXiubo Li for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
4738d517b398SXiubo Li set_bit(newmap->m_info[i].export_targets[j], targets);
4739d517b398SXiubo Li }
4740d517b398SXiubo Li }
4741d517b398SXiubo Li
4742b38c9eb4SXiubo Li for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
4743d37b1d99SMarkus Elfring if (!mdsc->sessions[i])
47442f2dc053SSage Weil continue;
47452f2dc053SSage Weil s = mdsc->sessions[i];
47462f2dc053SSage Weil oldstate = ceph_mdsmap_get_state(oldmap, i);
47472f2dc053SSage Weil newstate = ceph_mdsmap_get_state(newmap, i);
47482f2dc053SSage Weil
47490deb01c9SSage Weil dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
47502f2dc053SSage Weil i, ceph_mds_state_name(oldstate),
47510deb01c9SSage Weil ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
47522f2dc053SSage Weil ceph_mds_state_name(newstate),
47530deb01c9SSage Weil ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
4754a687ecafSJohn Spray ceph_session_state_name(s->s_state));
47552f2dc053SSage Weil
4756b38c9eb4SXiubo Li if (i >= newmap->possible_max_rank) {
47572827528dSYan, Zheng /* force close session for stopped mds */
47585b3248c6SXiubo Li ceph_get_mds_session(s);
47592827528dSYan, Zheng __unregister_session(mdsc, s);
47602827528dSYan, Zheng __wake_requests(mdsc, &s->s_waiting);
47612827528dSYan, Zheng mutex_unlock(&mdsc->mutex);
47622827528dSYan, Zheng
47632827528dSYan, Zheng mutex_lock(&s->s_mutex);
47642827528dSYan, Zheng cleanup_session_requests(mdsc, s);
47652827528dSYan, Zheng remove_session_caps(s);
47662827528dSYan, Zheng mutex_unlock(&s->s_mutex);
47672827528dSYan, Zheng
47682827528dSYan, Zheng ceph_put_mds_session(s);
47692827528dSYan, Zheng
47702827528dSYan, Zheng mutex_lock(&mdsc->mutex);
47716f0f597bSYan, Zheng kick_requests(mdsc, i);
47726f0f597bSYan, Zheng continue;
47736f0f597bSYan, Zheng }
47746f0f597bSYan, Zheng
47756f0f597bSYan, Zheng if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
47766f0f597bSYan, Zheng ceph_mdsmap_get_addr(newmap, i),
47776f0f597bSYan, Zheng sizeof(struct ceph_entity_addr))) {
47782f2dc053SSage Weil /* just close it */
47792f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
47802f2dc053SSage Weil mutex_lock(&s->s_mutex);
47812f2dc053SSage Weil mutex_lock(&mdsc->mutex);
47822f2dc053SSage Weil ceph_con_close(&s->s_con);
47832f2dc053SSage Weil mutex_unlock(&s->s_mutex);
47842f2dc053SSage Weil s->s_state = CEPH_MDS_SESSION_RESTARTING;
47852f2dc053SSage Weil } else if (oldstate == newstate) {
47862f2dc053SSage Weil continue; /* nothing new with this mds */
47872f2dc053SSage Weil }
47882f2dc053SSage Weil
47892f2dc053SSage Weil /*
47902f2dc053SSage Weil * send reconnect?
47912f2dc053SSage Weil */
47922f2dc053SSage Weil if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
479334b6c855SSage Weil newstate >= CEPH_MDS_STATE_RECONNECT) {
479434b6c855SSage Weil mutex_unlock(&mdsc->mutex);
4795d517b398SXiubo Li clear_bit(i, targets);
479634b6c855SSage Weil send_mds_reconnect(mdsc, s);
479734b6c855SSage Weil mutex_lock(&mdsc->mutex);
479834b6c855SSage Weil }
47992f2dc053SSage Weil
48002f2dc053SSage Weil /*
480129790f26SSage Weil * kick request on any mds that has gone active.
48022f2dc053SSage Weil */
48032f2dc053SSage Weil if (oldstate < CEPH_MDS_STATE_ACTIVE &&
48042f2dc053SSage Weil newstate >= CEPH_MDS_STATE_ACTIVE) {
480529790f26SSage Weil if (oldstate != CEPH_MDS_STATE_CREATING &&
480629790f26SSage Weil oldstate != CEPH_MDS_STATE_STARTING)
480729790f26SSage Weil pr_info("mds%d recovery completed\n", s->s_mds);
480829790f26SSage Weil kick_requests(mdsc, i);
4809ea8412b2SXiubo Li mutex_unlock(&mdsc->mutex);
4810829ad4dbSJeff Layton mutex_lock(&s->s_mutex);
4811ea8412b2SXiubo Li mutex_lock(&mdsc->mutex);
48122f2dc053SSage Weil ceph_kick_flushing_caps(mdsc, s);
4813829ad4dbSJeff Layton mutex_unlock(&s->s_mutex);
4814d2f8bb27SYan, Zheng wake_up_session_caps(s, RECONNECT);
48152f2dc053SSage Weil }
48162f2dc053SSage Weil }
4817cb170a22SSage Weil
4818d517b398SXiubo Li /*
4819d517b398SXiubo Li * Only open and reconnect sessions that don't exist yet.
4820d517b398SXiubo Li */
4821d517b398SXiubo Li for (i = 0; i < newmap->possible_max_rank; i++) {
4822d517b398SXiubo Li /*
4823d517b398SXiubo Li * In case the import MDS is crashed just after
4824d517b398SXiubo Li * the EImportStart journal is flushed, so when
4825d517b398SXiubo Li * a standby MDS takes over it and is replaying
4826d517b398SXiubo Li * the EImportStart journal the new MDS daemon
4827d517b398SXiubo Li * will wait the client to reconnect it, but the
4828d517b398SXiubo Li * client may never register/open the session yet.
4829d517b398SXiubo Li *
4830d517b398SXiubo Li * Will try to reconnect that MDS daemon if the
4831d517b398SXiubo Li * rank number is in the export targets array and
4832d517b398SXiubo Li * is the up:reconnect state.
4833d517b398SXiubo Li */
4834d517b398SXiubo Li newstate = ceph_mdsmap_get_state(newmap, i);
4835d517b398SXiubo Li if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
4836d517b398SXiubo Li continue;
4837d517b398SXiubo Li
4838d517b398SXiubo Li /*
4839d517b398SXiubo Li * The session maybe registered and opened by some
4840d517b398SXiubo Li * requests which were choosing random MDSes during
4841d517b398SXiubo Li * the mdsc->mutex's unlock/lock gap below in rare
4842d517b398SXiubo Li * case. But the related MDS daemon will just queue
4843d517b398SXiubo Li * that requests and be still waiting for the client's
4844d517b398SXiubo Li * reconnection request in up:reconnect state.
4845d517b398SXiubo Li */
4846d517b398SXiubo Li s = __ceph_lookup_mds_session(mdsc, i);
4847d517b398SXiubo Li if (likely(!s)) {
4848d517b398SXiubo Li s = __open_export_target_session(mdsc, i);
4849d517b398SXiubo Li if (IS_ERR(s)) {
4850d517b398SXiubo Li err = PTR_ERR(s);
4851d517b398SXiubo Li pr_err("failed to open export target session, err %d\n",
4852d517b398SXiubo Li err);
4853d517b398SXiubo Li continue;
4854d517b398SXiubo Li }
4855d517b398SXiubo Li }
4856d517b398SXiubo Li dout("send reconnect to export target mds.%d\n", i);
4857d517b398SXiubo Li mutex_unlock(&mdsc->mutex);
4858d517b398SXiubo Li send_mds_reconnect(mdsc, s);
4859d517b398SXiubo Li ceph_put_mds_session(s);
4860d517b398SXiubo Li mutex_lock(&mdsc->mutex);
4861d517b398SXiubo Li }
4862d517b398SXiubo Li
4863b38c9eb4SXiubo Li for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
4864cb170a22SSage Weil s = mdsc->sessions[i];
4865cb170a22SSage Weil if (!s)
4866cb170a22SSage Weil continue;
4867cb170a22SSage Weil if (!ceph_mdsmap_is_laggy(newmap, i))
4868cb170a22SSage Weil continue;
4869cb170a22SSage Weil if (s->s_state == CEPH_MDS_SESSION_OPEN ||
4870cb170a22SSage Weil s->s_state == CEPH_MDS_SESSION_HUNG ||
4871cb170a22SSage Weil s->s_state == CEPH_MDS_SESSION_CLOSING) {
4872cb170a22SSage Weil dout(" connecting to export targets of laggy mds%d\n",
4873cb170a22SSage Weil i);
4874cb170a22SSage Weil __open_export_target_sessions(mdsc, s);
4875cb170a22SSage Weil }
4876cb170a22SSage Weil }
48772f2dc053SSage Weil }
48782f2dc053SSage Weil
48792f2dc053SSage Weil
48802f2dc053SSage Weil
48812f2dc053SSage Weil /*
48822f2dc053SSage Weil * leases
48832f2dc053SSage Weil */
48842f2dc053SSage Weil
48852f2dc053SSage Weil /*
48862f2dc053SSage Weil * caller must hold session s_mutex, dentry->d_lock
48872f2dc053SSage Weil */
__ceph_mdsc_drop_dentry_lease(struct dentry * dentry)48882f2dc053SSage Weil void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
48892f2dc053SSage Weil {
48902f2dc053SSage Weil struct ceph_dentry_info *di = ceph_dentry(dentry);
48912f2dc053SSage Weil
48922f2dc053SSage Weil ceph_put_mds_session(di->lease_session);
48932f2dc053SSage Weil di->lease_session = NULL;
48942f2dc053SSage Weil }
48952f2dc053SSage Weil
handle_lease(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,struct ceph_msg * msg)48962600d2ddSSage Weil static void handle_lease(struct ceph_mds_client *mdsc,
48972600d2ddSSage Weil struct ceph_mds_session *session,
48982600d2ddSSage Weil struct ceph_msg *msg)
48992f2dc053SSage Weil {
49003d14c5d2SYehuda Sadeh struct super_block *sb = mdsc->fsc->sb;
49012f2dc053SSage Weil struct inode *inode;
49022f2dc053SSage Weil struct dentry *parent, *dentry;
49032f2dc053SSage Weil struct ceph_dentry_info *di;
49042600d2ddSSage Weil int mds = session->s_mds;
49052f2dc053SSage Weil struct ceph_mds_lease *h = msg->front.iov_base;
49061e5ea23dSSage Weil u32 seq;
49072f2dc053SSage Weil struct ceph_vino vino;
49082f2dc053SSage Weil struct qstr dname;
49092f2dc053SSage Weil int release = 0;
49102f2dc053SSage Weil
49112f2dc053SSage Weil dout("handle_lease from mds%d\n", mds);
49122f2dc053SSage Weil
4913e3dfcab2SXiubo Li if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4914e3dfcab2SXiubo Li return;
4915e3dfcab2SXiubo Li
49162f2dc053SSage Weil /* decode */
49172f2dc053SSage Weil if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
49182f2dc053SSage Weil goto bad;
49192f2dc053SSage Weil vino.ino = le64_to_cpu(h->ino);
49202f2dc053SSage Weil vino.snap = CEPH_NOSNAP;
49211e5ea23dSSage Weil seq = le32_to_cpu(h->seq);
49220fcf6c02SYan, Zheng dname.len = get_unaligned_le32(h + 1);
49230fcf6c02SYan, Zheng if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
49242f2dc053SSage Weil goto bad;
49250fcf6c02SYan, Zheng dname.name = (void *)(h + 1) + sizeof(u32);
49262f2dc053SSage Weil
49272f2dc053SSage Weil /* lookup inode */
49282f2dc053SSage Weil inode = ceph_find_inode(sb, vino);
49292f90b852SSage Weil dout("handle_lease %s, ino %llx %p %.*s\n",
49302f90b852SSage Weil ceph_lease_op_name(h->action), vino.ino, inode,
49311e5ea23dSSage Weil dname.len, dname.name);
49326cd3bcadSYan, Zheng
49336cd3bcadSYan, Zheng mutex_lock(&session->s_mutex);
4934d37b1d99SMarkus Elfring if (!inode) {
49352f2dc053SSage Weil dout("handle_lease no inode %llx\n", vino.ino);
49362f2dc053SSage Weil goto release;
49372f2dc053SSage Weil }
49382f2dc053SSage Weil
49392f2dc053SSage Weil /* dentry */
49402f2dc053SSage Weil parent = d_find_alias(inode);
49412f2dc053SSage Weil if (!parent) {
49422f2dc053SSage Weil dout("no parent dentry on inode %p\n", inode);
49432f2dc053SSage Weil WARN_ON(1);
49442f2dc053SSage Weil goto release; /* hrm... */
49452f2dc053SSage Weil }
49468387ff25SLinus Torvalds dname.hash = full_name_hash(parent, dname.name, dname.len);
49472f2dc053SSage Weil dentry = d_lookup(parent, &dname);
49482f2dc053SSage Weil dput(parent);
49492f2dc053SSage Weil if (!dentry)
49502f2dc053SSage Weil goto release;
49512f2dc053SSage Weil
49522f2dc053SSage Weil spin_lock(&dentry->d_lock);
49532f2dc053SSage Weil di = ceph_dentry(dentry);
49542f2dc053SSage Weil switch (h->action) {
49552f2dc053SSage Weil case CEPH_MDS_LEASE_REVOKE:
49563d8eb7a9SSage Weil if (di->lease_session == session) {
49571e5ea23dSSage Weil if (ceph_seq_cmp(di->lease_seq, seq) > 0)
49582f2dc053SSage Weil h->seq = cpu_to_le32(di->lease_seq);
49592f2dc053SSage Weil __ceph_mdsc_drop_dentry_lease(dentry);
49602f2dc053SSage Weil }
49612f2dc053SSage Weil release = 1;
49622f2dc053SSage Weil break;
49632f2dc053SSage Weil
49642f2dc053SSage Weil case CEPH_MDS_LEASE_RENEW:
49653d8eb7a9SSage Weil if (di->lease_session == session &&
496652d60f8eSJeff Layton di->lease_gen == atomic_read(&session->s_cap_gen) &&
49672f2dc053SSage Weil di->lease_renew_from &&
49682f2dc053SSage Weil di->lease_renew_after == 0) {
49692f2dc053SSage Weil unsigned long duration =
49703563dbddSNicholas Mc Guire msecs_to_jiffies(le32_to_cpu(h->duration_ms));
49712f2dc053SSage Weil
49721e5ea23dSSage Weil di->lease_seq = seq;
49739b16f03cSMiklos Szeredi di->time = di->lease_renew_from + duration;
49742f2dc053SSage Weil di->lease_renew_after = di->lease_renew_from +
49752f2dc053SSage Weil (duration >> 1);
49762f2dc053SSage Weil di->lease_renew_from = 0;
49772f2dc053SSage Weil }
49782f2dc053SSage Weil break;
49792f2dc053SSage Weil }
49802f2dc053SSage Weil spin_unlock(&dentry->d_lock);
49812f2dc053SSage Weil dput(dentry);
49822f2dc053SSage Weil
49832f2dc053SSage Weil if (!release)
49842f2dc053SSage Weil goto out;
49852f2dc053SSage Weil
49862f2dc053SSage Weil release:
49872f2dc053SSage Weil /* let's just reuse the same message */
49882f2dc053SSage Weil h->action = CEPH_MDS_LEASE_REVOKE_ACK;
49892f2dc053SSage Weil ceph_msg_get(msg);
49902f2dc053SSage Weil ceph_con_send(&session->s_con, msg);
49912f2dc053SSage Weil
49922f2dc053SSage Weil out:
49932f2dc053SSage Weil mutex_unlock(&session->s_mutex);
499423c2c76eSJeff Layton iput(inode);
4995e3dfcab2SXiubo Li
4996e3dfcab2SXiubo Li ceph_dec_mds_stopping_blocker(mdsc);
49972f2dc053SSage Weil return;
49982f2dc053SSage Weil
49992f2dc053SSage Weil bad:
5000e3dfcab2SXiubo Li ceph_dec_mds_stopping_blocker(mdsc);
5001e3dfcab2SXiubo Li
50022f2dc053SSage Weil pr_err("corrupt lease message\n");
50039ec7cab1SSage Weil ceph_msg_dump(msg);
50042f2dc053SSage Weil }
50052f2dc053SSage Weil
/*
 * Build a CEPH_MSG_CLIENT_LEASE message for @dentry and queue it on the
 * session's connection.
 *
 * The front holds a struct ceph_mds_lease filled in with @action, @seq
 * and the parent directory's ino and snap, followed by the dentry name
 * length (32-bit little-endian) and the name bytes.  The parent and the
 * name are read under dentry->d_lock.
 *
 * If the message cannot be allocated the function returns without
 * sending anything.
 */
void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
			      struct dentry *dentry, char action,
			      u32 seq)
{
	struct ceph_mds_lease *lease;
	struct ceph_msg *msg;
	struct inode *dir;
	void *tail;
	/* room for the header, the length word and a maximum-length name */
	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;

	dout("lease_send_msg identry %p %s to mds%d\n",
	     dentry, ceph_lease_op_name(action), session->s_mds);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
	if (!msg)
		return;

	lease = msg->front.iov_base;
	lease->action = action;
	lease->seq = cpu_to_le32(seq);
	/* the name length and bytes go directly after the fixed header */
	tail = lease + 1;

	spin_lock(&dentry->d_lock);
	dir = d_inode(dentry->d_parent);
	lease->ino = cpu_to_le64(ceph_ino(dir));
	lease->first = lease->last = cpu_to_le64(ceph_snap(dir));

	put_unaligned_le32(dentry->d_name.len, tail);
	memcpy(tail + sizeof(u32), dentry->d_name.name, dentry->d_name.len);
	spin_unlock(&dentry->d_lock);

	ceph_con_send(&session->s_con, msg);
}
50372f2dc053SSage Weil
50382f2dc053SSage Weil /*
503959b312f3SXiubo Li * lock unlock the session, to wait ongoing session activities
50402f2dc053SSage Weil */
lock_unlock_session(struct ceph_mds_session * s)504159b312f3SXiubo Li static void lock_unlock_session(struct ceph_mds_session *s)
50422f2dc053SSage Weil {
50432f2dc053SSage Weil mutex_lock(&s->s_mutex);
50442f2dc053SSage Weil mutex_unlock(&s->s_mutex);
50452f2dc053SSage Weil }
50462f2dc053SSage Weil
maybe_recover_session(struct ceph_mds_client * mdsc)5047131d7eb4SYan, Zheng static void maybe_recover_session(struct ceph_mds_client *mdsc)
5048131d7eb4SYan, Zheng {
5049131d7eb4SYan, Zheng struct ceph_fs_client *fsc = mdsc->fsc;
50502f2dc053SSage Weil
5051131d7eb4SYan, Zheng if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
5052131d7eb4SYan, Zheng return;
5053131d7eb4SYan, Zheng
5054131d7eb4SYan, Zheng if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
5055131d7eb4SYan, Zheng return;
5056131d7eb4SYan, Zheng
50570b98acd6SIlya Dryomov if (!READ_ONCE(fsc->blocklisted))
5058131d7eb4SYan, Zheng return;
5059131d7eb4SYan, Zheng
50600b98acd6SIlya Dryomov pr_info("auto reconnect after blocklisted\n");
5061131d7eb4SYan, Zheng ceph_force_reconnect(fsc->sb);
5062131d7eb4SYan, Zheng }
50632f2dc053SSage Weil
check_session_state(struct ceph_mds_session * s)50643e699bd8SXiubo Li bool check_session_state(struct ceph_mds_session *s)
50653e699bd8SXiubo Li {
506662575e27SJeff Layton switch (s->s_state) {
506762575e27SJeff Layton case CEPH_MDS_SESSION_OPEN:
50683e699bd8SXiubo Li if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
50693e699bd8SXiubo Li s->s_state = CEPH_MDS_SESSION_HUNG;
50703e699bd8SXiubo Li pr_info("mds%d hung\n", s->s_mds);
50713e699bd8SXiubo Li }
507262575e27SJeff Layton break;
507362575e27SJeff Layton case CEPH_MDS_SESSION_CLOSING:
507462575e27SJeff Layton case CEPH_MDS_SESSION_NEW:
507562575e27SJeff Layton case CEPH_MDS_SESSION_RESTARTING:
507662575e27SJeff Layton case CEPH_MDS_SESSION_CLOSED:
507762575e27SJeff Layton case CEPH_MDS_SESSION_REJECTED:
50783e699bd8SXiubo Li return false;
507962575e27SJeff Layton }
50803e699bd8SXiubo Li
50813e699bd8SXiubo Li return true;
50823e699bd8SXiubo Li }
50833e699bd8SXiubo Li
50842f2dc053SSage Weil /*
508562575e27SJeff Layton * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
508662575e27SJeff Layton * then we need to retransmit that request.
508762575e27SJeff Layton */
inc_session_sequence(struct ceph_mds_session * s)508862575e27SJeff Layton void inc_session_sequence(struct ceph_mds_session *s)
508962575e27SJeff Layton {
509062575e27SJeff Layton lockdep_assert_held(&s->s_mutex);
509162575e27SJeff Layton
509262575e27SJeff Layton s->s_seq++;
509362575e27SJeff Layton
509462575e27SJeff Layton if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
509562575e27SJeff Layton int ret;
509662575e27SJeff Layton
509762575e27SJeff Layton dout("resending session close request for mds%d\n", s->s_mds);
509862575e27SJeff Layton ret = request_close_session(s);
509962575e27SJeff Layton if (ret < 0)
510062575e27SJeff Layton pr_err("unable to close session to mds%d: %d\n",
510162575e27SJeff Layton s->s_mds, ret);
510262575e27SJeff Layton }
510362575e27SJeff Layton }
510462575e27SJeff Layton
510562575e27SJeff Layton /*
5106bf2ba432SLuis Henriques * delayed work -- periodically trim expired leases, renew caps with mds. If
5107bf2ba432SLuis Henriques * the @delay parameter is set to 0 or if it's more than 5 secs, the default
5108bf2ba432SLuis Henriques * workqueue delay value of 5 secs will be used.
51092f2dc053SSage Weil */
schedule_delayed(struct ceph_mds_client * mdsc,unsigned long delay)5110bf2ba432SLuis Henriques static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
51112f2dc053SSage Weil {
5112bf2ba432SLuis Henriques unsigned long max_delay = HZ * 5;
5113bf2ba432SLuis Henriques
5114bf2ba432SLuis Henriques /* 5 secs default delay */
5115bf2ba432SLuis Henriques if (!delay || (delay > max_delay))
5116bf2ba432SLuis Henriques delay = max_delay;
5117bf2ba432SLuis Henriques schedule_delayed_work(&mdsc->delayed_work,
5118bf2ba432SLuis Henriques round_jiffies_relative(delay));
51192f2dc053SSage Weil }
51202f2dc053SSage Weil
delayed_work(struct work_struct * work)51212f2dc053SSage Weil static void delayed_work(struct work_struct *work)
51222f2dc053SSage Weil {
51232f2dc053SSage Weil struct ceph_mds_client *mdsc =
51242f2dc053SSage Weil container_of(work, struct ceph_mds_client, delayed_work.work);
5125bf2ba432SLuis Henriques unsigned long delay;
51262f2dc053SSage Weil int renew_interval;
51272f2dc053SSage Weil int renew_caps;
5128bf2ba432SLuis Henriques int i;
51292f2dc053SSage Weil
51302f2dc053SSage Weil dout("mdsc delayed_work\n");
513175c9627eSYan, Zheng
5132e7e607bdSXiubo Li if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
5133fa996773SXiubo Li return;
5134fa996773SXiubo Li
51352f2dc053SSage Weil mutex_lock(&mdsc->mutex);
51362f2dc053SSage Weil renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
51372f2dc053SSage Weil renew_caps = time_after_eq(jiffies, HZ*renew_interval +
51382f2dc053SSage Weil mdsc->last_renew_caps);
51392f2dc053SSage Weil if (renew_caps)
51402f2dc053SSage Weil mdsc->last_renew_caps = jiffies;
51412f2dc053SSage Weil
51422f2dc053SSage Weil for (i = 0; i < mdsc->max_sessions; i++) {
51432f2dc053SSage Weil struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
5144d37b1d99SMarkus Elfring if (!s)
51452f2dc053SSage Weil continue;
51463e699bd8SXiubo Li
51473e699bd8SXiubo Li if (!check_session_state(s)) {
51482f2dc053SSage Weil ceph_put_mds_session(s);
51492f2dc053SSage Weil continue;
51502f2dc053SSage Weil }
51512f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
51522f2dc053SSage Weil
51532f2dc053SSage Weil mutex_lock(&s->s_mutex);
51542f2dc053SSage Weil if (renew_caps)
51552f2dc053SSage Weil send_renew_caps(mdsc, s);
51562f2dc053SSage Weil else
51572f2dc053SSage Weil ceph_con_keepalive(&s->s_con);
5158aab53dd9SSage Weil if (s->s_state == CEPH_MDS_SESSION_OPEN ||
5159aab53dd9SSage Weil s->s_state == CEPH_MDS_SESSION_HUNG)
51603d7ded4dSSage Weil ceph_send_cap_releases(mdsc, s);
51612f2dc053SSage Weil mutex_unlock(&s->s_mutex);
51622f2dc053SSage Weil ceph_put_mds_session(s);
51632f2dc053SSage Weil
51642f2dc053SSage Weil mutex_lock(&mdsc->mutex);
51652f2dc053SSage Weil }
51662f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
51672f2dc053SSage Weil
5168bf2ba432SLuis Henriques delay = ceph_check_delayed_caps(mdsc);
516937c4efc1SYan, Zheng
517037c4efc1SYan, Zheng ceph_queue_cap_reclaim_work(mdsc);
517137c4efc1SYan, Zheng
517237c4efc1SYan, Zheng ceph_trim_snapid_map(mdsc);
517337c4efc1SYan, Zheng
5174131d7eb4SYan, Zheng maybe_recover_session(mdsc);
5175131d7eb4SYan, Zheng
5176bf2ba432SLuis Henriques schedule_delayed(mdsc, delay);
51772f2dc053SSage Weil }
51782f2dc053SSage Weil
ceph_mdsc_init(struct ceph_fs_client * fsc)51793d14c5d2SYehuda Sadeh int ceph_mdsc_init(struct ceph_fs_client *fsc)
51802f2dc053SSage Weil
51812f2dc053SSage Weil {
51823d14c5d2SYehuda Sadeh struct ceph_mds_client *mdsc;
5183f9009efaSXiubo Li int err;
51843d14c5d2SYehuda Sadeh
51853d14c5d2SYehuda Sadeh mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
51863d14c5d2SYehuda Sadeh if (!mdsc)
51873d14c5d2SYehuda Sadeh return -ENOMEM;
51883d14c5d2SYehuda Sadeh mdsc->fsc = fsc;
51892f2dc053SSage Weil mutex_init(&mdsc->mutex);
51902f2dc053SSage Weil mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
5191d37b1d99SMarkus Elfring if (!mdsc->mdsmap) {
5192f9009efaSXiubo Li err = -ENOMEM;
5193f9009efaSXiubo Li goto err_mdsc;
5194fb3101b6Smajianpeng }
51952d06eeb8SCheng Renquan
51962f2dc053SSage Weil init_completion(&mdsc->safe_umount_waiters);
5197e3dfcab2SXiubo Li spin_lock_init(&mdsc->stopping_lock);
5198e3dfcab2SXiubo Li atomic_set(&mdsc->stopping_blockers, 0);
5199e3dfcab2SXiubo Li init_completion(&mdsc->stopping_waiter);
5200f3c60c59SSage Weil init_waitqueue_head(&mdsc->session_close_wq);
52012f2dc053SSage Weil INIT_LIST_HEAD(&mdsc->waiting_for_map);
52020c44a8e0SLuis Henriques mdsc->quotarealms_inodes = RB_ROOT;
52030c44a8e0SLuis Henriques mutex_init(&mdsc->quotarealms_inodes_mutex);
52042f2dc053SSage Weil init_rwsem(&mdsc->snap_rwsem);
5205a105f00cSSage Weil mdsc->snap_realms = RB_ROOT;
52062f2dc053SSage Weil INIT_LIST_HEAD(&mdsc->snap_empty);
52072f2dc053SSage Weil spin_lock_init(&mdsc->snap_empty_lock);
520844ca18f2SSage Weil mdsc->request_tree = RB_ROOT;
52092f2dc053SSage Weil INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
52102f2dc053SSage Weil mdsc->last_renew_caps = jiffies;
52112f2dc053SSage Weil INIT_LIST_HEAD(&mdsc->cap_delay_list);
52123a3430afSJeff Layton INIT_LIST_HEAD(&mdsc->cap_wait_list);
52132f2dc053SSage Weil spin_lock_init(&mdsc->cap_delay_lock);
52142f2dc053SSage Weil INIT_LIST_HEAD(&mdsc->snap_flush_list);
52152f2dc053SSage Weil spin_lock_init(&mdsc->snap_flush_lock);
5216553adfd9SYan, Zheng mdsc->last_cap_flush_tid = 1;
5217e4500b5eSYan, Zheng INIT_LIST_HEAD(&mdsc->cap_flush_list);
5218db354052SSage Weil INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
52192f2dc053SSage Weil spin_lock_init(&mdsc->cap_dirty_lock);
52202f2dc053SSage Weil init_waitqueue_head(&mdsc->cap_flushing_wq);
522137c4efc1SYan, Zheng INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
5222f9009efaSXiubo Li err = ceph_metric_init(&mdsc->metric);
5223f9009efaSXiubo Li if (err)
5224f9009efaSXiubo Li goto err_mdsmap;
522537c4efc1SYan, Zheng
522637c4efc1SYan, Zheng spin_lock_init(&mdsc->dentry_list_lock);
522737c4efc1SYan, Zheng INIT_LIST_HEAD(&mdsc->dentry_leases);
522837c4efc1SYan, Zheng INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
52292d06eeb8SCheng Renquan
523037151668SYehuda Sadeh ceph_caps_init(mdsc);
5231fe33032dSYan, Zheng ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
523237151668SYehuda Sadeh
523375c9627eSYan, Zheng spin_lock_init(&mdsc->snapid_map_lock);
523475c9627eSYan, Zheng mdsc->snapid_map_tree = RB_ROOT;
523575c9627eSYan, Zheng INIT_LIST_HEAD(&mdsc->snapid_map_lru);
523675c9627eSYan, Zheng
523710183a69SYan, Zheng init_rwsem(&mdsc->pool_perm_rwsem);
523810183a69SYan, Zheng mdsc->pool_perm_tree = RB_ROOT;
523910183a69SYan, Zheng
5240dfeb84d4SYan, Zheng strscpy(mdsc->nodename, utsname()->nodename,
5241dfeb84d4SYan, Zheng sizeof(mdsc->nodename));
5242a7caa88fSXiubo Li
5243a7caa88fSXiubo Li fsc->mdsc = mdsc;
52445f44f142SSage Weil return 0;
5245f9009efaSXiubo Li
5246f9009efaSXiubo Li err_mdsmap:
5247f9009efaSXiubo Li kfree(mdsc->mdsmap);
5248f9009efaSXiubo Li err_mdsc:
5249f9009efaSXiubo Li kfree(mdsc);
5250f9009efaSXiubo Li return err;
52512f2dc053SSage Weil }
52522f2dc053SSage Weil
52532f2dc053SSage Weil /*
52542f2dc053SSage Weil * Wait for safe replies on open mds requests. If we time out, drop
52552f2dc053SSage Weil * all requests from the tree to avoid dangling dentry refs.
52562f2dc053SSage Weil */
wait_requests(struct ceph_mds_client * mdsc)52572f2dc053SSage Weil static void wait_requests(struct ceph_mds_client *mdsc)
52582f2dc053SSage Weil {
5259a319bf56SIlya Dryomov struct ceph_options *opts = mdsc->fsc->client->options;
52602f2dc053SSage Weil struct ceph_mds_request *req;
52612f2dc053SSage Weil
52622f2dc053SSage Weil mutex_lock(&mdsc->mutex);
526344ca18f2SSage Weil if (__get_oldest_req(mdsc)) {
52642f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
526544ca18f2SSage Weil
52662f2dc053SSage Weil dout("wait_requests waiting for requests\n");
52672f2dc053SSage Weil wait_for_completion_timeout(&mdsc->safe_umount_waiters,
5268a319bf56SIlya Dryomov ceph_timeout_jiffies(opts->mount_timeout));
52692f2dc053SSage Weil
52702f2dc053SSage Weil /* tear down remaining requests */
527144ca18f2SSage Weil mutex_lock(&mdsc->mutex);
527244ca18f2SSage Weil while ((req = __get_oldest_req(mdsc))) {
52732f2dc053SSage Weil dout("wait_requests timed out on tid %llu\n",
52742f2dc053SSage Weil req->r_tid);
5275428138c9SYan, Zheng list_del_init(&req->r_wait);
527644ca18f2SSage Weil __unregister_request(mdsc, req);
52772f2dc053SSage Weil }
52782f2dc053SSage Weil }
52792f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
52802f2dc053SSage Weil dout("wait_requests done\n");
52812f2dc053SSage Weil }
52822f2dc053SSage Weil
send_flush_mdlog(struct ceph_mds_session * s)5283d095559cSXiubo Li void send_flush_mdlog(struct ceph_mds_session *s)
5284d095559cSXiubo Li {
5285d095559cSXiubo Li struct ceph_msg *msg;
5286d095559cSXiubo Li
5287d095559cSXiubo Li /*
5288d095559cSXiubo Li * Pre-luminous MDS crashes when it sees an unknown session request
5289d095559cSXiubo Li */
5290d095559cSXiubo Li if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
5291d095559cSXiubo Li return;
5292d095559cSXiubo Li
5293d095559cSXiubo Li mutex_lock(&s->s_mutex);
5294d095559cSXiubo Li dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
5295d095559cSXiubo Li ceph_session_state_name(s->s_state), s->s_seq);
5296d095559cSXiubo Li msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
5297d095559cSXiubo Li s->s_seq);
5298d095559cSXiubo Li if (!msg) {
5299d095559cSXiubo Li pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
5300d095559cSXiubo Li s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
5301d095559cSXiubo Li } else {
5302d095559cSXiubo Li ceph_con_send(&s->s_con, msg);
5303d095559cSXiubo Li }
5304d095559cSXiubo Li mutex_unlock(&s->s_mutex);
5305d095559cSXiubo Li }
5306d095559cSXiubo Li
53072f2dc053SSage Weil /*
53082f2dc053SSage Weil * called before mount is ro, and before dentries are torn down.
53092f2dc053SSage Weil * (hmm, does this still race with new lookups?)
53102f2dc053SSage Weil */
ceph_mdsc_pre_umount(struct ceph_mds_client * mdsc)53112f2dc053SSage Weil void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
53122f2dc053SSage Weil {
53132f2dc053SSage Weil dout("pre_umount\n");
5314e7e607bdSXiubo Li mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
53152f2dc053SSage Weil
5316d095559cSXiubo Li ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
531759b312f3SXiubo Li ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
5318afcdaea3SSage Weil ceph_flush_dirty_caps(mdsc);
53192f2dc053SSage Weil wait_requests(mdsc);
532017c688c3SSage Weil
532117c688c3SSage Weil /*
532217c688c3SSage Weil * wait for reply handlers to drop their request refs and
532317c688c3SSage Weil * their inode/dcache refs
532417c688c3SSage Weil */
532517c688c3SSage Weil ceph_msgr_flush();
53260c44a8e0SLuis Henriques
53270c44a8e0SLuis Henriques ceph_cleanup_quotarealms_inodes(mdsc);
53282f2dc053SSage Weil }
53292f2dc053SSage Weil
53302f2dc053SSage Weil /*
53311b2ba3c5SXiubo Li * flush the mdlog and wait for all write mds requests to flush.
53322f2dc053SSage Weil */
flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client * mdsc,u64 want_tid)53331b2ba3c5SXiubo Li static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
53341b2ba3c5SXiubo Li u64 want_tid)
53352f2dc053SSage Weil {
533680fc7314SSage Weil struct ceph_mds_request *req = NULL, *nextreq;
53371b2ba3c5SXiubo Li struct ceph_mds_session *last_session = NULL;
533844ca18f2SSage Weil struct rb_node *n;
53392f2dc053SSage Weil
53402f2dc053SSage Weil mutex_lock(&mdsc->mutex);
53411b2ba3c5SXiubo Li dout("%s want %lld\n", __func__, want_tid);
534280fc7314SSage Weil restart:
534344ca18f2SSage Weil req = __get_oldest_req(mdsc);
534444ca18f2SSage Weil while (req && req->r_tid <= want_tid) {
534580fc7314SSage Weil /* find next request */
534680fc7314SSage Weil n = rb_next(&req->r_node);
534780fc7314SSage Weil if (n)
534880fc7314SSage Weil nextreq = rb_entry(n, struct ceph_mds_request, r_node);
534980fc7314SSage Weil else
535080fc7314SSage Weil nextreq = NULL;
5351e8a7b8b1SYan, Zheng if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
5352e8a7b8b1SYan, Zheng (req->r_op & CEPH_MDS_OP_WRITE)) {
53531b2ba3c5SXiubo Li struct ceph_mds_session *s = req->r_session;
53541b2ba3c5SXiubo Li
53551b2ba3c5SXiubo Li if (!s) {
53561b2ba3c5SXiubo Li req = nextreq;
53571b2ba3c5SXiubo Li continue;
53581b2ba3c5SXiubo Li }
53591b2ba3c5SXiubo Li
536044ca18f2SSage Weil /* write op */
53612f2dc053SSage Weil ceph_mdsc_get_request(req);
536280fc7314SSage Weil if (nextreq)
536380fc7314SSage Weil ceph_mdsc_get_request(nextreq);
53641b2ba3c5SXiubo Li s = ceph_get_mds_session(s);
53652f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
53661b2ba3c5SXiubo Li
53671b2ba3c5SXiubo Li /* send flush mdlog request to MDS */
53681b2ba3c5SXiubo Li if (last_session != s) {
53691b2ba3c5SXiubo Li send_flush_mdlog(s);
53701b2ba3c5SXiubo Li ceph_put_mds_session(last_session);
53711b2ba3c5SXiubo Li last_session = s;
53721b2ba3c5SXiubo Li } else {
53731b2ba3c5SXiubo Li ceph_put_mds_session(s);
53741b2ba3c5SXiubo Li }
53751b2ba3c5SXiubo Li dout("%s wait on %llu (want %llu)\n", __func__,
53762f2dc053SSage Weil req->r_tid, want_tid);
53772f2dc053SSage Weil wait_for_completion(&req->r_safe_completion);
53781b2ba3c5SXiubo Li
53792f2dc053SSage Weil mutex_lock(&mdsc->mutex);
53802f2dc053SSage Weil ceph_mdsc_put_request(req);
538180fc7314SSage Weil if (!nextreq)
538280fc7314SSage Weil break; /* next dne before, so we're done! */
538380fc7314SSage Weil if (RB_EMPTY_NODE(&nextreq->r_node)) {
538480fc7314SSage Weil /* next request was removed from tree */
538580fc7314SSage Weil ceph_mdsc_put_request(nextreq);
538680fc7314SSage Weil goto restart;
538744ca18f2SSage Weil }
538880fc7314SSage Weil ceph_mdsc_put_request(nextreq); /* won't go away */
538980fc7314SSage Weil }
539080fc7314SSage Weil req = nextreq;
53912f2dc053SSage Weil }
53922f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
53931b2ba3c5SXiubo Li ceph_put_mds_session(last_session);
53941b2ba3c5SXiubo Li dout("%s done\n", __func__);
53952f2dc053SSage Weil }
53962f2dc053SSage Weil
ceph_mdsc_sync(struct ceph_mds_client * mdsc)53972f2dc053SSage Weil void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
53982f2dc053SSage Weil {
53990e294387SYan, Zheng u64 want_tid, want_flush;
54002f2dc053SSage Weil
540150c9132dSJeff Layton if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
540256b7cf95SSage Weil return;
540356b7cf95SSage Weil
54042f2dc053SSage Weil dout("sync\n");
54052f2dc053SSage Weil mutex_lock(&mdsc->mutex);
54062f2dc053SSage Weil want_tid = mdsc->last_tid;
54072f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
54082f2dc053SSage Weil
5409afcdaea3SSage Weil ceph_flush_dirty_caps(mdsc);
5410d3383a8eSYan, Zheng spin_lock(&mdsc->cap_dirty_lock);
54118310b089SYan, Zheng want_flush = mdsc->last_cap_flush_tid;
5412c8799fc4SYan, Zheng if (!list_empty(&mdsc->cap_flush_list)) {
5413c8799fc4SYan, Zheng struct ceph_cap_flush *cf =
5414c8799fc4SYan, Zheng list_last_entry(&mdsc->cap_flush_list,
5415c8799fc4SYan, Zheng struct ceph_cap_flush, g_list);
5416c8799fc4SYan, Zheng cf->wake = true;
5417c8799fc4SYan, Zheng }
5418d3383a8eSYan, Zheng spin_unlock(&mdsc->cap_dirty_lock);
5419d3383a8eSYan, Zheng
54200e294387SYan, Zheng dout("sync want tid %lld flush_seq %lld\n",
54210e294387SYan, Zheng want_tid, want_flush);
54222f2dc053SSage Weil
54231b2ba3c5SXiubo Li flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
54240e294387SYan, Zheng wait_caps_flush(mdsc, want_flush);
54252f2dc053SSage Weil }
54262f2dc053SSage Weil
5427f3c60c59SSage Weil /*
5428f3c60c59SSage Weil * true if all sessions are closed, or we force unmount
5429f3c60c59SSage Weil */
done_closing_sessions(struct ceph_mds_client * mdsc,int skipped)5430fcff415cSYan, Zheng static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
5431f3c60c59SSage Weil {
543252953d55SSeraphime Kirkovski if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
5433f3c60c59SSage Weil return true;
5434fcff415cSYan, Zheng return atomic_read(&mdsc->num_sessions) <= skipped;
5435f3c60c59SSage Weil }
54362f2dc053SSage Weil
54372f2dc053SSage Weil /*
5438a68e564aSXiubo Li * called after sb is ro or when metadata corrupted.
54392f2dc053SSage Weil */
ceph_mdsc_close_sessions(struct ceph_mds_client * mdsc)54402f2dc053SSage Weil void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
54412f2dc053SSage Weil {
5442a319bf56SIlya Dryomov struct ceph_options *opts = mdsc->fsc->client->options;
54432f2dc053SSage Weil struct ceph_mds_session *session;
54442f2dc053SSage Weil int i;
5445fcff415cSYan, Zheng int skipped = 0;
54462f2dc053SSage Weil
54472f2dc053SSage Weil dout("close_sessions\n");
54482f2dc053SSage Weil
54492f2dc053SSage Weil /* close sessions */
5450f3c60c59SSage Weil mutex_lock(&mdsc->mutex);
54512f2dc053SSage Weil for (i = 0; i < mdsc->max_sessions; i++) {
54522f2dc053SSage Weil session = __ceph_lookup_mds_session(mdsc, i);
54532f2dc053SSage Weil if (!session)
54542f2dc053SSage Weil continue;
54552f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
54562f2dc053SSage Weil mutex_lock(&session->s_mutex);
5457fcff415cSYan, Zheng if (__close_session(mdsc, session) <= 0)
5458fcff415cSYan, Zheng skipped++;
54592f2dc053SSage Weil mutex_unlock(&session->s_mutex);
54602f2dc053SSage Weil ceph_put_mds_session(session);
54612f2dc053SSage Weil mutex_lock(&mdsc->mutex);
54622f2dc053SSage Weil }
5463f3c60c59SSage Weil mutex_unlock(&mdsc->mutex);
54642f2dc053SSage Weil
54652f2dc053SSage Weil dout("waiting for sessions to close\n");
5466fcff415cSYan, Zheng wait_event_timeout(mdsc->session_close_wq,
5467fcff415cSYan, Zheng done_closing_sessions(mdsc, skipped),
5468a319bf56SIlya Dryomov ceph_timeout_jiffies(opts->mount_timeout));
54692f2dc053SSage Weil
54702f2dc053SSage Weil /* tear down remaining sessions */
5471f3c60c59SSage Weil mutex_lock(&mdsc->mutex);
54722f2dc053SSage Weil for (i = 0; i < mdsc->max_sessions; i++) {
54732f2dc053SSage Weil if (mdsc->sessions[i]) {
54745b3248c6SXiubo Li session = ceph_get_mds_session(mdsc->sessions[i]);
54752600d2ddSSage Weil __unregister_session(mdsc, session);
54762f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
54772f2dc053SSage Weil mutex_lock(&session->s_mutex);
54782f2dc053SSage Weil remove_session_caps(session);
54792f2dc053SSage Weil mutex_unlock(&session->s_mutex);
54802f2dc053SSage Weil ceph_put_mds_session(session);
54812f2dc053SSage Weil mutex_lock(&mdsc->mutex);
54822f2dc053SSage Weil }
54832f2dc053SSage Weil }
54842f2dc053SSage Weil WARN_ON(!list_empty(&mdsc->cap_delay_list));
54852f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
54862f2dc053SSage Weil
548775c9627eSYan, Zheng ceph_cleanup_snapid_map(mdsc);
54885ed91587SXiubo Li ceph_cleanup_global_and_empty_realms(mdsc);
54892f2dc053SSage Weil
549037c4efc1SYan, Zheng cancel_work_sync(&mdsc->cap_reclaim_work);
54912f2dc053SSage Weil cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
54922f2dc053SSage Weil
54932f2dc053SSage Weil dout("stopped\n");
54942f2dc053SSage Weil }
54952f2dc053SSage Weil
ceph_mdsc_force_umount(struct ceph_mds_client * mdsc)549648fec5d0SYan, Zheng void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
549748fec5d0SYan, Zheng {
549848fec5d0SYan, Zheng struct ceph_mds_session *session;
549948fec5d0SYan, Zheng int mds;
550048fec5d0SYan, Zheng
550148fec5d0SYan, Zheng dout("force umount\n");
550248fec5d0SYan, Zheng
550348fec5d0SYan, Zheng mutex_lock(&mdsc->mutex);
550448fec5d0SYan, Zheng for (mds = 0; mds < mdsc->max_sessions; mds++) {
550548fec5d0SYan, Zheng session = __ceph_lookup_mds_session(mdsc, mds);
550648fec5d0SYan, Zheng if (!session)
550748fec5d0SYan, Zheng continue;
5508d468e729SYan, Zheng
5509d468e729SYan, Zheng if (session->s_state == CEPH_MDS_SESSION_REJECTED)
5510d468e729SYan, Zheng __unregister_session(mdsc, session);
5511d468e729SYan, Zheng __wake_requests(mdsc, &session->s_waiting);
551248fec5d0SYan, Zheng mutex_unlock(&mdsc->mutex);
5513d468e729SYan, Zheng
551448fec5d0SYan, Zheng mutex_lock(&session->s_mutex);
551548fec5d0SYan, Zheng __close_session(mdsc, session);
551648fec5d0SYan, Zheng if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
551748fec5d0SYan, Zheng cleanup_session_requests(mdsc, session);
551848fec5d0SYan, Zheng remove_session_caps(session);
551948fec5d0SYan, Zheng }
552048fec5d0SYan, Zheng mutex_unlock(&session->s_mutex);
552148fec5d0SYan, Zheng ceph_put_mds_session(session);
5522d468e729SYan, Zheng
552348fec5d0SYan, Zheng mutex_lock(&mdsc->mutex);
552448fec5d0SYan, Zheng kick_requests(mdsc, mds);
552548fec5d0SYan, Zheng }
552648fec5d0SYan, Zheng __wake_requests(mdsc, &mdsc->waiting_for_map);
552748fec5d0SYan, Zheng mutex_unlock(&mdsc->mutex);
552848fec5d0SYan, Zheng }
552948fec5d0SYan, Zheng
ceph_mdsc_stop(struct ceph_mds_client * mdsc)55303d14c5d2SYehuda Sadeh static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
55312f2dc053SSage Weil {
55322f2dc053SSage Weil dout("stop\n");
5533fa996773SXiubo Li /*
5534fa996773SXiubo Li * Make sure the delayed work stopped before releasing
5535fa996773SXiubo Li * the resources.
5536fa996773SXiubo Li *
5537fa996773SXiubo Li * Because the cancel_delayed_work_sync() will only
5538fa996773SXiubo Li * guarantee that the work finishes executing. But the
5539fa996773SXiubo Li * delayed work will re-arm itself again after that.
5540fa996773SXiubo Li */
5541fa996773SXiubo Li flush_delayed_work(&mdsc->delayed_work);
5542fa996773SXiubo Li
55432f2dc053SSage Weil if (mdsc->mdsmap)
55442f2dc053SSage Weil ceph_mdsmap_destroy(mdsc->mdsmap);
55452f2dc053SSage Weil kfree(mdsc->sessions);
554637151668SYehuda Sadeh ceph_caps_finalize(mdsc);
554710183a69SYan, Zheng ceph_pool_perm_destroy(mdsc);
55482f2dc053SSage Weil }
55492f2dc053SSage Weil
ceph_mdsc_destroy(struct ceph_fs_client * fsc)55503d14c5d2SYehuda Sadeh void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
55513d14c5d2SYehuda Sadeh {
55523d14c5d2SYehuda Sadeh struct ceph_mds_client *mdsc = fsc->mdsc;
5553ef550f6fSSage Weil dout("mdsc_destroy %p\n", mdsc);
5554ef550f6fSSage Weil
555550c55aecSChengguang Xu if (!mdsc)
555650c55aecSChengguang Xu return;
555750c55aecSChengguang Xu
5558ef550f6fSSage Weil /* flush out any connection work with references to us */
5559ef550f6fSSage Weil ceph_msgr_flush();
5560ef550f6fSSage Weil
556162a65f36SYan, Zheng ceph_mdsc_stop(mdsc);
556262a65f36SYan, Zheng
5563f9009efaSXiubo Li ceph_metric_destroy(&mdsc->metric);
5564f9009efaSXiubo Li
55653d14c5d2SYehuda Sadeh fsc->mdsc = NULL;
55663d14c5d2SYehuda Sadeh kfree(mdsc);
5567ef550f6fSSage Weil dout("mdsc_destroy %p done\n", mdsc);
55683d14c5d2SYehuda Sadeh }
55693d14c5d2SYehuda Sadeh
ceph_mdsc_handle_fsmap(struct ceph_mds_client * mdsc,struct ceph_msg * msg)5570430afbadSYan, Zheng void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
5571430afbadSYan, Zheng {
5572430afbadSYan, Zheng struct ceph_fs_client *fsc = mdsc->fsc;
5573430afbadSYan, Zheng const char *mds_namespace = fsc->mount_options->mds_namespace;
5574430afbadSYan, Zheng void *p = msg->front.iov_base;
5575430afbadSYan, Zheng void *end = p + msg->front.iov_len;
5576430afbadSYan, Zheng u32 epoch;
5577430afbadSYan, Zheng u32 num_fs;
5578430afbadSYan, Zheng u32 mount_fscid = (u32)-1;
5579430afbadSYan, Zheng int err = -EINVAL;
5580430afbadSYan, Zheng
5581430afbadSYan, Zheng ceph_decode_need(&p, end, sizeof(u32), bad);
5582430afbadSYan, Zheng epoch = ceph_decode_32(&p);
5583430afbadSYan, Zheng
5584430afbadSYan, Zheng dout("handle_fsmap epoch %u\n", epoch);
5585430afbadSYan, Zheng
558606a1ad43SJeff Layton /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
558706a1ad43SJeff Layton ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
5588430afbadSYan, Zheng
558906a1ad43SJeff Layton ceph_decode_32_safe(&p, end, num_fs, bad);
5590430afbadSYan, Zheng while (num_fs-- > 0) {
5591430afbadSYan, Zheng void *info_p, *info_end;
5592430afbadSYan, Zheng u32 info_len;
5593430afbadSYan, Zheng u32 fscid, namelen;
5594430afbadSYan, Zheng
5595430afbadSYan, Zheng ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
559606a1ad43SJeff Layton p += 2; // info_v, info_cv
5597430afbadSYan, Zheng info_len = ceph_decode_32(&p);
5598430afbadSYan, Zheng ceph_decode_need(&p, end, info_len, bad);
5599430afbadSYan, Zheng info_p = p;
5600430afbadSYan, Zheng info_end = p + info_len;
5601430afbadSYan, Zheng p = info_end;
5602430afbadSYan, Zheng
5603430afbadSYan, Zheng ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
5604430afbadSYan, Zheng fscid = ceph_decode_32(&info_p);
5605430afbadSYan, Zheng namelen = ceph_decode_32(&info_p);
5606430afbadSYan, Zheng ceph_decode_need(&info_p, info_end, namelen, bad);
5607430afbadSYan, Zheng
5608430afbadSYan, Zheng if (mds_namespace &&
5609430afbadSYan, Zheng strlen(mds_namespace) == namelen &&
5610430afbadSYan, Zheng !strncmp(mds_namespace, (char *)info_p, namelen)) {
5611430afbadSYan, Zheng mount_fscid = fscid;
5612430afbadSYan, Zheng break;
5613430afbadSYan, Zheng }
5614430afbadSYan, Zheng }
5615430afbadSYan, Zheng
5616430afbadSYan, Zheng ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
5617430afbadSYan, Zheng if (mount_fscid != (u32)-1) {
5618430afbadSYan, Zheng fsc->client->monc.fs_cluster_id = mount_fscid;
5619430afbadSYan, Zheng ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
5620430afbadSYan, Zheng 0, true);
5621430afbadSYan, Zheng ceph_monc_renew_subs(&fsc->client->monc);
5622430afbadSYan, Zheng } else {
5623430afbadSYan, Zheng err = -ENOENT;
5624430afbadSYan, Zheng goto err_out;
5625430afbadSYan, Zheng }
5626430afbadSYan, Zheng return;
562776bd6ec4SIlya Dryomov
5628430afbadSYan, Zheng bad:
5629631ed4b0SJeff Layton pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
5630631ed4b0SJeff Layton ceph_umount_begin(mdsc->fsc->sb);
56318b0da5c5SXiubo Li ceph_msg_dump(msg);
5632430afbadSYan, Zheng err_out:
5633430afbadSYan, Zheng mutex_lock(&mdsc->mutex);
563476bd6ec4SIlya Dryomov mdsc->mdsmap_err = err;
5635430afbadSYan, Zheng __wake_requests(mdsc, &mdsc->waiting_for_map);
5636430afbadSYan, Zheng mutex_unlock(&mdsc->mutex);
5637430afbadSYan, Zheng }
56382f2dc053SSage Weil
56392f2dc053SSage Weil /*
56402f2dc053SSage Weil * handle mds map update.
56412f2dc053SSage Weil */
ceph_mdsc_handle_mdsmap(struct ceph_mds_client * mdsc,struct ceph_msg * msg)5642430afbadSYan, Zheng void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
56432f2dc053SSage Weil {
56442f2dc053SSage Weil u32 epoch;
56452f2dc053SSage Weil u32 maplen;
56462f2dc053SSage Weil void *p = msg->front.iov_base;
56472f2dc053SSage Weil void *end = p + msg->front.iov_len;
56482f2dc053SSage Weil struct ceph_mdsmap *newmap, *oldmap;
56492f2dc053SSage Weil struct ceph_fsid fsid;
56502f2dc053SSage Weil int err = -EINVAL;
56512f2dc053SSage Weil
56522f2dc053SSage Weil ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
56532f2dc053SSage Weil ceph_decode_copy(&p, &fsid, sizeof(fsid));
56543d14c5d2SYehuda Sadeh if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
56552f2dc053SSage Weil return;
5656c89136eaSSage Weil epoch = ceph_decode_32(&p);
5657c89136eaSSage Weil maplen = ceph_decode_32(&p);
56582f2dc053SSage Weil dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
56592f2dc053SSage Weil
56602f2dc053SSage Weil /* do we need it? */
56612f2dc053SSage Weil mutex_lock(&mdsc->mutex);
56622f2dc053SSage Weil if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
56632f2dc053SSage Weil dout("handle_map epoch %u <= our %u\n",
56642f2dc053SSage Weil epoch, mdsc->mdsmap->m_epoch);
56652f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
56662f2dc053SSage Weil return;
56672f2dc053SSage Weil }
56682f2dc053SSage Weil
56692e2023e9SXiubo Li newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
56702f2dc053SSage Weil if (IS_ERR(newmap)) {
56712f2dc053SSage Weil err = PTR_ERR(newmap);
56722f2dc053SSage Weil goto bad_unlock;
56732f2dc053SSage Weil }
56742f2dc053SSage Weil
56752f2dc053SSage Weil /* swap into place */
56762f2dc053SSage Weil if (mdsc->mdsmap) {
56772f2dc053SSage Weil oldmap = mdsc->mdsmap;
56782f2dc053SSage Weil mdsc->mdsmap = newmap;
56792f2dc053SSage Weil check_new_map(mdsc, newmap, oldmap);
56802f2dc053SSage Weil ceph_mdsmap_destroy(oldmap);
56812f2dc053SSage Weil } else {
56822f2dc053SSage Weil mdsc->mdsmap = newmap; /* first mds map */
56832f2dc053SSage Weil }
5684719784baSChengguang Xu mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
5685719784baSChengguang Xu MAX_LFS_FILESIZE);
56862f2dc053SSage Weil
56872f2dc053SSage Weil __wake_requests(mdsc, &mdsc->waiting_for_map);
568882dcabadSIlya Dryomov ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
568982dcabadSIlya Dryomov mdsc->mdsmap->m_epoch);
56902f2dc053SSage Weil
56912f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
5692bf2ba432SLuis Henriques schedule_delayed(mdsc, 0);
56932f2dc053SSage Weil return;
56942f2dc053SSage Weil
56952f2dc053SSage Weil bad_unlock:
56962f2dc053SSage Weil mutex_unlock(&mdsc->mutex);
56972f2dc053SSage Weil bad:
5698631ed4b0SJeff Layton pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
5699631ed4b0SJeff Layton ceph_umount_begin(mdsc->fsc->sb);
57008b0da5c5SXiubo Li ceph_msg_dump(msg);
57012f2dc053SSage Weil return;
57022f2dc053SSage Weil }
57032f2dc053SSage Weil
mds_get_con(struct ceph_connection * con)57044972cf60SIlya Dryomov static struct ceph_connection *mds_get_con(struct ceph_connection *con)
57052f2dc053SSage Weil {
57062f2dc053SSage Weil struct ceph_mds_session *s = con->private;
57072f2dc053SSage Weil
57085b3248c6SXiubo Li if (ceph_get_mds_session(s))
57092f2dc053SSage Weil return con;
57102f2dc053SSage Weil return NULL;
57112f2dc053SSage Weil }
57122f2dc053SSage Weil
mds_put_con(struct ceph_connection * con)57134972cf60SIlya Dryomov static void mds_put_con(struct ceph_connection *con)
57142f2dc053SSage Weil {
57152f2dc053SSage Weil struct ceph_mds_session *s = con->private;
57162f2dc053SSage Weil
57172f2dc053SSage Weil ceph_put_mds_session(s);
57182f2dc053SSage Weil }
57192f2dc053SSage Weil
57202f2dc053SSage Weil /*
57212f2dc053SSage Weil * if the client is unresponsive for long enough, the mds will kill
57222f2dc053SSage Weil * the session entirely.
57232f2dc053SSage Weil */
mds_peer_reset(struct ceph_connection * con)57244972cf60SIlya Dryomov static void mds_peer_reset(struct ceph_connection *con)
57252f2dc053SSage Weil {
57262f2dc053SSage Weil struct ceph_mds_session *s = con->private;
57277e70f0edSSage Weil struct ceph_mds_client *mdsc = s->s_mdsc;
57282f2dc053SSage Weil
5729f3ae1b97SFabian Frederick pr_warn("mds%d closed our session\n", s->s_mds);
5730a68e564aSXiubo Li if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
57317e70f0edSSage Weil send_mds_reconnect(mdsc, s);
57322f2dc053SSage Weil }
57332f2dc053SSage Weil
mds_dispatch(struct ceph_connection * con,struct ceph_msg * msg)57344972cf60SIlya Dryomov static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
57352f2dc053SSage Weil {
57362f2dc053SSage Weil struct ceph_mds_session *s = con->private;
57372f2dc053SSage Weil struct ceph_mds_client *mdsc = s->s_mdsc;
57382f2dc053SSage Weil int type = le16_to_cpu(msg->hdr.type);
57392f2dc053SSage Weil
57402600d2ddSSage Weil mutex_lock(&mdsc->mutex);
57412600d2ddSSage Weil if (__verify_registered_session(mdsc, s) < 0) {
57422600d2ddSSage Weil mutex_unlock(&mdsc->mutex);
57432600d2ddSSage Weil goto out;
57442600d2ddSSage Weil }
57452600d2ddSSage Weil mutex_unlock(&mdsc->mutex);
57462600d2ddSSage Weil
57472f2dc053SSage Weil switch (type) {
57482f2dc053SSage Weil case CEPH_MSG_MDS_MAP:
5749430afbadSYan, Zheng ceph_mdsc_handle_mdsmap(mdsc, msg);
5750430afbadSYan, Zheng break;
5751430afbadSYan, Zheng case CEPH_MSG_FS_MAP_USER:
5752430afbadSYan, Zheng ceph_mdsc_handle_fsmap(mdsc, msg);
57532f2dc053SSage Weil break;
57542f2dc053SSage Weil case CEPH_MSG_CLIENT_SESSION:
57552f2dc053SSage Weil handle_session(s, msg);
57562f2dc053SSage Weil break;
57572f2dc053SSage Weil case CEPH_MSG_CLIENT_REPLY:
57582f2dc053SSage Weil handle_reply(s, msg);
57592f2dc053SSage Weil break;
57602f2dc053SSage Weil case CEPH_MSG_CLIENT_REQUEST_FORWARD:
57612600d2ddSSage Weil handle_forward(mdsc, s, msg);
57622f2dc053SSage Weil break;
57632f2dc053SSage Weil case CEPH_MSG_CLIENT_CAPS:
57642f2dc053SSage Weil ceph_handle_caps(s, msg);
57652f2dc053SSage Weil break;
57662f2dc053SSage Weil case CEPH_MSG_CLIENT_SNAP:
57672600d2ddSSage Weil ceph_handle_snap(mdsc, s, msg);
57682f2dc053SSage Weil break;
57692f2dc053SSage Weil case CEPH_MSG_CLIENT_LEASE:
57702600d2ddSSage Weil handle_lease(mdsc, s, msg);
57712f2dc053SSage Weil break;
5772fb18a575SLuis Henriques case CEPH_MSG_CLIENT_QUOTA:
5773fb18a575SLuis Henriques ceph_handle_quota(mdsc, s, msg);
5774fb18a575SLuis Henriques break;
57752f2dc053SSage Weil
57762f2dc053SSage Weil default:
57772f2dc053SSage Weil pr_err("received unknown message type %d %s\n", type,
57782f2dc053SSage Weil ceph_msg_type_name(type));
57792f2dc053SSage Weil }
57802600d2ddSSage Weil out:
57812f2dc053SSage Weil ceph_msg_put(msg);
57822f2dc053SSage Weil }
57832f2dc053SSage Weil
57844e7a5dcdSSage Weil /*
57854e7a5dcdSSage Weil * authentication
57864e7a5dcdSSage Weil */
5787a3530df3SAlex Elder
5788a3530df3SAlex Elder /*
5789a3530df3SAlex Elder * Note: returned pointer is the address of a structure that's
5790a3530df3SAlex Elder * managed separately. Caller must *not* attempt to free it.
5791a3530df3SAlex Elder */
57924972cf60SIlya Dryomov static struct ceph_auth_handshake *
mds_get_authorizer(struct ceph_connection * con,int * proto,int force_new)57934972cf60SIlya Dryomov mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
57944e7a5dcdSSage Weil {
57954e7a5dcdSSage Weil struct ceph_mds_session *s = con->private;
57964e7a5dcdSSage Weil struct ceph_mds_client *mdsc = s->s_mdsc;
57973d14c5d2SYehuda Sadeh struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
579874f1869fSAlex Elder struct ceph_auth_handshake *auth = &s->s_auth;
5799ce287162SIlya Dryomov int ret;
58004e7a5dcdSSage Weil
5801ce287162SIlya Dryomov ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
5802ce287162SIlya Dryomov force_new, proto, NULL, NULL);
58030bed9b5cSSage Weil if (ret)
58040bed9b5cSSage Weil return ERR_PTR(ret);
580574f1869fSAlex Elder
5806a3530df3SAlex Elder return auth;
58074e7a5dcdSSage Weil }
58084e7a5dcdSSage Weil
mds_add_authorizer_challenge(struct ceph_connection * con,void * challenge_buf,int challenge_buf_len)58094972cf60SIlya Dryomov static int mds_add_authorizer_challenge(struct ceph_connection *con,
58106daca13dSIlya Dryomov void *challenge_buf, int challenge_buf_len)
58116daca13dSIlya Dryomov {
58126daca13dSIlya Dryomov struct ceph_mds_session *s = con->private;
58136daca13dSIlya Dryomov struct ceph_mds_client *mdsc = s->s_mdsc;
58146daca13dSIlya Dryomov struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
58156daca13dSIlya Dryomov
58166daca13dSIlya Dryomov return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
58176daca13dSIlya Dryomov challenge_buf, challenge_buf_len);
58186daca13dSIlya Dryomov }
58194e7a5dcdSSage Weil
mds_verify_authorizer_reply(struct ceph_connection * con)58204972cf60SIlya Dryomov static int mds_verify_authorizer_reply(struct ceph_connection *con)
58214e7a5dcdSSage Weil {
58224e7a5dcdSSage Weil struct ceph_mds_session *s = con->private;
58234e7a5dcdSSage Weil struct ceph_mds_client *mdsc = s->s_mdsc;
58243d14c5d2SYehuda Sadeh struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
5825285ea34fSIlya Dryomov struct ceph_auth_handshake *auth = &s->s_auth;
58264e7a5dcdSSage Weil
5827285ea34fSIlya Dryomov return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
5828285ea34fSIlya Dryomov auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
5829285ea34fSIlya Dryomov NULL, NULL, NULL, NULL);
58304e7a5dcdSSage Weil }
58314e7a5dcdSSage Weil
mds_invalidate_authorizer(struct ceph_connection * con)58324972cf60SIlya Dryomov static int mds_invalidate_authorizer(struct ceph_connection *con)
58339bd2e6f8SSage Weil {
58349bd2e6f8SSage Weil struct ceph_mds_session *s = con->private;
58359bd2e6f8SSage Weil struct ceph_mds_client *mdsc = s->s_mdsc;
58363d14c5d2SYehuda Sadeh struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
58379bd2e6f8SSage Weil
583827859f97SSage Weil ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
58399bd2e6f8SSage Weil
58403d14c5d2SYehuda Sadeh return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
58419bd2e6f8SSage Weil }
58429bd2e6f8SSage Weil
mds_get_auth_request(struct ceph_connection * con,void * buf,int * buf_len,void ** authorizer,int * authorizer_len)5843cd1a677cSIlya Dryomov static int mds_get_auth_request(struct ceph_connection *con,
5844cd1a677cSIlya Dryomov void *buf, int *buf_len,
5845cd1a677cSIlya Dryomov void **authorizer, int *authorizer_len)
5846cd1a677cSIlya Dryomov {
5847cd1a677cSIlya Dryomov struct ceph_mds_session *s = con->private;
5848cd1a677cSIlya Dryomov struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
5849cd1a677cSIlya Dryomov struct ceph_auth_handshake *auth = &s->s_auth;
5850cd1a677cSIlya Dryomov int ret;
5851cd1a677cSIlya Dryomov
5852cd1a677cSIlya Dryomov ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
5853cd1a677cSIlya Dryomov buf, buf_len);
5854cd1a677cSIlya Dryomov if (ret)
5855cd1a677cSIlya Dryomov return ret;
5856cd1a677cSIlya Dryomov
5857cd1a677cSIlya Dryomov *authorizer = auth->authorizer_buf;
5858cd1a677cSIlya Dryomov *authorizer_len = auth->authorizer_buf_len;
5859cd1a677cSIlya Dryomov return 0;
5860cd1a677cSIlya Dryomov }
5861cd1a677cSIlya Dryomov
mds_handle_auth_reply_more(struct ceph_connection * con,void * reply,int reply_len,void * buf,int * buf_len,void ** authorizer,int * authorizer_len)5862cd1a677cSIlya Dryomov static int mds_handle_auth_reply_more(struct ceph_connection *con,
5863cd1a677cSIlya Dryomov void *reply, int reply_len,
5864cd1a677cSIlya Dryomov void *buf, int *buf_len,
5865cd1a677cSIlya Dryomov void **authorizer, int *authorizer_len)
5866cd1a677cSIlya Dryomov {
5867cd1a677cSIlya Dryomov struct ceph_mds_session *s = con->private;
5868cd1a677cSIlya Dryomov struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
5869cd1a677cSIlya Dryomov struct ceph_auth_handshake *auth = &s->s_auth;
5870cd1a677cSIlya Dryomov int ret;
5871cd1a677cSIlya Dryomov
5872cd1a677cSIlya Dryomov ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
5873cd1a677cSIlya Dryomov buf, buf_len);
5874cd1a677cSIlya Dryomov if (ret)
5875cd1a677cSIlya Dryomov return ret;
5876cd1a677cSIlya Dryomov
5877cd1a677cSIlya Dryomov *authorizer = auth->authorizer_buf;
5878cd1a677cSIlya Dryomov *authorizer_len = auth->authorizer_buf_len;
5879cd1a677cSIlya Dryomov return 0;
5880cd1a677cSIlya Dryomov }
5881cd1a677cSIlya Dryomov
/*
 * Forward the reply to ceph_auth_handle_svc_reply_done() using this
 * session's handshake state; session_key/con_secret and their length
 * pointers are passed through unchanged.  global_id is not used here.
 */
static int mds_handle_auth_done(struct ceph_connection *con,
				u64 global_id, void *reply, int reply_len,
				u8 *session_key, int *session_key_len,
				u8 *con_secret, int *con_secret_len)
{
	struct ceph_mds_session *session = con->private;
	struct ceph_auth_client *ac;

	ac = session->s_mdsc->fsc->client->monc.auth;

	return ceph_auth_handle_svc_reply_done(ac, &session->s_auth,
					       reply, reply_len,
					       session_key, session_key_len,
					       con_secret, con_secret_len);
}
5895cd1a677cSIlya Dryomov
mds_handle_auth_bad_method(struct ceph_connection * con,int used_proto,int result,const int * allowed_protos,int proto_cnt,const int * allowed_modes,int mode_cnt)5896cd1a677cSIlya Dryomov static int mds_handle_auth_bad_method(struct ceph_connection *con,
5897cd1a677cSIlya Dryomov int used_proto, int result,
5898cd1a677cSIlya Dryomov const int *allowed_protos, int proto_cnt,
5899cd1a677cSIlya Dryomov const int *allowed_modes, int mode_cnt)
5900cd1a677cSIlya Dryomov {
5901cd1a677cSIlya Dryomov struct ceph_mds_session *s = con->private;
5902cd1a677cSIlya Dryomov struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
5903cd1a677cSIlya Dryomov int ret;
5904cd1a677cSIlya Dryomov
5905cd1a677cSIlya Dryomov if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
5906cd1a677cSIlya Dryomov used_proto, result,
5907cd1a677cSIlya Dryomov allowed_protos, proto_cnt,
5908cd1a677cSIlya Dryomov allowed_modes, mode_cnt)) {
5909cd1a677cSIlya Dryomov ret = ceph_monc_validate_auth(monc);
5910cd1a677cSIlya Dryomov if (ret)
5911cd1a677cSIlya Dryomov return ret;
5912cd1a677cSIlya Dryomov }
5913cd1a677cSIlya Dryomov
5914cd1a677cSIlya Dryomov return -EACCES;
5915cd1a677cSIlya Dryomov }
5916cd1a677cSIlya Dryomov
mds_alloc_msg(struct ceph_connection * con,struct ceph_msg_header * hdr,int * skip)591753ded495SAlex Elder static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
591853ded495SAlex Elder struct ceph_msg_header *hdr, int *skip)
591953ded495SAlex Elder {
592053ded495SAlex Elder struct ceph_msg *msg;
592153ded495SAlex Elder int type = (int) le16_to_cpu(hdr->type);
592253ded495SAlex Elder int front_len = (int) le32_to_cpu(hdr->front_len);
592353ded495SAlex Elder
592453ded495SAlex Elder if (con->in_msg)
592553ded495SAlex Elder return con->in_msg;
592653ded495SAlex Elder
592753ded495SAlex Elder *skip = 0;
592853ded495SAlex Elder msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
592953ded495SAlex Elder if (!msg) {
593053ded495SAlex Elder pr_err("unable to allocate msg type %d len %d\n",
593153ded495SAlex Elder type, front_len);
593253ded495SAlex Elder return NULL;
593353ded495SAlex Elder }
593453ded495SAlex Elder
593553ded495SAlex Elder return msg;
593653ded495SAlex Elder }
593753ded495SAlex Elder
mds_sign_message(struct ceph_msg * msg)593879dbd1baSIlya Dryomov static int mds_sign_message(struct ceph_msg *msg)
593933d07337SYan, Zheng {
594079dbd1baSIlya Dryomov struct ceph_mds_session *s = msg->con->private;
594133d07337SYan, Zheng struct ceph_auth_handshake *auth = &s->s_auth;
594279dbd1baSIlya Dryomov
594333d07337SYan, Zheng return ceph_auth_sign_message(auth, msg);
594433d07337SYan, Zheng }
594533d07337SYan, Zheng
mds_check_message_signature(struct ceph_msg * msg)594679dbd1baSIlya Dryomov static int mds_check_message_signature(struct ceph_msg *msg)
594733d07337SYan, Zheng {
594879dbd1baSIlya Dryomov struct ceph_mds_session *s = msg->con->private;
594933d07337SYan, Zheng struct ceph_auth_handshake *auth = &s->s_auth;
595079dbd1baSIlya Dryomov
595133d07337SYan, Zheng return ceph_auth_check_message_signature(auth, msg);
595233d07337SYan, Zheng }
595333d07337SYan, Zheng
59549e32789fSTobias Klauser static const struct ceph_connection_operations mds_con_ops = {
59554972cf60SIlya Dryomov .get = mds_get_con,
59564972cf60SIlya Dryomov .put = mds_put_con,
595753ded495SAlex Elder .alloc_msg = mds_alloc_msg,
59584972cf60SIlya Dryomov .dispatch = mds_dispatch,
59594972cf60SIlya Dryomov .peer_reset = mds_peer_reset,
59604972cf60SIlya Dryomov .get_authorizer = mds_get_authorizer,
59614972cf60SIlya Dryomov .add_authorizer_challenge = mds_add_authorizer_challenge,
59624972cf60SIlya Dryomov .verify_authorizer_reply = mds_verify_authorizer_reply,
59634972cf60SIlya Dryomov .invalidate_authorizer = mds_invalidate_authorizer,
596479dbd1baSIlya Dryomov .sign_message = mds_sign_message,
596579dbd1baSIlya Dryomov .check_message_signature = mds_check_message_signature,
5966cd1a677cSIlya Dryomov .get_auth_request = mds_get_auth_request,
5967cd1a677cSIlya Dryomov .handle_auth_reply_more = mds_handle_auth_reply_more,
5968cd1a677cSIlya Dryomov .handle_auth_done = mds_handle_auth_done,
5969cd1a677cSIlya Dryomov .handle_auth_bad_method = mds_handle_auth_bad_method,
59702f2dc053SSage Weil };
59712f2dc053SSage Weil
59722f2dc053SSage Weil /* eof */
5973