xref: /openbmc/linux/fs/ceph/caps.c (revision 985b9ee8)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
23d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
3a8599bd8SSage Weil 
4a8599bd8SSage Weil #include <linux/fs.h>
5a8599bd8SSage Weil #include <linux/kernel.h>
6174cd4b1SIngo Molnar #include <linux/sched/signal.h>
75a0e3ad6STejun Heo #include <linux/slab.h>
8a8599bd8SSage Weil #include <linux/vmalloc.h>
9a8599bd8SSage Weil #include <linux/wait.h>
10f1a3d572SStephen Rothwell #include <linux/writeback.h>
11176c77c9SJeff Layton #include <linux/iversion.h>
125970e15dSJeff Layton #include <linux/filelock.h>
13a8599bd8SSage Weil 
14a8599bd8SSage Weil #include "super.h"
153d14c5d2SYehuda Sadeh #include "mds_client.h"
1699ccbd22SMilosz Tanski #include "cache.h"
172d332d5bSJeff Layton #include "crypto.h"
183d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
193d14c5d2SYehuda Sadeh #include <linux/ceph/messenger.h>
20a8599bd8SSage Weil 
/*
 * Capability management
 *
 * The Ceph metadata servers control client access to inode metadata
 * and file data by issuing capabilities, granting clients permission
 * to read and/or write both inode fields and file data to OSDs
 * (storage nodes).  Each capability consists of a set of bits
 * indicating which operations are allowed.
 *
 * If the client holds a *_SHARED cap, the client has a coherent value
 * that can be safely read from the cached inode.
 *
 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
 * client is allowed to change inode attributes (e.g., file size,
 * mtime), note its dirty state in the ceph_cap, and asynchronously
 * flush that metadata change to the MDS.
 *
 * In the event of a conflicting operation (perhaps by another
 * client), the MDS will revoke the conflicting client capabilities.
 *
 * In order for a client to cache an inode, it must hold a capability
 * with at least one MDS server.  When inodes are released, release
 * notifications are batched and periodically sent en masse to the MDS
 * cluster to release server state.
 */
46a8599bd8SSage Weil 
470e294387SYan, Zheng static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
487bc00fddSYan, Zheng static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
497bc00fddSYan, Zheng 				 struct ceph_mds_session *session,
507bc00fddSYan, Zheng 				 struct ceph_inode_info *ci,
517bc00fddSYan, Zheng 				 u64 oldest_flush_tid);
52a8599bd8SSage Weil 
/*
 * Generate readable cap strings for debugging output.
 *
 * cap_str is a ring of MAX_CAP_STR fixed-size buffers; last_cap_str is
 * the index of the next slot to hand out and is advanced under
 * cap_str_lock.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];
static DEFINE_SPINLOCK(cap_str_lock);
static int last_cap_str;
60a8599bd8SSage Weil 
gcap_string(char * s,int c)61a8599bd8SSage Weil static char *gcap_string(char *s, int c)
62a8599bd8SSage Weil {
63a8599bd8SSage Weil 	if (c & CEPH_CAP_GSHARED)
64a8599bd8SSage Weil 		*s++ = 's';
65a8599bd8SSage Weil 	if (c & CEPH_CAP_GEXCL)
66a8599bd8SSage Weil 		*s++ = 'x';
67a8599bd8SSage Weil 	if (c & CEPH_CAP_GCACHE)
68a8599bd8SSage Weil 		*s++ = 'c';
69a8599bd8SSage Weil 	if (c & CEPH_CAP_GRD)
70a8599bd8SSage Weil 		*s++ = 'r';
71a8599bd8SSage Weil 	if (c & CEPH_CAP_GWR)
72a8599bd8SSage Weil 		*s++ = 'w';
73a8599bd8SSage Weil 	if (c & CEPH_CAP_GBUFFER)
74a8599bd8SSage Weil 		*s++ = 'b';
7549a9f4f6SYan, Zheng 	if (c & CEPH_CAP_GWREXTEND)
7649a9f4f6SYan, Zheng 		*s++ = 'a';
77a8599bd8SSage Weil 	if (c & CEPH_CAP_GLAZYIO)
78a8599bd8SSage Weil 		*s++ = 'l';
79a8599bd8SSage Weil 	return s;
80a8599bd8SSage Weil }
81a8599bd8SSage Weil 
ceph_cap_string(int caps)82a8599bd8SSage Weil const char *ceph_cap_string(int caps)
83a8599bd8SSage Weil {
84a8599bd8SSage Weil 	int i;
85a8599bd8SSage Weil 	char *s;
86a8599bd8SSage Weil 	int c;
87a8599bd8SSage Weil 
88a8599bd8SSage Weil 	spin_lock(&cap_str_lock);
89a8599bd8SSage Weil 	i = last_cap_str++;
90a8599bd8SSage Weil 	if (last_cap_str == MAX_CAP_STR)
91a8599bd8SSage Weil 		last_cap_str = 0;
92a8599bd8SSage Weil 	spin_unlock(&cap_str_lock);
93a8599bd8SSage Weil 
94a8599bd8SSage Weil 	s = cap_str[i];
95a8599bd8SSage Weil 
96a8599bd8SSage Weil 	if (caps & CEPH_CAP_PIN)
97a8599bd8SSage Weil 		*s++ = 'p';
98a8599bd8SSage Weil 
99a8599bd8SSage Weil 	c = (caps >> CEPH_CAP_SAUTH) & 3;
100a8599bd8SSage Weil 	if (c) {
101a8599bd8SSage Weil 		*s++ = 'A';
102a8599bd8SSage Weil 		s = gcap_string(s, c);
103a8599bd8SSage Weil 	}
104a8599bd8SSage Weil 
105a8599bd8SSage Weil 	c = (caps >> CEPH_CAP_SLINK) & 3;
106a8599bd8SSage Weil 	if (c) {
107a8599bd8SSage Weil 		*s++ = 'L';
108a8599bd8SSage Weil 		s = gcap_string(s, c);
109a8599bd8SSage Weil 	}
110a8599bd8SSage Weil 
111a8599bd8SSage Weil 	c = (caps >> CEPH_CAP_SXATTR) & 3;
112a8599bd8SSage Weil 	if (c) {
113a8599bd8SSage Weil 		*s++ = 'X';
114a8599bd8SSage Weil 		s = gcap_string(s, c);
115a8599bd8SSage Weil 	}
116a8599bd8SSage Weil 
117a8599bd8SSage Weil 	c = caps >> CEPH_CAP_SFILE;
118a8599bd8SSage Weil 	if (c) {
119a8599bd8SSage Weil 		*s++ = 'F';
120a8599bd8SSage Weil 		s = gcap_string(s, c);
121a8599bd8SSage Weil 	}
122a8599bd8SSage Weil 
123a8599bd8SSage Weil 	if (s == cap_str[i])
124a8599bd8SSage Weil 		*s++ = '-';
125a8599bd8SSage Weil 	*s = 0;
126a8599bd8SSage Weil 	return cap_str[i];
127a8599bd8SSage Weil }
128a8599bd8SSage Weil 
ceph_caps_init(struct ceph_mds_client * mdsc)12937151668SYehuda Sadeh void ceph_caps_init(struct ceph_mds_client *mdsc)
130a8599bd8SSage Weil {
13137151668SYehuda Sadeh 	INIT_LIST_HEAD(&mdsc->caps_list);
13237151668SYehuda Sadeh 	spin_lock_init(&mdsc->caps_list_lock);
133a8599bd8SSage Weil }
134a8599bd8SSage Weil 
ceph_caps_finalize(struct ceph_mds_client * mdsc)13537151668SYehuda Sadeh void ceph_caps_finalize(struct ceph_mds_client *mdsc)
136a8599bd8SSage Weil {
137a8599bd8SSage Weil 	struct ceph_cap *cap;
138a8599bd8SSage Weil 
13937151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
14037151668SYehuda Sadeh 	while (!list_empty(&mdsc->caps_list)) {
14137151668SYehuda Sadeh 		cap = list_first_entry(&mdsc->caps_list,
14237151668SYehuda Sadeh 				       struct ceph_cap, caps_item);
143a8599bd8SSage Weil 		list_del(&cap->caps_item);
144a8599bd8SSage Weil 		kmem_cache_free(ceph_cap_cachep, cap);
145a8599bd8SSage Weil 	}
14637151668SYehuda Sadeh 	mdsc->caps_total_count = 0;
14737151668SYehuda Sadeh 	mdsc->caps_avail_count = 0;
14837151668SYehuda Sadeh 	mdsc->caps_use_count = 0;
14937151668SYehuda Sadeh 	mdsc->caps_reserve_count = 0;
15037151668SYehuda Sadeh 	mdsc->caps_min_count = 0;
15137151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
15285ccce43SSage Weil }
15385ccce43SSage Weil 
ceph_adjust_caps_max_min(struct ceph_mds_client * mdsc,struct ceph_mount_options * fsopt)154fe33032dSYan, Zheng void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
155fe33032dSYan, Zheng 			      struct ceph_mount_options *fsopt)
15685ccce43SSage Weil {
15737151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
158fe33032dSYan, Zheng 	mdsc->caps_min_count = fsopt->max_readdir;
159fe33032dSYan, Zheng 	if (mdsc->caps_min_count < 1024)
160fe33032dSYan, Zheng 		mdsc->caps_min_count = 1024;
161fe33032dSYan, Zheng 	mdsc->caps_use_max = fsopt->caps_max;
162fe33032dSYan, Zheng 	if (mdsc->caps_use_max > 0 &&
163fe33032dSYan, Zheng 	    mdsc->caps_use_max < mdsc->caps_min_count)
164fe33032dSYan, Zheng 		mdsc->caps_use_max = mdsc->caps_min_count;
16537151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
166a8599bd8SSage Weil }
167a8599bd8SSage Weil 
__ceph_unreserve_caps(struct ceph_mds_client * mdsc,int nr_caps)1687bf8f736SChengguang Xu static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
1697bf8f736SChengguang Xu {
1707bf8f736SChengguang Xu 	struct ceph_cap *cap;
1717bf8f736SChengguang Xu 	int i;
1727bf8f736SChengguang Xu 
1737bf8f736SChengguang Xu 	if (nr_caps) {
1747bf8f736SChengguang Xu 		BUG_ON(mdsc->caps_reserve_count < nr_caps);
1757bf8f736SChengguang Xu 		mdsc->caps_reserve_count -= nr_caps;
1767bf8f736SChengguang Xu 		if (mdsc->caps_avail_count >=
1777bf8f736SChengguang Xu 		    mdsc->caps_reserve_count + mdsc->caps_min_count) {
1787bf8f736SChengguang Xu 			mdsc->caps_total_count -= nr_caps;
1797bf8f736SChengguang Xu 			for (i = 0; i < nr_caps; i++) {
1807bf8f736SChengguang Xu 				cap = list_first_entry(&mdsc->caps_list,
1817bf8f736SChengguang Xu 					struct ceph_cap, caps_item);
1827bf8f736SChengguang Xu 				list_del(&cap->caps_item);
1837bf8f736SChengguang Xu 				kmem_cache_free(ceph_cap_cachep, cap);
1847bf8f736SChengguang Xu 			}
1857bf8f736SChengguang Xu 		} else {
1867bf8f736SChengguang Xu 			mdsc->caps_avail_count += nr_caps;
1877bf8f736SChengguang Xu 		}
1887bf8f736SChengguang Xu 
1897bf8f736SChengguang Xu 		dout("%s: caps %d = %d used + %d resv + %d avail\n",
1907bf8f736SChengguang Xu 		     __func__,
1917bf8f736SChengguang Xu 		     mdsc->caps_total_count, mdsc->caps_use_count,
1927bf8f736SChengguang Xu 		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
1937bf8f736SChengguang Xu 		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
1947bf8f736SChengguang Xu 						 mdsc->caps_reserve_count +
1957bf8f736SChengguang Xu 						 mdsc->caps_avail_count);
1967bf8f736SChengguang Xu 	}
1977bf8f736SChengguang Xu }
1987bf8f736SChengguang Xu 
199e30ee581SZhi Zhang /*
200e30ee581SZhi Zhang  * Called under mdsc->mutex.
201e30ee581SZhi Zhang  */
ceph_reserve_caps(struct ceph_mds_client * mdsc,struct ceph_cap_reservation * ctx,int need)202e30ee581SZhi Zhang int ceph_reserve_caps(struct ceph_mds_client *mdsc,
20337151668SYehuda Sadeh 		      struct ceph_cap_reservation *ctx, int need)
204a8599bd8SSage Weil {
205e30ee581SZhi Zhang 	int i, j;
206a8599bd8SSage Weil 	struct ceph_cap *cap;
207a8599bd8SSage Weil 	int have;
208a8599bd8SSage Weil 	int alloc = 0;
209e30ee581SZhi Zhang 	int max_caps;
210e5bc08d0SChengguang Xu 	int err = 0;
211e30ee581SZhi Zhang 	bool trimmed = false;
212e30ee581SZhi Zhang 	struct ceph_mds_session *s;
213a8599bd8SSage Weil 	LIST_HEAD(newcaps);
214a8599bd8SSage Weil 
215a8599bd8SSage Weil 	dout("reserve caps ctx=%p need=%d\n", ctx, need);
216a8599bd8SSage Weil 
217a8599bd8SSage Weil 	/* first reserve any caps that are already allocated */
21837151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
21937151668SYehuda Sadeh 	if (mdsc->caps_avail_count >= need)
220a8599bd8SSage Weil 		have = need;
221a8599bd8SSage Weil 	else
22237151668SYehuda Sadeh 		have = mdsc->caps_avail_count;
22337151668SYehuda Sadeh 	mdsc->caps_avail_count -= have;
22437151668SYehuda Sadeh 	mdsc->caps_reserve_count += have;
22537151668SYehuda Sadeh 	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
22637151668SYehuda Sadeh 					 mdsc->caps_reserve_count +
22737151668SYehuda Sadeh 					 mdsc->caps_avail_count);
22837151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
229a8599bd8SSage Weil 
23079cd674aSChengguang Xu 	for (i = have; i < need; ) {
231a8599bd8SSage Weil 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
23279cd674aSChengguang Xu 		if (cap) {
23379cd674aSChengguang Xu 			list_add(&cap->caps_item, &newcaps);
23479cd674aSChengguang Xu 			alloc++;
23579cd674aSChengguang Xu 			i++;
23679cd674aSChengguang Xu 			continue;
23779cd674aSChengguang Xu 		}
23879cd674aSChengguang Xu 
239e30ee581SZhi Zhang 		if (!trimmed) {
240e30ee581SZhi Zhang 			for (j = 0; j < mdsc->max_sessions; j++) {
241e30ee581SZhi Zhang 				s = __ceph_lookup_mds_session(mdsc, j);
242e30ee581SZhi Zhang 				if (!s)
243e30ee581SZhi Zhang 					continue;
244e30ee581SZhi Zhang 				mutex_unlock(&mdsc->mutex);
245e30ee581SZhi Zhang 
246e30ee581SZhi Zhang 				mutex_lock(&s->s_mutex);
247e30ee581SZhi Zhang 				max_caps = s->s_nr_caps - (need - i);
248e30ee581SZhi Zhang 				ceph_trim_caps(mdsc, s, max_caps);
249e30ee581SZhi Zhang 				mutex_unlock(&s->s_mutex);
250e30ee581SZhi Zhang 
251e30ee581SZhi Zhang 				ceph_put_mds_session(s);
252e30ee581SZhi Zhang 				mutex_lock(&mdsc->mutex);
253e30ee581SZhi Zhang 			}
254e30ee581SZhi Zhang 			trimmed = true;
25579cd674aSChengguang Xu 
25679cd674aSChengguang Xu 			spin_lock(&mdsc->caps_list_lock);
25779cd674aSChengguang Xu 			if (mdsc->caps_avail_count) {
25879cd674aSChengguang Xu 				int more_have;
25979cd674aSChengguang Xu 				if (mdsc->caps_avail_count >= need - i)
26079cd674aSChengguang Xu 					more_have = need - i;
26179cd674aSChengguang Xu 				else
26279cd674aSChengguang Xu 					more_have = mdsc->caps_avail_count;
26379cd674aSChengguang Xu 
26479cd674aSChengguang Xu 				i += more_have;
26579cd674aSChengguang Xu 				have += more_have;
26679cd674aSChengguang Xu 				mdsc->caps_avail_count -= more_have;
26779cd674aSChengguang Xu 				mdsc->caps_reserve_count += more_have;
26879cd674aSChengguang Xu 
26979cd674aSChengguang Xu 			}
27079cd674aSChengguang Xu 			spin_unlock(&mdsc->caps_list_lock);
27179cd674aSChengguang Xu 
27279cd674aSChengguang Xu 			continue;
27379cd674aSChengguang Xu 		}
27479cd674aSChengguang Xu 
27579cd674aSChengguang Xu 		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
276e30ee581SZhi Zhang 			ctx, need, have + alloc);
277e5bc08d0SChengguang Xu 		err = -ENOMEM;
278e5bc08d0SChengguang Xu 		break;
279e30ee581SZhi Zhang 	}
280e5bc08d0SChengguang Xu 
281e5bc08d0SChengguang Xu 	if (!err) {
282e30ee581SZhi Zhang 		BUG_ON(have + alloc != need);
283e5bc08d0SChengguang Xu 		ctx->count = need;
284fe33032dSYan, Zheng 		ctx->used = 0;
285e5bc08d0SChengguang Xu 	}
286a8599bd8SSage Weil 
28737151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
28837151668SYehuda Sadeh 	mdsc->caps_total_count += alloc;
28937151668SYehuda Sadeh 	mdsc->caps_reserve_count += alloc;
29037151668SYehuda Sadeh 	list_splice(&newcaps, &mdsc->caps_list);
291a8599bd8SSage Weil 
29237151668SYehuda Sadeh 	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
29337151668SYehuda Sadeh 					 mdsc->caps_reserve_count +
29437151668SYehuda Sadeh 					 mdsc->caps_avail_count);
295e5bc08d0SChengguang Xu 
296e5bc08d0SChengguang Xu 	if (err)
297e5bc08d0SChengguang Xu 		__ceph_unreserve_caps(mdsc, have + alloc);
298e5bc08d0SChengguang Xu 
29937151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
300a8599bd8SSage Weil 
301a8599bd8SSage Weil 	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
30237151668SYehuda Sadeh 	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
30337151668SYehuda Sadeh 	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
304e5bc08d0SChengguang Xu 	return err;
305a8599bd8SSage Weil }
306a8599bd8SSage Weil 
ceph_unreserve_caps(struct ceph_mds_client * mdsc,struct ceph_cap_reservation * ctx)3077bf8f736SChengguang Xu void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
30837151668SYehuda Sadeh 			 struct ceph_cap_reservation *ctx)
309a8599bd8SSage Weil {
310fe33032dSYan, Zheng 	bool reclaim = false;
311fe33032dSYan, Zheng 	if (!ctx->count)
312fe33032dSYan, Zheng 		return;
313fe33032dSYan, Zheng 
314a8599bd8SSage Weil 	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
31537151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
3167bf8f736SChengguang Xu 	__ceph_unreserve_caps(mdsc, ctx->count);
317a8599bd8SSage Weil 	ctx->count = 0;
318fe33032dSYan, Zheng 
319fe33032dSYan, Zheng 	if (mdsc->caps_use_max > 0 &&
320fe33032dSYan, Zheng 	    mdsc->caps_use_count > mdsc->caps_use_max)
321fe33032dSYan, Zheng 		reclaim = true;
32237151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
323fe33032dSYan, Zheng 
324fe33032dSYan, Zheng 	if (reclaim)
325fe33032dSYan, Zheng 		ceph_reclaim_caps_nr(mdsc, ctx->used);
326a8599bd8SSage Weil }
327a8599bd8SSage Weil 
ceph_get_cap(struct ceph_mds_client * mdsc,struct ceph_cap_reservation * ctx)328d9df2783SYan, Zheng struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
32937151668SYehuda Sadeh 			      struct ceph_cap_reservation *ctx)
330a8599bd8SSage Weil {
331a8599bd8SSage Weil 	struct ceph_cap *cap = NULL;
332a8599bd8SSage Weil 
333a8599bd8SSage Weil 	/* temporary, until we do something about cap import/export */
334443b3760SSage Weil 	if (!ctx) {
335443b3760SSage Weil 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
336443b3760SSage Weil 		if (cap) {
3374d1d0534SYan, Zheng 			spin_lock(&mdsc->caps_list_lock);
33837151668SYehuda Sadeh 			mdsc->caps_use_count++;
33937151668SYehuda Sadeh 			mdsc->caps_total_count++;
3404d1d0534SYan, Zheng 			spin_unlock(&mdsc->caps_list_lock);
341e327ce06SChengguang Xu 		} else {
342e327ce06SChengguang Xu 			spin_lock(&mdsc->caps_list_lock);
343e327ce06SChengguang Xu 			if (mdsc->caps_avail_count) {
344e327ce06SChengguang Xu 				BUG_ON(list_empty(&mdsc->caps_list));
345e327ce06SChengguang Xu 
346e327ce06SChengguang Xu 				mdsc->caps_avail_count--;
347e327ce06SChengguang Xu 				mdsc->caps_use_count++;
348e327ce06SChengguang Xu 				cap = list_first_entry(&mdsc->caps_list,
349e327ce06SChengguang Xu 						struct ceph_cap, caps_item);
350e327ce06SChengguang Xu 				list_del(&cap->caps_item);
351e327ce06SChengguang Xu 
352e327ce06SChengguang Xu 				BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
353e327ce06SChengguang Xu 				       mdsc->caps_reserve_count + mdsc->caps_avail_count);
354443b3760SSage Weil 			}
355e327ce06SChengguang Xu 			spin_unlock(&mdsc->caps_list_lock);
356e327ce06SChengguang Xu 		}
357e327ce06SChengguang Xu 
358443b3760SSage Weil 		return cap;
359443b3760SSage Weil 	}
360a8599bd8SSage Weil 
36137151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
362a8599bd8SSage Weil 	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
36337151668SYehuda Sadeh 	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
36437151668SYehuda Sadeh 	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
365a8599bd8SSage Weil 	BUG_ON(!ctx->count);
36637151668SYehuda Sadeh 	BUG_ON(ctx->count > mdsc->caps_reserve_count);
36737151668SYehuda Sadeh 	BUG_ON(list_empty(&mdsc->caps_list));
368a8599bd8SSage Weil 
369a8599bd8SSage Weil 	ctx->count--;
370fe33032dSYan, Zheng 	ctx->used++;
37137151668SYehuda Sadeh 	mdsc->caps_reserve_count--;
37237151668SYehuda Sadeh 	mdsc->caps_use_count++;
373a8599bd8SSage Weil 
37437151668SYehuda Sadeh 	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
375a8599bd8SSage Weil 	list_del(&cap->caps_item);
376a8599bd8SSage Weil 
37737151668SYehuda Sadeh 	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
37837151668SYehuda Sadeh 	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
37937151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
380a8599bd8SSage Weil 	return cap;
381a8599bd8SSage Weil }
382a8599bd8SSage Weil 
ceph_put_cap(struct ceph_mds_client * mdsc,struct ceph_cap * cap)38337151668SYehuda Sadeh void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
384a8599bd8SSage Weil {
38537151668SYehuda Sadeh 	spin_lock(&mdsc->caps_list_lock);
3867c1332b8SSage Weil 	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
38737151668SYehuda Sadeh 	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
38837151668SYehuda Sadeh 	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
38937151668SYehuda Sadeh 	mdsc->caps_use_count--;
390a8599bd8SSage Weil 	/*
39185ccce43SSage Weil 	 * Keep some preallocated caps around (ceph_min_count), to
39285ccce43SSage Weil 	 * avoid lots of free/alloc churn.
393a8599bd8SSage Weil 	 */
39437151668SYehuda Sadeh 	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
39537151668SYehuda Sadeh 				      mdsc->caps_min_count) {
39637151668SYehuda Sadeh 		mdsc->caps_total_count--;
397a8599bd8SSage Weil 		kmem_cache_free(ceph_cap_cachep, cap);
398a8599bd8SSage Weil 	} else {
39937151668SYehuda Sadeh 		mdsc->caps_avail_count++;
40037151668SYehuda Sadeh 		list_add(&cap->caps_item, &mdsc->caps_list);
401a8599bd8SSage Weil 	}
402a8599bd8SSage Weil 
40337151668SYehuda Sadeh 	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
40437151668SYehuda Sadeh 	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
40537151668SYehuda Sadeh 	spin_unlock(&mdsc->caps_list_lock);
406a8599bd8SSage Weil }
407a8599bd8SSage Weil 
ceph_reservation_status(struct ceph_fs_client * fsc,int * total,int * avail,int * used,int * reserved,int * min)4083d14c5d2SYehuda Sadeh void ceph_reservation_status(struct ceph_fs_client *fsc,
40985ccce43SSage Weil 			     int *total, int *avail, int *used, int *reserved,
41085ccce43SSage Weil 			     int *min)
411a8599bd8SSage Weil {
4123d14c5d2SYehuda Sadeh 	struct ceph_mds_client *mdsc = fsc->mdsc;
41337151668SYehuda Sadeh 
414b884014aSChengguang Xu 	spin_lock(&mdsc->caps_list_lock);
415b884014aSChengguang Xu 
416a8599bd8SSage Weil 	if (total)
41737151668SYehuda Sadeh 		*total = mdsc->caps_total_count;
418a8599bd8SSage Weil 	if (avail)
41937151668SYehuda Sadeh 		*avail = mdsc->caps_avail_count;
420a8599bd8SSage Weil 	if (used)
42137151668SYehuda Sadeh 		*used = mdsc->caps_use_count;
422a8599bd8SSage Weil 	if (reserved)
42337151668SYehuda Sadeh 		*reserved = mdsc->caps_reserve_count;
42485ccce43SSage Weil 	if (min)
42537151668SYehuda Sadeh 		*min = mdsc->caps_min_count;
426b884014aSChengguang Xu 
427b884014aSChengguang Xu 	spin_unlock(&mdsc->caps_list_lock);
428a8599bd8SSage Weil }
429a8599bd8SSage Weil 
430a8599bd8SSage Weil /*
431a8599bd8SSage Weil  * Find ceph_cap for given mds, if any.
432a8599bd8SSage Weil  *
433be655596SSage Weil  * Called with i_ceph_lock held.
434a8599bd8SSage Weil  */
__get_cap_for_mds(struct ceph_inode_info * ci,int mds)435aaf67de7SXiubo Li struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
436a8599bd8SSage Weil {
437a8599bd8SSage Weil 	struct ceph_cap *cap;
438a8599bd8SSage Weil 	struct rb_node *n = ci->i_caps.rb_node;
439a8599bd8SSage Weil 
440a8599bd8SSage Weil 	while (n) {
441a8599bd8SSage Weil 		cap = rb_entry(n, struct ceph_cap, ci_node);
442a8599bd8SSage Weil 		if (mds < cap->mds)
443a8599bd8SSage Weil 			n = n->rb_left;
444a8599bd8SSage Weil 		else if (mds > cap->mds)
445a8599bd8SSage Weil 			n = n->rb_right;
446a8599bd8SSage Weil 		else
447a8599bd8SSage Weil 			return cap;
448a8599bd8SSage Weil 	}
449a8599bd8SSage Weil 	return NULL;
450a8599bd8SSage Weil }
451a8599bd8SSage Weil 
ceph_get_cap_for_mds(struct ceph_inode_info * ci,int mds)4522bc50259SGreg Farnum struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
4532bc50259SGreg Farnum {
4542bc50259SGreg Farnum 	struct ceph_cap *cap;
4552bc50259SGreg Farnum 
456be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
4572bc50259SGreg Farnum 	cap = __get_cap_for_mds(ci, mds);
458be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
4592bc50259SGreg Farnum 	return cap;
4602bc50259SGreg Farnum }
4612bc50259SGreg Farnum 
462a8599bd8SSage Weil /*
463be655596SSage Weil  * Called under i_ceph_lock.
464a8599bd8SSage Weil  */
__insert_cap_node(struct ceph_inode_info * ci,struct ceph_cap * new)465a8599bd8SSage Weil static void __insert_cap_node(struct ceph_inode_info *ci,
466a8599bd8SSage Weil 			      struct ceph_cap *new)
467a8599bd8SSage Weil {
468a8599bd8SSage Weil 	struct rb_node **p = &ci->i_caps.rb_node;
469a8599bd8SSage Weil 	struct rb_node *parent = NULL;
470a8599bd8SSage Weil 	struct ceph_cap *cap = NULL;
471a8599bd8SSage Weil 
472a8599bd8SSage Weil 	while (*p) {
473a8599bd8SSage Weil 		parent = *p;
474a8599bd8SSage Weil 		cap = rb_entry(parent, struct ceph_cap, ci_node);
475a8599bd8SSage Weil 		if (new->mds < cap->mds)
476a8599bd8SSage Weil 			p = &(*p)->rb_left;
477a8599bd8SSage Weil 		else if (new->mds > cap->mds)
478a8599bd8SSage Weil 			p = &(*p)->rb_right;
479a8599bd8SSage Weil 		else
480a8599bd8SSage Weil 			BUG();
481a8599bd8SSage Weil 	}
482a8599bd8SSage Weil 
483a8599bd8SSage Weil 	rb_link_node(&new->ci_node, parent, p);
484a8599bd8SSage Weil 	rb_insert_color(&new->ci_node, &ci->i_caps);
485a8599bd8SSage Weil }
486a8599bd8SSage Weil 
487a8599bd8SSage Weil /*
488a8599bd8SSage Weil  * (re)set cap hold timeouts, which control the delayed release
489a8599bd8SSage Weil  * of unused caps back to the MDS.  Should be called on cap use.
490a8599bd8SSage Weil  */
__cap_set_timeouts(struct ceph_mds_client * mdsc,struct ceph_inode_info * ci)491a8599bd8SSage Weil static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
492a8599bd8SSage Weil 			       struct ceph_inode_info *ci)
493a8599bd8SSage Weil {
494fe33032dSYan, Zheng 	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
495a8599bd8SSage Weil 	ci->i_hold_caps_max = round_jiffies(jiffies +
496fe33032dSYan, Zheng 					    opt->caps_wanted_delay_max * HZ);
497874c8ca1SDavid Howells 	dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
498a0d93e32SYan, Zheng 	     ci->i_hold_caps_max - jiffies);
499a8599bd8SSage Weil }
500a8599bd8SSage Weil 
501a8599bd8SSage Weil /*
502a8599bd8SSage Weil  * (Re)queue cap at the end of the delayed cap release list.
503a8599bd8SSage Weil  *
504a8599bd8SSage Weil  * If I_FLUSH is set, leave the inode at the front of the list.
505a8599bd8SSage Weil  *
506be655596SSage Weil  * Caller holds i_ceph_lock
507a8599bd8SSage Weil  *    -> we take mdsc->cap_delay_lock
508a8599bd8SSage Weil  */
__cap_delay_requeue(struct ceph_mds_client * mdsc,struct ceph_inode_info * ci)509a8599bd8SSage Weil static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
510a0d93e32SYan, Zheng 				struct ceph_inode_info *ci)
511a8599bd8SSage Weil {
512874c8ca1SDavid Howells 	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
513a8599bd8SSage Weil 	     ci->i_ceph_flags, ci->i_hold_caps_max);
514a8599bd8SSage Weil 	if (!mdsc->stopping) {
515a8599bd8SSage Weil 		spin_lock(&mdsc->cap_delay_lock);
516a8599bd8SSage Weil 		if (!list_empty(&ci->i_cap_delay_list)) {
517a8599bd8SSage Weil 			if (ci->i_ceph_flags & CEPH_I_FLUSH)
518a8599bd8SSage Weil 				goto no_change;
519a8599bd8SSage Weil 			list_del_init(&ci->i_cap_delay_list);
520a8599bd8SSage Weil 		}
5213167893aSChengguang Xu 		__cap_set_timeouts(mdsc, ci);
522a8599bd8SSage Weil 		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
523a8599bd8SSage Weil no_change:
524a8599bd8SSage Weil 		spin_unlock(&mdsc->cap_delay_lock);
525a8599bd8SSage Weil 	}
526a8599bd8SSage Weil }
527a8599bd8SSage Weil 
528a8599bd8SSage Weil /*
529a8599bd8SSage Weil  * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
530a8599bd8SSage Weil  * indicating we should send a cap message to flush dirty metadata
531a8599bd8SSage Weil  * asap, and move to the front of the delayed cap list.
532a8599bd8SSage Weil  */
__cap_delay_requeue_front(struct ceph_mds_client * mdsc,struct ceph_inode_info * ci)533a8599bd8SSage Weil static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
534a8599bd8SSage Weil 				      struct ceph_inode_info *ci)
535a8599bd8SSage Weil {
536874c8ca1SDavid Howells 	dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
537a8599bd8SSage Weil 	spin_lock(&mdsc->cap_delay_lock);
538a8599bd8SSage Weil 	ci->i_ceph_flags |= CEPH_I_FLUSH;
539a8599bd8SSage Weil 	if (!list_empty(&ci->i_cap_delay_list))
540a8599bd8SSage Weil 		list_del_init(&ci->i_cap_delay_list);
541a8599bd8SSage Weil 	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
542a8599bd8SSage Weil 	spin_unlock(&mdsc->cap_delay_lock);
543a8599bd8SSage Weil }
544a8599bd8SSage Weil 
545a8599bd8SSage Weil /*
546a8599bd8SSage Weil  * Cancel delayed work on cap.
547a8599bd8SSage Weil  *
548be655596SSage Weil  * Caller must hold i_ceph_lock.
549a8599bd8SSage Weil  */
__cap_delay_cancel(struct ceph_mds_client * mdsc,struct ceph_inode_info * ci)550a8599bd8SSage Weil static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
551a8599bd8SSage Weil 			       struct ceph_inode_info *ci)
552a8599bd8SSage Weil {
553874c8ca1SDavid Howells 	dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
554a8599bd8SSage Weil 	if (list_empty(&ci->i_cap_delay_list))
555a8599bd8SSage Weil 		return;
556a8599bd8SSage Weil 	spin_lock(&mdsc->cap_delay_lock);
557a8599bd8SSage Weil 	list_del_init(&ci->i_cap_delay_list);
558a8599bd8SSage Weil 	spin_unlock(&mdsc->cap_delay_lock);
559a8599bd8SSage Weil }
560a8599bd8SSage Weil 
561785892feSJeff Layton /* Common issue checks for add_cap, handle_cap_grant. */
__check_cap_issue(struct ceph_inode_info * ci,struct ceph_cap * cap,unsigned issued)562a8599bd8SSage Weil static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
563a8599bd8SSage Weil 			      unsigned issued)
564a8599bd8SSage Weil {
565a8599bd8SSage Weil 	unsigned had = __ceph_caps_issued(ci, NULL);
566a8599bd8SSage Weil 
567785892feSJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
568785892feSJeff Layton 
569a8599bd8SSage Weil 	/*
570a8599bd8SSage Weil 	 * Each time we receive FILE_CACHE anew, we increment
571a8599bd8SSage Weil 	 * i_rdcache_gen.
572a8599bd8SSage Weil 	 */
573874c8ca1SDavid Howells 	if (S_ISREG(ci->netfs.inode.i_mode) &&
574525d15e8SYan, Zheng 	    (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
57599ccbd22SMilosz Tanski 	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
576a8599bd8SSage Weil 		ci->i_rdcache_gen++;
57799ccbd22SMilosz Tanski 	}
578a8599bd8SSage Weil 
579a8599bd8SSage Weil 	/*
58015b51bd6SYan, Zheng 	 * If FILE_SHARED is newly issued, mark dir not complete. We don't
58115b51bd6SYan, Zheng 	 * know what happened to this directory while we didn't have the cap.
58215b51bd6SYan, Zheng 	 * If FILE_SHARED is being revoked, also mark dir not complete. It
58315b51bd6SYan, Zheng 	 * stops on-going cached readdir.
584a8599bd8SSage Weil 	 */
58515b51bd6SYan, Zheng 	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
58615b51bd6SYan, Zheng 		if (issued & CEPH_CAP_FILE_SHARED)
58797aeb6bfSYan, Zheng 			atomic_inc(&ci->i_shared_gen);
588874c8ca1SDavid Howells 		if (S_ISDIR(ci->netfs.inode.i_mode)) {
589874c8ca1SDavid Howells 			dout(" marking %p NOT complete\n", &ci->netfs.inode);
5902f276c51SYan, Zheng 			__ceph_dir_clear_complete(ci);
591a8673d61SYan, Zheng 		}
592a8599bd8SSage Weil 	}
593785892feSJeff Layton 
594785892feSJeff Layton 	/* Wipe saved layout if we're losing DIR_CREATE caps */
595874c8ca1SDavid Howells 	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
596785892feSJeff Layton 		!(issued & CEPH_CAP_DIR_CREATE)) {
597785892feSJeff Layton 	     ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
598785892feSJeff Layton 	     memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
599785892feSJeff Layton 	}
600a8599bd8SSage Weil }
601a8599bd8SSage Weil 
6021cf03a68SJeff Layton /**
6031cf03a68SJeff Layton  * change_auth_cap_ses - move inode to appropriate lists when auth caps change
6041cf03a68SJeff Layton  * @ci: inode to be moved
6051cf03a68SJeff Layton  * @session: new auth caps session
6061cf03a68SJeff Layton  */
change_auth_cap_ses(struct ceph_inode_info * ci,struct ceph_mds_session * session)607e19feff9SXiubo Li void change_auth_cap_ses(struct ceph_inode_info *ci,
6081cf03a68SJeff Layton 			 struct ceph_mds_session *session)
6091cf03a68SJeff Layton {
6101cf03a68SJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
6111cf03a68SJeff Layton 
6121cf03a68SJeff Layton 	if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
6131cf03a68SJeff Layton 		return;
6141cf03a68SJeff Layton 
6151cf03a68SJeff Layton 	spin_lock(&session->s_mdsc->cap_dirty_lock);
6161cf03a68SJeff Layton 	if (!list_empty(&ci->i_dirty_item))
6171cf03a68SJeff Layton 		list_move(&ci->i_dirty_item, &session->s_cap_dirty);
6181cf03a68SJeff Layton 	if (!list_empty(&ci->i_flushing_item))
6191cf03a68SJeff Layton 		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
6201cf03a68SJeff Layton 	spin_unlock(&session->s_mdsc->cap_dirty_lock);
6211cf03a68SJeff Layton }
6221cf03a68SJeff Layton 
623a8599bd8SSage Weil /*
624a8599bd8SSage Weil  * Add a capability under the given MDS session.
625a8599bd8SSage Weil  *
626354c63a0SJeff Layton  * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
627a8599bd8SSage Weil  *
628a8599bd8SSage Weil  * @fmode is the open file mode, if we are opening a file, otherwise
629a8599bd8SSage Weil  * it is < 0.  (This is so we can atomically add the cap and add an
630a8599bd8SSage Weil  * open file reference to it.)
631a8599bd8SSage Weil  */
ceph_add_cap(struct inode * inode,struct ceph_mds_session * session,u64 cap_id,unsigned issued,unsigned wanted,unsigned seq,unsigned mseq,u64 realmino,int flags,struct ceph_cap ** new_cap)632d9df2783SYan, Zheng void ceph_add_cap(struct inode *inode,
633a8599bd8SSage Weil 		  struct ceph_mds_session *session, u64 cap_id,
634135e671eSYan, Zheng 		  unsigned issued, unsigned wanted,
635a8599bd8SSage Weil 		  unsigned seq, unsigned mseq, u64 realmino, int flags,
636d9df2783SYan, Zheng 		  struct ceph_cap **new_cap)
637a8599bd8SSage Weil {
638985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
639a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
640a8599bd8SSage Weil 	struct ceph_cap *cap;
641a8599bd8SSage Weil 	int mds = session->s_mds;
642a8599bd8SSage Weil 	int actual_wanted;
643606d1023SJeff Layton 	u32 gen;
644a8599bd8SSage Weil 
645354c63a0SJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
646354c63a0SJeff Layton 
647a8599bd8SSage Weil 	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
648a8599bd8SSage Weil 	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
649a8599bd8SSage Weil 
65052d60f8eSJeff Layton 	gen = atomic_read(&session->s_cap_gen);
651606d1023SJeff Layton 
652a8599bd8SSage Weil 	cap = __get_cap_for_mds(ci, mds);
653a8599bd8SSage Weil 	if (!cap) {
654d9df2783SYan, Zheng 		cap = *new_cap;
655d9df2783SYan, Zheng 		*new_cap = NULL;
656a8599bd8SSage Weil 
657a8599bd8SSage Weil 		cap->issued = 0;
658a8599bd8SSage Weil 		cap->implemented = 0;
659a8599bd8SSage Weil 		cap->mds = mds;
660a8599bd8SSage Weil 		cap->mds_wanted = 0;
661964266ccSYan, Zheng 		cap->mseq = 0;
662a8599bd8SSage Weil 
663a8599bd8SSage Weil 		cap->ci = ci;
664a8599bd8SSage Weil 		__insert_cap_node(ci, cap);
665a8599bd8SSage Weil 
666a8599bd8SSage Weil 		/* add to session cap list */
667a8599bd8SSage Weil 		cap->session = session;
668a8599bd8SSage Weil 		spin_lock(&session->s_cap_lock);
669a8599bd8SSage Weil 		list_add_tail(&cap->session_caps, &session->s_caps);
670a8599bd8SSage Weil 		session->s_nr_caps++;
6714f1d756dSXiubo Li 		atomic64_inc(&mdsc->metric.total_caps);
672a8599bd8SSage Weil 		spin_unlock(&session->s_cap_lock);
67311df2dfbSYan, Zheng 	} else {
67432f6511aSYan, Zheng 		spin_lock(&session->s_cap_lock);
67532f6511aSYan, Zheng 		list_move_tail(&cap->session_caps, &session->s_caps);
67632f6511aSYan, Zheng 		spin_unlock(&session->s_cap_lock);
67732f6511aSYan, Zheng 
678606d1023SJeff Layton 		if (cap->cap_gen < gen)
679d2f8bb27SYan, Zheng 			cap->issued = cap->implemented = CEPH_CAP_PIN;
680d2f8bb27SYan, Zheng 
68111df2dfbSYan, Zheng 		/*
68211df2dfbSYan, Zheng 		 * auth mds of the inode changed. we received the cap export
68311df2dfbSYan, Zheng 		 * message, but still haven't received the cap import message.
68411df2dfbSYan, Zheng 		 * handle_cap_export() updated the new auth MDS' cap.
68511df2dfbSYan, Zheng 		 *
68611df2dfbSYan, Zheng 		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
68711df2dfbSYan, Zheng 		 * a message that was send before the cap import message. So
68811df2dfbSYan, Zheng 		 * don't remove caps.
68911df2dfbSYan, Zheng 		 */
69011df2dfbSYan, Zheng 		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
69111df2dfbSYan, Zheng 			WARN_ON(cap != ci->i_auth_cap);
69211df2dfbSYan, Zheng 			WARN_ON(cap->cap_id != cap_id);
69311df2dfbSYan, Zheng 			seq = cap->seq;
69411df2dfbSYan, Zheng 			mseq = cap->mseq;
69511df2dfbSYan, Zheng 			issued |= cap->issued;
69611df2dfbSYan, Zheng 			flags |= CEPH_CAP_FLAG_AUTH;
69711df2dfbSYan, Zheng 		}
69811df2dfbSYan, Zheng 	}
69911df2dfbSYan, Zheng 
7007d9c9193SYan, Zheng 	if (!ci->i_snap_realm ||
7017d9c9193SYan, Zheng 	    ((flags & CEPH_CAP_FLAG_AUTH) &&
7027d9c9193SYan, Zheng 	     realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
703a8599bd8SSage Weil 		/*
704a8599bd8SSage Weil 		 * add this inode to the appropriate snap realm
705a8599bd8SSage Weil 		 */
706a8599bd8SSage Weil 		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
707a8599bd8SSage Weil 							       realmino);
708692e1715SJeff Layton 		if (realm)
7090ba92e1cSJeff Layton 			ceph_change_snap_realm(inode, realm);
710692e1715SJeff Layton 		else
711692e1715SJeff Layton 			WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
712692e1715SJeff Layton 			     __func__, realmino, ci->i_vino.ino,
713692e1715SJeff Layton 			     ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
714a8599bd8SSage Weil 	}
715a8599bd8SSage Weil 
716a8599bd8SSage Weil 	__check_cap_issue(ci, cap, issued);
717a8599bd8SSage Weil 
718a8599bd8SSage Weil 	/*
719a8599bd8SSage Weil 	 * If we are issued caps we don't want, or the mds' wanted
720a8599bd8SSage Weil 	 * value appears to be off, queue a check so we'll release
721a8599bd8SSage Weil 	 * later and/or update the mds wanted value.
722a8599bd8SSage Weil 	 */
723a8599bd8SSage Weil 	actual_wanted = __ceph_caps_wanted(ci);
724a8599bd8SSage Weil 	if ((wanted & ~actual_wanted) ||
725a8599bd8SSage Weil 	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
726a8599bd8SSage Weil 		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
727a8599bd8SSage Weil 		     ceph_cap_string(issued), ceph_cap_string(wanted),
728a8599bd8SSage Weil 		     ceph_cap_string(actual_wanted));
729a0d93e32SYan, Zheng 		__cap_delay_requeue(mdsc, ci);
730a8599bd8SSage Weil 	}
731a8599bd8SSage Weil 
732b8c2f3aeSYan, Zheng 	if (flags & CEPH_CAP_FLAG_AUTH) {
733d37b1d99SMarkus Elfring 		if (!ci->i_auth_cap ||
734d9ffc4f7SYan, Zheng 		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
7351cf03a68SJeff Layton 			if (ci->i_auth_cap &&
7361cf03a68SJeff Layton 			    ci->i_auth_cap->session != cap->session)
7371cf03a68SJeff Layton 				change_auth_cap_ses(ci, cap->session);
738a8599bd8SSage Weil 			ci->i_auth_cap = cap;
739d9ffc4f7SYan, Zheng 			cap->mds_wanted = wanted;
740d9ffc4f7SYan, Zheng 		}
74111df2dfbSYan, Zheng 	} else {
74211df2dfbSYan, Zheng 		WARN_ON(ci->i_auth_cap == cap);
7438a92a119SYan, Zheng 	}
744a8599bd8SSage Weil 
745a8599bd8SSage Weil 	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
746a8599bd8SSage Weil 	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
747a8599bd8SSage Weil 	     ceph_cap_string(issued|cap->issued), seq, mds);
748a8599bd8SSage Weil 	cap->cap_id = cap_id;
749a8599bd8SSage Weil 	cap->issued = issued;
750a8599bd8SSage Weil 	cap->implemented |= issued;
751d1b87809SYan, Zheng 	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
752964266ccSYan, Zheng 		cap->mds_wanted = wanted;
753964266ccSYan, Zheng 	else
754a8599bd8SSage Weil 		cap->mds_wanted |= wanted;
755a8599bd8SSage Weil 	cap->seq = seq;
756a8599bd8SSage Weil 	cap->issue_seq = seq;
757a8599bd8SSage Weil 	cap->mseq = mseq;
758606d1023SJeff Layton 	cap->cap_gen = gen;
759f7913573SXiubo Li 	wake_up_all(&ci->i_cap_wq);
760a8599bd8SSage Weil }
761a8599bd8SSage Weil 
762a8599bd8SSage Weil /*
763a8599bd8SSage Weil  * Return true if cap has not timed out and belongs to the current
764a8599bd8SSage Weil  * generation of the MDS session (i.e. has not gone 'stale' due to
765a8599bd8SSage Weil  * us losing touch with the mds).
766a8599bd8SSage Weil  */
__cap_is_valid(struct ceph_cap * cap)767a8599bd8SSage Weil static int __cap_is_valid(struct ceph_cap *cap)
768a8599bd8SSage Weil {
769a8599bd8SSage Weil 	unsigned long ttl;
770cdac8303SSage Weil 	u32 gen;
771a8599bd8SSage Weil 
77252d60f8eSJeff Layton 	gen = atomic_read(&cap->session->s_cap_gen);
773a8599bd8SSage Weil 	ttl = cap->session->s_cap_ttl;
774a8599bd8SSage Weil 
775685f9a5dSSage Weil 	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
776a8599bd8SSage Weil 		dout("__cap_is_valid %p cap %p issued %s "
777874c8ca1SDavid Howells 		     "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
778685f9a5dSSage Weil 		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
779a8599bd8SSage Weil 		return 0;
780a8599bd8SSage Weil 	}
781a8599bd8SSage Weil 
782a8599bd8SSage Weil 	return 1;
783a8599bd8SSage Weil }
784a8599bd8SSage Weil 
785a8599bd8SSage Weil /*
786a8599bd8SSage Weil  * Return set of valid cap bits issued to us.  Note that caps time
787a8599bd8SSage Weil  * out, and may be invalidated in bulk if the client session times out
788a8599bd8SSage Weil  * and session->s_cap_gen is bumped.
789a8599bd8SSage Weil  */
__ceph_caps_issued(struct ceph_inode_info * ci,int * implemented)790a8599bd8SSage Weil int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
791a8599bd8SSage Weil {
792d9df2783SYan, Zheng 	int have = ci->i_snap_caps;
793a8599bd8SSage Weil 	struct ceph_cap *cap;
794a8599bd8SSage Weil 	struct rb_node *p;
795a8599bd8SSage Weil 
796a8599bd8SSage Weil 	if (implemented)
797a8599bd8SSage Weil 		*implemented = 0;
798a8599bd8SSage Weil 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
799a8599bd8SSage Weil 		cap = rb_entry(p, struct ceph_cap, ci_node);
800a8599bd8SSage Weil 		if (!__cap_is_valid(cap))
801a8599bd8SSage Weil 			continue;
802a8599bd8SSage Weil 		dout("__ceph_caps_issued %p cap %p issued %s\n",
803874c8ca1SDavid Howells 		     &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
804a8599bd8SSage Weil 		have |= cap->issued;
805a8599bd8SSage Weil 		if (implemented)
806a8599bd8SSage Weil 			*implemented |= cap->implemented;
807a8599bd8SSage Weil 	}
808b1530f57SYan, Zheng 	/*
809b1530f57SYan, Zheng 	 * exclude caps issued by non-auth MDS, but are been revoking
810b1530f57SYan, Zheng 	 * by the auth MDS. The non-auth MDS should be revoking/exporting
811b1530f57SYan, Zheng 	 * these caps, but the message is delayed.
812b1530f57SYan, Zheng 	 */
813b1530f57SYan, Zheng 	if (ci->i_auth_cap) {
814b1530f57SYan, Zheng 		cap = ci->i_auth_cap;
815b1530f57SYan, Zheng 		have &= ~cap->implemented | cap->issued;
816b1530f57SYan, Zheng 	}
817a8599bd8SSage Weil 	return have;
818a8599bd8SSage Weil }
819a8599bd8SSage Weil 
820a8599bd8SSage Weil /*
821a8599bd8SSage Weil  * Get cap bits issued by caps other than @ocap
822a8599bd8SSage Weil  */
__ceph_caps_issued_other(struct ceph_inode_info * ci,struct ceph_cap * ocap)823a8599bd8SSage Weil int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
824a8599bd8SSage Weil {
825a8599bd8SSage Weil 	int have = ci->i_snap_caps;
826a8599bd8SSage Weil 	struct ceph_cap *cap;
827a8599bd8SSage Weil 	struct rb_node *p;
828a8599bd8SSage Weil 
829a8599bd8SSage Weil 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
830a8599bd8SSage Weil 		cap = rb_entry(p, struct ceph_cap, ci_node);
831a8599bd8SSage Weil 		if (cap == ocap)
832a8599bd8SSage Weil 			continue;
833a8599bd8SSage Weil 		if (!__cap_is_valid(cap))
834a8599bd8SSage Weil 			continue;
835a8599bd8SSage Weil 		have |= cap->issued;
836a8599bd8SSage Weil 	}
837a8599bd8SSage Weil 	return have;
838a8599bd8SSage Weil }
839a8599bd8SSage Weil 
840a8599bd8SSage Weil /*
841a8599bd8SSage Weil  * Move a cap to the end of the LRU (oldest caps at list head, newest
842a8599bd8SSage Weil  * at list tail).
843a8599bd8SSage Weil  */
__touch_cap(struct ceph_cap * cap)844a8599bd8SSage Weil static void __touch_cap(struct ceph_cap *cap)
845a8599bd8SSage Weil {
846a8599bd8SSage Weil 	struct ceph_mds_session *s = cap->session;
847a8599bd8SSage Weil 
8485dacf091SSage Weil 	spin_lock(&s->s_cap_lock);
849d37b1d99SMarkus Elfring 	if (!s->s_cap_iterator) {
850874c8ca1SDavid Howells 		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
851a8599bd8SSage Weil 		     s->s_mds);
852a8599bd8SSage Weil 		list_move_tail(&cap->session_caps, &s->s_caps);
8535dacf091SSage Weil 	} else {
8545dacf091SSage Weil 		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
855874c8ca1SDavid Howells 		     &cap->ci->netfs.inode, cap, s->s_mds);
8565dacf091SSage Weil 	}
857a8599bd8SSage Weil 	spin_unlock(&s->s_cap_lock);
858a8599bd8SSage Weil }
859a8599bd8SSage Weil 
860a8599bd8SSage Weil /*
861a8599bd8SSage Weil  * Check if we hold the given mask.  If so, move the cap(s) to the
862a8599bd8SSage Weil  * front of their respective LRUs.  (This is the preferred way for
863a8599bd8SSage Weil  * callers to check for caps they want.)
864a8599bd8SSage Weil  */
__ceph_caps_issued_mask(struct ceph_inode_info * ci,int mask,int touch)865a8599bd8SSage Weil int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
866a8599bd8SSage Weil {
867a8599bd8SSage Weil 	struct ceph_cap *cap;
868a8599bd8SSage Weil 	struct rb_node *p;
869a8599bd8SSage Weil 	int have = ci->i_snap_caps;
870a8599bd8SSage Weil 
871a8599bd8SSage Weil 	if ((have & mask) == mask) {
872ebce3eb2SJeff Layton 		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
873874c8ca1SDavid Howells 		     " (mask %s)\n", ceph_ino(&ci->netfs.inode),
874a8599bd8SSage Weil 		     ceph_cap_string(have),
875a8599bd8SSage Weil 		     ceph_cap_string(mask));
876a8599bd8SSage Weil 		return 1;
877a8599bd8SSage Weil 	}
878a8599bd8SSage Weil 
879a8599bd8SSage Weil 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
880a8599bd8SSage Weil 		cap = rb_entry(p, struct ceph_cap, ci_node);
881a8599bd8SSage Weil 		if (!__cap_is_valid(cap))
882a8599bd8SSage Weil 			continue;
883a8599bd8SSage Weil 		if ((cap->issued & mask) == mask) {
884ebce3eb2SJeff Layton 			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
885874c8ca1SDavid Howells 			     " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
886a8599bd8SSage Weil 			     ceph_cap_string(cap->issued),
887a8599bd8SSage Weil 			     ceph_cap_string(mask));
888a8599bd8SSage Weil 			if (touch)
889a8599bd8SSage Weil 				__touch_cap(cap);
890a8599bd8SSage Weil 			return 1;
891a8599bd8SSage Weil 		}
892a8599bd8SSage Weil 
893a8599bd8SSage Weil 		/* does a combination of caps satisfy mask? */
894a8599bd8SSage Weil 		have |= cap->issued;
895a8599bd8SSage Weil 		if ((have & mask) == mask) {
896ebce3eb2SJeff Layton 			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
897874c8ca1SDavid Howells 			     " (mask %s)\n", ceph_ino(&ci->netfs.inode),
898a8599bd8SSage Weil 			     ceph_cap_string(cap->issued),
899a8599bd8SSage Weil 			     ceph_cap_string(mask));
900a8599bd8SSage Weil 			if (touch) {
901a8599bd8SSage Weil 				struct rb_node *q;
902a8599bd8SSage Weil 
90325985edcSLucas De Marchi 				/* touch this + preceding caps */
904a8599bd8SSage Weil 				__touch_cap(cap);
905a8599bd8SSage Weil 				for (q = rb_first(&ci->i_caps); q != p;
906a8599bd8SSage Weil 				     q = rb_next(q)) {
907a8599bd8SSage Weil 					cap = rb_entry(q, struct ceph_cap,
908a8599bd8SSage Weil 						       ci_node);
909a8599bd8SSage Weil 					if (!__cap_is_valid(cap))
910a8599bd8SSage Weil 						continue;
9119f8b72b3SXiubo Li 					if (cap->issued & mask)
912a8599bd8SSage Weil 						__touch_cap(cap);
913a8599bd8SSage Weil 				}
914a8599bd8SSage Weil 			}
915a8599bd8SSage Weil 			return 1;
916a8599bd8SSage Weil 		}
917a8599bd8SSage Weil 	}
918a8599bd8SSage Weil 
919a8599bd8SSage Weil 	return 0;
920a8599bd8SSage Weil }
921a8599bd8SSage Weil 
__ceph_caps_issued_mask_metric(struct ceph_inode_info * ci,int mask,int touch)9221af16d54SXiubo Li int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
9231af16d54SXiubo Li 				   int touch)
9241af16d54SXiubo Li {
925985b9ee8SXiubo Li 	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
9261af16d54SXiubo Li 	int r;
9271af16d54SXiubo Li 
9281af16d54SXiubo Li 	r = __ceph_caps_issued_mask(ci, mask, touch);
9291af16d54SXiubo Li 	if (r)
9301af16d54SXiubo Li 		ceph_update_cap_hit(&fsc->mdsc->metric);
9311af16d54SXiubo Li 	else
9321af16d54SXiubo Li 		ceph_update_cap_mis(&fsc->mdsc->metric);
9331af16d54SXiubo Li 	return r;
9341af16d54SXiubo Li }
9351af16d54SXiubo Li 
936a8599bd8SSage Weil /*
937a8599bd8SSage Weil  * Return true if mask caps are currently being revoked by an MDS.
938a8599bd8SSage Weil  */
__ceph_caps_revoking_other(struct ceph_inode_info * ci,struct ceph_cap * ocap,int mask)9396ee6b953SYan, Zheng int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
9406ee6b953SYan, Zheng 			       struct ceph_cap *ocap, int mask)
9416ee6b953SYan, Zheng {
9426ee6b953SYan, Zheng 	struct ceph_cap *cap;
9436ee6b953SYan, Zheng 	struct rb_node *p;
9446ee6b953SYan, Zheng 
9456ee6b953SYan, Zheng 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
9466ee6b953SYan, Zheng 		cap = rb_entry(p, struct ceph_cap, ci_node);
9479563f88cSYan, Zheng 		if (cap != ocap &&
9486ee6b953SYan, Zheng 		    (cap->implemented & ~cap->issued & mask))
9496ee6b953SYan, Zheng 			return 1;
9506ee6b953SYan, Zheng 	}
9516ee6b953SYan, Zheng 	return 0;
9526ee6b953SYan, Zheng }
9536ee6b953SYan, Zheng 
ceph_caps_revoking(struct ceph_inode_info * ci,int mask)954a8599bd8SSage Weil int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
955a8599bd8SSage Weil {
956874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
9576ee6b953SYan, Zheng 	int ret;
958a8599bd8SSage Weil 
959be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
9606ee6b953SYan, Zheng 	ret = __ceph_caps_revoking_other(ci, NULL, mask);
961be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
962a8599bd8SSage Weil 	dout("ceph_caps_revoking %p %s = %d\n", inode,
963a8599bd8SSage Weil 	     ceph_cap_string(mask), ret);
964a8599bd8SSage Weil 	return ret;
965a8599bd8SSage Weil }
966a8599bd8SSage Weil 
__ceph_caps_used(struct ceph_inode_info * ci)967a8599bd8SSage Weil int __ceph_caps_used(struct ceph_inode_info *ci)
968a8599bd8SSage Weil {
969a8599bd8SSage Weil 	int used = 0;
970a8599bd8SSage Weil 	if (ci->i_pin_ref)
971a8599bd8SSage Weil 		used |= CEPH_CAP_PIN;
972a8599bd8SSage Weil 	if (ci->i_rd_ref)
973a8599bd8SSage Weil 		used |= CEPH_CAP_FILE_RD;
974fdd4e158SYan, Zheng 	if (ci->i_rdcache_ref ||
975874c8ca1SDavid Howells 	    (S_ISREG(ci->netfs.inode.i_mode) &&
976874c8ca1SDavid Howells 	     ci->netfs.inode.i_data.nrpages))
977a8599bd8SSage Weil 		used |= CEPH_CAP_FILE_CACHE;
978a8599bd8SSage Weil 	if (ci->i_wr_ref)
979a8599bd8SSage Weil 		used |= CEPH_CAP_FILE_WR;
980d3d0720dSHenry C Chang 	if (ci->i_wb_ref || ci->i_wrbuffer_ref)
981a8599bd8SSage Weil 		used |= CEPH_CAP_FILE_BUFFER;
982f85122afSJeff Layton 	if (ci->i_fx_ref)
983f85122afSJeff Layton 		used |= CEPH_CAP_FILE_EXCL;
984a8599bd8SSage Weil 	return used;
985a8599bd8SSage Weil }
986a8599bd8SSage Weil 
987719a2514SYan, Zheng #define FMODE_WAIT_BIAS 1000
988719a2514SYan, Zheng 
989a8599bd8SSage Weil /*
990a8599bd8SSage Weil  * wanted, by virtue of open file modes
991a8599bd8SSage Weil  */
__ceph_caps_file_wanted(struct ceph_inode_info * ci)992a8599bd8SSage Weil int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
993a8599bd8SSage Weil {
994719a2514SYan, Zheng 	const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
995719a2514SYan, Zheng 	const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
996719a2514SYan, Zheng 	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
997719a2514SYan, Zheng 	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
998719a2514SYan, Zheng 	struct ceph_mount_options *opt =
999985b9ee8SXiubo Li 		ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
1000719a2514SYan, Zheng 	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1001719a2514SYan, Zheng 	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1002719a2514SYan, Zheng 
1003874c8ca1SDavid Howells 	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1004719a2514SYan, Zheng 		int want = 0;
1005719a2514SYan, Zheng 
1006719a2514SYan, Zheng 		/* use used_cutoff here, to keep dir's wanted caps longer */
1007719a2514SYan, Zheng 		if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1008719a2514SYan, Zheng 		    time_after(ci->i_last_rd, used_cutoff))
1009719a2514SYan, Zheng 			want |= CEPH_CAP_ANY_SHARED;
1010719a2514SYan, Zheng 
1011719a2514SYan, Zheng 		if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1012719a2514SYan, Zheng 		    time_after(ci->i_last_wr, used_cutoff)) {
1013719a2514SYan, Zheng 			want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1014719a2514SYan, Zheng 			if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1015719a2514SYan, Zheng 				want |= CEPH_CAP_ANY_DIR_OPS;
1016774a6a11SYan, Zheng 		}
1017719a2514SYan, Zheng 
1018719a2514SYan, Zheng 		if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1019719a2514SYan, Zheng 			want |= CEPH_CAP_PIN;
1020719a2514SYan, Zheng 
1021719a2514SYan, Zheng 		return want;
1022719a2514SYan, Zheng 	} else {
1023719a2514SYan, Zheng 		int bits = 0;
1024719a2514SYan, Zheng 
1025719a2514SYan, Zheng 		if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1026719a2514SYan, Zheng 			if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1027719a2514SYan, Zheng 			    time_after(ci->i_last_rd, used_cutoff))
1028719a2514SYan, Zheng 				bits |= 1 << RD_SHIFT;
1029719a2514SYan, Zheng 		} else if (time_after(ci->i_last_rd, idle_cutoff)) {
1030719a2514SYan, Zheng 			bits |= 1 << RD_SHIFT;
1031719a2514SYan, Zheng 		}
1032719a2514SYan, Zheng 
1033719a2514SYan, Zheng 		if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1034719a2514SYan, Zheng 			if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1035719a2514SYan, Zheng 			    time_after(ci->i_last_wr, used_cutoff))
1036719a2514SYan, Zheng 				bits |= 1 << WR_SHIFT;
1037719a2514SYan, Zheng 		} else if (time_after(ci->i_last_wr, idle_cutoff)) {
1038719a2514SYan, Zheng 			bits |= 1 << WR_SHIFT;
1039719a2514SYan, Zheng 		}
1040719a2514SYan, Zheng 
1041719a2514SYan, Zheng 		/* check lazyio only when read/write is wanted */
1042719a2514SYan, Zheng 		if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1043719a2514SYan, Zheng 		    ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1044719a2514SYan, Zheng 			bits |= 1 << LAZY_SHIFT;
1045719a2514SYan, Zheng 
1046719a2514SYan, Zheng 		return bits ? ceph_caps_for_mode(bits >> 1) : 0;
1047719a2514SYan, Zheng 	}
1048a8599bd8SSage Weil }
1049a8599bd8SSage Weil 
1050a8599bd8SSage Weil /*
1051525d15e8SYan, Zheng  * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1052525d15e8SYan, Zheng  */
__ceph_caps_wanted(struct ceph_inode_info * ci)1053525d15e8SYan, Zheng int __ceph_caps_wanted(struct ceph_inode_info *ci)
1054525d15e8SYan, Zheng {
1055525d15e8SYan, Zheng 	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
1056874c8ca1SDavid Howells 	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1057a25949b9SJeff Layton 		/* we want EXCL if holding caps of dir ops */
1058a25949b9SJeff Layton 		if (w & CEPH_CAP_ANY_DIR_OPS)
1059a25949b9SJeff Layton 			w |= CEPH_CAP_FILE_EXCL;
1060a25949b9SJeff Layton 	} else {
1061525d15e8SYan, Zheng 		/* we want EXCL if dirty data */
1062525d15e8SYan, Zheng 		if (w & CEPH_CAP_FILE_BUFFER)
1063525d15e8SYan, Zheng 			w |= CEPH_CAP_FILE_EXCL;
1064525d15e8SYan, Zheng 	}
1065525d15e8SYan, Zheng 	return w;
1066525d15e8SYan, Zheng }
1067525d15e8SYan, Zheng 
1068525d15e8SYan, Zheng /*
1069a8599bd8SSage Weil  * Return caps we have registered with the MDS(s) as 'wanted'.
1070a8599bd8SSage Weil  */
__ceph_caps_mds_wanted(struct ceph_inode_info * ci,bool check)1071c1944fedSYan, Zheng int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
1072a8599bd8SSage Weil {
1073a8599bd8SSage Weil 	struct ceph_cap *cap;
1074a8599bd8SSage Weil 	struct rb_node *p;
1075a8599bd8SSage Weil 	int mds_wanted = 0;
1076a8599bd8SSage Weil 
1077a8599bd8SSage Weil 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1078a8599bd8SSage Weil 		cap = rb_entry(p, struct ceph_cap, ci_node);
1079c1944fedSYan, Zheng 		if (check && !__cap_is_valid(cap))
1080a8599bd8SSage Weil 			continue;
1081a2550604SYan, Zheng 		if (cap == ci->i_auth_cap)
1082a8599bd8SSage Weil 			mds_wanted |= cap->mds_wanted;
1083a2550604SYan, Zheng 		else
1084a2550604SYan, Zheng 			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
1085a8599bd8SSage Weil 	}
1086a8599bd8SSage Weil 	return mds_wanted;
1087a8599bd8SSage Weil }
1088a8599bd8SSage Weil 
ceph_is_any_caps(struct inode * inode)10899215aeeaSYan, Zheng int ceph_is_any_caps(struct inode *inode)
10909215aeeaSYan, Zheng {
10919215aeeaSYan, Zheng 	struct ceph_inode_info *ci = ceph_inode(inode);
10929215aeeaSYan, Zheng 	int ret;
10939215aeeaSYan, Zheng 
10949215aeeaSYan, Zheng 	spin_lock(&ci->i_ceph_lock);
1095bd84fbcbSXiubo Li 	ret = __ceph_is_any_real_caps(ci);
10969215aeeaSYan, Zheng 	spin_unlock(&ci->i_ceph_lock);
10979215aeeaSYan, Zheng 
10989215aeeaSYan, Zheng 	return ret;
10999215aeeaSYan, Zheng }
11009215aeeaSYan, Zheng 
1101a8599bd8SSage Weil /*
1102f818a736SSage Weil  * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
1103f818a736SSage Weil  *
1104be655596SSage Weil  * caller should hold i_ceph_lock.
1105a6369741SSage Weil  * caller will not hold session s_mutex if called from destroy_inode.
1106a8599bd8SSage Weil  */
__ceph_remove_cap(struct ceph_cap * cap,bool queue_release)1107a096b09aSYan, Zheng void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
1108a8599bd8SSage Weil {
1109a8599bd8SSage Weil 	struct ceph_mds_session *session = cap->session;
1110a8599bd8SSage Weil 	struct ceph_inode_info *ci = cap->ci;
1111e5cafce3SLuis Henriques 	struct ceph_mds_client *mdsc;
1112f818a736SSage Weil 	int removed = 0;
1113a8599bd8SSage Weil 
1114e5cafce3SLuis Henriques 	/* 'ci' being NULL means the remove have already occurred */
1115e5cafce3SLuis Henriques 	if (!ci) {
1116e5cafce3SLuis Henriques 		dout("%s: cap inode is NULL\n", __func__);
1117e5cafce3SLuis Henriques 		return;
1118e5cafce3SLuis Henriques 	}
1119e5cafce3SLuis Henriques 
1120a76d0a9cSXiubo Li 	lockdep_assert_held(&ci->i_ceph_lock);
1121a76d0a9cSXiubo Li 
1122874c8ca1SDavid Howells 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
1123a8599bd8SSage Weil 
1124985b9ee8SXiubo Li 	mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
1125e5cafce3SLuis Henriques 
1126ea60ed6fSLuis Henriques 	/* remove from inode's cap rbtree, and clear auth cap */
1127ea60ed6fSLuis Henriques 	rb_erase(&cap->ci_node, &ci->i_caps);
1128a76d0a9cSXiubo Li 	if (ci->i_auth_cap == cap)
1129ea60ed6fSLuis Henriques 		ci->i_auth_cap = NULL;
1130ea60ed6fSLuis Henriques 
11317c1332b8SSage Weil 	/* remove from session list */
11327c1332b8SSage Weil 	spin_lock(&session->s_cap_lock);
11337c1332b8SSage Weil 	if (session->s_cap_iterator == cap) {
11347c1332b8SSage Weil 		/* not yet, we are iterating over this very cap */
11357c1332b8SSage Weil 		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
11367c1332b8SSage Weil 		     cap, cap->session);
11377c1332b8SSage Weil 	} else {
11387c1332b8SSage Weil 		list_del_init(&cap->session_caps);
11397c1332b8SSage Weil 		session->s_nr_caps--;
11404f1d756dSXiubo Li 		atomic64_dec(&mdsc->metric.total_caps);
11417c1332b8SSage Weil 		cap->session = NULL;
1142f818a736SSage Weil 		removed = 1;
11437c1332b8SSage Weil 	}
1144f818a736SSage Weil 	/* protect backpointer with s_cap_lock: see iterate_session_caps */
1145f818a736SSage Weil 	cap->ci = NULL;
1146745a8e3bSYan, Zheng 
1147745a8e3bSYan, Zheng 	/*
1148745a8e3bSYan, Zheng 	 * s_cap_reconnect is protected by s_cap_lock. no one changes
1149745a8e3bSYan, Zheng 	 * s_cap_gen while session is in the reconnect state.
1150745a8e3bSYan, Zheng 	 */
1151745a8e3bSYan, Zheng 	if (queue_release &&
115252d60f8eSJeff Layton 	    (!session->s_cap_reconnect ||
115352d60f8eSJeff Layton 	     cap->cap_gen == atomic_read(&session->s_cap_gen))) {
1154745a8e3bSYan, Zheng 		cap->queue_release = 1;
1155745a8e3bSYan, Zheng 		if (removed) {
1156e3ec8d68SYan, Zheng 			__ceph_queue_cap_release(session, cap);
1157745a8e3bSYan, Zheng 			removed = 0;
1158745a8e3bSYan, Zheng 		}
1159745a8e3bSYan, Zheng 	} else {
1160745a8e3bSYan, Zheng 		cap->queue_release = 0;
1161745a8e3bSYan, Zheng 	}
1162745a8e3bSYan, Zheng 	cap->cap_ino = ci->i_vino.ino;
1163745a8e3bSYan, Zheng 
11647c1332b8SSage Weil 	spin_unlock(&session->s_cap_lock);
11657c1332b8SSage Weil 
1166f818a736SSage Weil 	if (removed)
116737151668SYehuda Sadeh 		ceph_put_cap(mdsc, cap);
1168a8599bd8SSage Weil 
1169bd84fbcbSXiubo Li 	if (!__ceph_is_any_real_caps(ci)) {
1170db40cc17SYan, Zheng 		/* when reconnect denied, we remove session caps forcibly,
1171db40cc17SYan, Zheng 		 * i_wr_ref can be non-zero. If there are ongoing write,
1172db40cc17SYan, Zheng 		 * keep i_snap_realm.
1173db40cc17SYan, Zheng 		 */
1174bd84fbcbSXiubo Li 		if (ci->i_wr_ref == 0 && ci->i_snap_realm)
1175874c8ca1SDavid Howells 			ceph_change_snap_realm(&ci->netfs.inode, NULL);
1176db40cc17SYan, Zheng 
1177a8599bd8SSage Weil 		__cap_delay_cancel(mdsc, ci);
1178a8599bd8SSage Weil 	}
1179bd84fbcbSXiubo Li }
1180a8599bd8SSage Weil 
/*
 * Remove a cap via __ceph_remove_cap(), warning once if the cap being
 * removed is the auth cap of an inode that still has dirty caps queued
 * while the client is neither blocklisted nor shut down.
 *
 * Caller must hold ci->i_ceph_lock.  @mdsc is not used by this function.
 */
void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
		     bool queue_release)
{
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_fs_client *fsc;
	bool dirty_auth;

	/* 'ci' being NULL means the removal has already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
	dirty_auth = ci->i_auth_cap == cap &&
		     !list_empty(&ci->i_dirty_item) &&
		     !fsc->blocklisted &&
		     !ceph_inode_is_shutdown(&ci->netfs.inode);
	WARN_ON_ONCE(dirty_auth);

	__ceph_remove_cap(cap, queue_release);
}
1203a76d0a9cSXiubo Li 
12040ff8bfb3SJeff Layton struct cap_msg_args {
12050ff8bfb3SJeff Layton 	struct ceph_mds_session	*session;
12060ff8bfb3SJeff Layton 	u64			ino, cid, follows;
12070ff8bfb3SJeff Layton 	u64			flush_tid, oldest_flush_tid, size, max_size;
12080ff8bfb3SJeff Layton 	u64			xattr_version;
1209176c77c9SJeff Layton 	u64			change_attr;
12100ff8bfb3SJeff Layton 	struct ceph_buffer	*xattr_buf;
12110a454bddSJeff Layton 	struct ceph_buffer	*old_xattr_buf;
1212ec62b894SJeff Layton 	struct timespec64	atime, mtime, ctime, btime;
12130ff8bfb3SJeff Layton 	int			op, caps, wanted, dirty;
12140ff8bfb3SJeff Layton 	u32			seq, issue_seq, mseq, time_warp_seq;
12151e4ef0c6SJeff Layton 	u32			flags;
12160ff8bfb3SJeff Layton 	kuid_t			uid;
12170ff8bfb3SJeff Layton 	kgid_t			gid;
12180ff8bfb3SJeff Layton 	umode_t			mode;
12190ff8bfb3SJeff Layton 	bool			inline_data;
12200a454bddSJeff Layton 	bool			wake;
122116be62fcSJeff Layton 	bool			encrypted;
12222d332d5bSJeff Layton 	u32			fscrypt_auth_len;
12232d332d5bSJeff Layton 	u8			fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
12240ff8bfb3SJeff Layton };
12250ff8bfb3SJeff Layton 
122616d68903SJeff Layton /* Marshal up the cap msg to the MDS */
encode_cap_msg(struct ceph_msg * msg,struct cap_msg_args * arg)122716d68903SJeff Layton static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
1228a8599bd8SSage Weil {
1229a8599bd8SSage Weil 	struct ceph_mds_caps *fc;
1230e20d258dSYan, Zheng 	void *p;
123192475f05SJeff Layton 	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
1232a8599bd8SSage Weil 
123316d68903SJeff Layton 	dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
123416d68903SJeff Layton 	     __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
123516d68903SJeff Layton 	     ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
123616d68903SJeff Layton 	     ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
123716d68903SJeff Layton 	     arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
123816d68903SJeff Layton 	     arg->size, arg->max_size, arg->xattr_version,
12390ff8bfb3SJeff Layton 	     arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
1240a8599bd8SSage Weil 
12412d332d5bSJeff Layton 	msg->hdr.version = cpu_to_le16(12);
12420ff8bfb3SJeff Layton 	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
1243a8599bd8SSage Weil 
12446df058c0SSage Weil 	fc = msg->front.iov_base;
1245a8599bd8SSage Weil 	memset(fc, 0, sizeof(*fc));
1246a8599bd8SSage Weil 
12470ff8bfb3SJeff Layton 	fc->cap_id = cpu_to_le64(arg->cid);
12480ff8bfb3SJeff Layton 	fc->op = cpu_to_le32(arg->op);
12490ff8bfb3SJeff Layton 	fc->seq = cpu_to_le32(arg->seq);
12500ff8bfb3SJeff Layton 	fc->issue_seq = cpu_to_le32(arg->issue_seq);
12510ff8bfb3SJeff Layton 	fc->migrate_seq = cpu_to_le32(arg->mseq);
12520ff8bfb3SJeff Layton 	fc->caps = cpu_to_le32(arg->caps);
12530ff8bfb3SJeff Layton 	fc->wanted = cpu_to_le32(arg->wanted);
12540ff8bfb3SJeff Layton 	fc->dirty = cpu_to_le32(arg->dirty);
12550ff8bfb3SJeff Layton 	fc->ino = cpu_to_le64(arg->ino);
12560ff8bfb3SJeff Layton 	fc->snap_follows = cpu_to_le64(arg->follows);
1257a8599bd8SSage Weil 
125816be62fcSJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
125916be62fcSJeff Layton 	if (arg->encrypted)
126016be62fcSJeff Layton 		fc->size = cpu_to_le64(round_up(arg->size,
126116be62fcSJeff Layton 						CEPH_FSCRYPT_BLOCK_SIZE));
126216be62fcSJeff Layton 	else
126316be62fcSJeff Layton #endif
12640ff8bfb3SJeff Layton 		fc->size = cpu_to_le64(arg->size);
12650ff8bfb3SJeff Layton 	fc->max_size = cpu_to_le64(arg->max_size);
12669bbeab41SArnd Bergmann 	ceph_encode_timespec64(&fc->mtime, &arg->mtime);
12679bbeab41SArnd Bergmann 	ceph_encode_timespec64(&fc->atime, &arg->atime);
12689bbeab41SArnd Bergmann 	ceph_encode_timespec64(&fc->ctime, &arg->ctime);
12690ff8bfb3SJeff Layton 	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
1270a8599bd8SSage Weil 
12710ff8bfb3SJeff Layton 	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
12720ff8bfb3SJeff Layton 	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
12730ff8bfb3SJeff Layton 	fc->mode = cpu_to_le32(arg->mode);
1274a8599bd8SSage Weil 
12750ff8bfb3SJeff Layton 	fc->xattr_version = cpu_to_le64(arg->xattr_version);
12760ff8bfb3SJeff Layton 	if (arg->xattr_buf) {
12770ff8bfb3SJeff Layton 		msg->middle = ceph_buffer_get(arg->xattr_buf);
12780ff8bfb3SJeff Layton 		fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
12790ff8bfb3SJeff Layton 		msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
12809670079fSJeff Layton 	}
12819670079fSJeff Layton 
1282e20d258dSYan, Zheng 	p = fc + 1;
128343b29673SJeff Layton 	/* flock buffer size (version 2) */
1284e20d258dSYan, Zheng 	ceph_encode_32(&p, 0);
128543b29673SJeff Layton 	/* inline version (version 4) */
12860ff8bfb3SJeff Layton 	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
1287e20d258dSYan, Zheng 	/* inline data size */
1288e20d258dSYan, Zheng 	ceph_encode_32(&p, 0);
128992475f05SJeff Layton 	/*
129092475f05SJeff Layton 	 * osd_epoch_barrier (version 5)
129192475f05SJeff Layton 	 * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
129292475f05SJeff Layton 	 * case it was recently changed
129392475f05SJeff Layton 	 */
129492475f05SJeff Layton 	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
129543b29673SJeff Layton 	/* oldest_flush_tid (version 6) */
12960ff8bfb3SJeff Layton 	ceph_encode_64(&p, arg->oldest_flush_tid);
1297e20d258dSYan, Zheng 
129843b29673SJeff Layton 	/*
129943b29673SJeff Layton 	 * caller_uid/caller_gid (version 7)
130043b29673SJeff Layton 	 *
130143b29673SJeff Layton 	 * Currently, we don't properly track which caller dirtied the caps
130243b29673SJeff Layton 	 * last, and force a flush of them when there is a conflict. For now,
130343b29673SJeff Layton 	 * just set this to 0:0, to emulate how the MDS has worked up to now.
130443b29673SJeff Layton 	 */
130543b29673SJeff Layton 	ceph_encode_32(&p, 0);
130643b29673SJeff Layton 	ceph_encode_32(&p, 0);
130743b29673SJeff Layton 
130843b29673SJeff Layton 	/* pool namespace (version 8) (mds always ignores this) */
130943b29673SJeff Layton 	ceph_encode_32(&p, 0);
131043b29673SJeff Layton 
1311176c77c9SJeff Layton 	/* btime and change_attr (version 9) */
1312ec62b894SJeff Layton 	ceph_encode_timespec64(p, &arg->btime);
131343b29673SJeff Layton 	p += sizeof(struct ceph_timespec);
1314176c77c9SJeff Layton 	ceph_encode_64(&p, arg->change_attr);
131543b29673SJeff Layton 
131643b29673SJeff Layton 	/* Advisory flags (version 10) */
13171e4ef0c6SJeff Layton 	ceph_encode_32(&p, arg->flags);
13182d332d5bSJeff Layton 
13192d332d5bSJeff Layton 	/* dirstats (version 11) - these are r/o on the client */
13202d332d5bSJeff Layton 	ceph_encode_64(&p, 0);
13212d332d5bSJeff Layton 	ceph_encode_64(&p, 0);
13222d332d5bSJeff Layton 
13232d332d5bSJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
132416be62fcSJeff Layton 	/*
132516be62fcSJeff Layton 	 * fscrypt_auth and fscrypt_file (version 12)
132616be62fcSJeff Layton 	 *
132716be62fcSJeff Layton 	 * fscrypt_auth holds the crypto context (if any). fscrypt_file
132816be62fcSJeff Layton 	 * tracks the real i_size as an __le64 field (and we use a rounded-up
132916be62fcSJeff Layton 	 * i_size in the traditional size field).
133016be62fcSJeff Layton 	 */
13312d332d5bSJeff Layton 	ceph_encode_32(&p, arg->fscrypt_auth_len);
13322d332d5bSJeff Layton 	ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
133316be62fcSJeff Layton 	ceph_encode_32(&p, sizeof(__le64));
133416be62fcSJeff Layton 	ceph_encode_64(&p, arg->size);
13352d332d5bSJeff Layton #else /* CONFIG_FS_ENCRYPTION */
13362d332d5bSJeff Layton 	ceph_encode_32(&p, 0);
13372d332d5bSJeff Layton 	ceph_encode_32(&p, 0);
13382d332d5bSJeff Layton #endif /* CONFIG_FS_ENCRYPTION */
1339a8599bd8SSage Weil }
1340a8599bd8SSage Weil 
1341a8599bd8SSage Weil /*
1342d6e47819SYan, Zheng  * Queue cap releases when an inode is dropped from our cache.
1343a8599bd8SSage Weil  */
__ceph_remove_caps(struct ceph_inode_info * ci)1344d6e47819SYan, Zheng void __ceph_remove_caps(struct ceph_inode_info *ci)
1345a8599bd8SSage Weil {
13462e2023e9SXiubo Li 	struct inode *inode = &ci->netfs.inode;
1347985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
1348a8599bd8SSage Weil 	struct rb_node *p;
1349a8599bd8SSage Weil 
1350d6e47819SYan, Zheng 	/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
1351d6e47819SYan, Zheng 	 * may call __ceph_caps_issued_mask() on a freeing inode. */
1352d6e47819SYan, Zheng 	spin_lock(&ci->i_ceph_lock);
1353a8599bd8SSage Weil 	p = rb_first(&ci->i_caps);
1354a8599bd8SSage Weil 	while (p) {
1355a8599bd8SSage Weil 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1356a8599bd8SSage Weil 		p = rb_next(p);
13572e2023e9SXiubo Li 		ceph_remove_cap(mdsc, cap, true);
1358a8599bd8SSage Weil 	}
1359d6e47819SYan, Zheng 	spin_unlock(&ci->i_ceph_lock);
1360a8599bd8SSage Weil }
1361a8599bd8SSage Weil 
1362a8599bd8SSage Weil /*
13630a454bddSJeff Layton  * Prepare to send a cap message to an MDS. Update the cap state, and populate
13640a454bddSJeff Layton  * the arg struct with the parameters that will need to be sent. This should
13650a454bddSJeff Layton  * be done under the i_ceph_lock to guard against changes to cap state.
1366a8599bd8SSage Weil  *
1367a8599bd8SSage Weil  * Make note of max_size reported/requested from mds, revoked caps
1368a8599bd8SSage Weil  * that have now been implemented.
1369a8599bd8SSage Weil  */
__prep_cap(struct cap_msg_args * arg,struct ceph_cap * cap,int op,int flags,int used,int want,int retain,int flushing,u64 flush_tid,u64 oldest_flush_tid)13700a454bddSJeff Layton static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
137149ada6e8SYan, Zheng 		       int op, int flags, int used, int want, int retain,
13721e4ef0c6SJeff Layton 		       int flushing, u64 flush_tid, u64 oldest_flush_tid)
1373a8599bd8SSage Weil {
1374a8599bd8SSage Weil 	struct ceph_inode_info *ci = cap->ci;
1375874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
1376bb0581f0SColin Ian King 	int held, revoking;
1377a8599bd8SSage Weil 
13780a454bddSJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
1379891f3f5aSJeff Layton 
138068c28323SSage Weil 	held = cap->issued | cap->implemented;
138168c28323SSage Weil 	revoking = cap->implemented & ~cap->issued;
138268c28323SSage Weil 	retain &= ~revoking;
138368c28323SSage Weil 
13840a454bddSJeff Layton 	dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
13850a454bddSJeff Layton 	     __func__, inode, cap, cap->session,
1386a8599bd8SSage Weil 	     ceph_cap_string(held), ceph_cap_string(held & retain),
1387a8599bd8SSage Weil 	     ceph_cap_string(revoking));
1388a8599bd8SSage Weil 	BUG_ON((retain & CEPH_CAP_PIN) == 0);
1389a8599bd8SSage Weil 
1390a0d93e32SYan, Zheng 	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
1391a8599bd8SSage Weil 
1392a8599bd8SSage Weil 	cap->issued &= retain;  /* drop bits we don't want */
1393a8599bd8SSage Weil 	/*
13940a454bddSJeff Layton 	 * Wake up any waiters on wanted -> needed transition. This is due to
13950a454bddSJeff Layton 	 * the weird transition from buffered to sync IO... we need to flush
13960a454bddSJeff Layton 	 * dirty pages _before_ allowing sync writes to avoid reordering.
1397a8599bd8SSage Weil 	 */
13980a454bddSJeff Layton 	arg->wake = cap->implemented & ~cap->issued;
1399a8599bd8SSage Weil 	cap->implemented &= cap->issued | used;
1400a8599bd8SSage Weil 	cap->mds_wanted = want;
1401a8599bd8SSage Weil 
14020a454bddSJeff Layton 	arg->session = cap->session;
14030a454bddSJeff Layton 	arg->ino = ceph_vino(inode).ino;
14040a454bddSJeff Layton 	arg->cid = cap->cap_id;
14050a454bddSJeff Layton 	arg->follows = flushing ? ci->i_head_snapc->seq : 0;
14060a454bddSJeff Layton 	arg->flush_tid = flush_tid;
14070a454bddSJeff Layton 	arg->oldest_flush_tid = oldest_flush_tid;
14082d6795fbSJeff Layton 	arg->size = i_size_read(inode);
14090a454bddSJeff Layton 	ci->i_reported_size = arg->size;
14100a454bddSJeff Layton 	arg->max_size = ci->i_wanted_max_size;
14116f05b30eSYan, Zheng 	if (cap == ci->i_auth_cap) {
14126f05b30eSYan, Zheng 		if (want & CEPH_CAP_ANY_FILE_WR)
14130a454bddSJeff Layton 			ci->i_requested_max_size = arg->max_size;
14146f05b30eSYan, Zheng 		else
14156f05b30eSYan, Zheng 			ci->i_requested_max_size = 0;
14166f05b30eSYan, Zheng 	}
1417a8599bd8SSage Weil 
1418082afec9SSage Weil 	if (flushing & CEPH_CAP_XATTR_EXCL) {
14190a454bddSJeff Layton 		arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
14200a454bddSJeff Layton 		arg->xattr_version = ci->i_xattrs.version;
1421ae20db45SRishabh Dave 		arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
14220ff8bfb3SJeff Layton 	} else {
14230a454bddSJeff Layton 		arg->xattr_buf = NULL;
14240a454bddSJeff Layton 		arg->old_xattr_buf = NULL;
1425a8599bd8SSage Weil 	}
1426a8599bd8SSage Weil 
14270a454bddSJeff Layton 	arg->mtime = inode->i_mtime;
14280a454bddSJeff Layton 	arg->atime = inode->i_atime;
14297795aef0SJeff Layton 	arg->ctime = inode_get_ctime(inode);
14300a454bddSJeff Layton 	arg->btime = ci->i_btime;
14310a454bddSJeff Layton 	arg->change_attr = inode_peek_iversion_raw(inode);
14320ff8bfb3SJeff Layton 
14330a454bddSJeff Layton 	arg->op = op;
14340a454bddSJeff Layton 	arg->caps = cap->implemented;
14350a454bddSJeff Layton 	arg->wanted = want;
14360a454bddSJeff Layton 	arg->dirty = flushing;
14370ff8bfb3SJeff Layton 
14380a454bddSJeff Layton 	arg->seq = cap->seq;
14390a454bddSJeff Layton 	arg->issue_seq = cap->issue_seq;
14400a454bddSJeff Layton 	arg->mseq = cap->mseq;
14410a454bddSJeff Layton 	arg->time_warp_seq = ci->i_time_warp_seq;
14420ff8bfb3SJeff Layton 
14430a454bddSJeff Layton 	arg->uid = inode->i_uid;
14440a454bddSJeff Layton 	arg->gid = inode->i_gid;
14450a454bddSJeff Layton 	arg->mode = inode->i_mode;
14460ff8bfb3SJeff Layton 
14470a454bddSJeff Layton 	arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
144849ada6e8SYan, Zheng 	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
144949ada6e8SYan, Zheng 	    !list_empty(&ci->i_cap_snaps)) {
145049ada6e8SYan, Zheng 		struct ceph_cap_snap *capsnap;
145149ada6e8SYan, Zheng 		list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
145249ada6e8SYan, Zheng 			if (capsnap->cap_flush.tid)
145349ada6e8SYan, Zheng 				break;
145449ada6e8SYan, Zheng 			if (capsnap->need_flush) {
145549ada6e8SYan, Zheng 				flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
145649ada6e8SYan, Zheng 				break;
145749ada6e8SYan, Zheng 			}
145849ada6e8SYan, Zheng 		}
145949ada6e8SYan, Zheng 	}
14600a454bddSJeff Layton 	arg->flags = flags;
146116be62fcSJeff Layton 	arg->encrypted = IS_ENCRYPTED(inode);
14622d332d5bSJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
14632d332d5bSJeff Layton 	if (ci->fscrypt_auth_len &&
14642d332d5bSJeff Layton 	    WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
14652d332d5bSJeff Layton 		/* Don't set this if it's too big */
14662d332d5bSJeff Layton 		arg->fscrypt_auth_len = 0;
14672d332d5bSJeff Layton 	} else {
14682d332d5bSJeff Layton 		arg->fscrypt_auth_len = ci->fscrypt_auth_len;
14692d332d5bSJeff Layton 		memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
14702d332d5bSJeff Layton 		       min_t(size_t, ci->fscrypt_auth_len,
14712d332d5bSJeff Layton 			     sizeof(arg->fscrypt_auth)));
14720a454bddSJeff Layton 	}
14732d332d5bSJeff Layton #endif /* CONFIG_FS_ENCRYPTION */
14742d332d5bSJeff Layton }
14752d332d5bSJeff Layton 
147616be62fcSJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
147716be62fcSJeff Layton #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
147816be62fcSJeff Layton 		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
147916be62fcSJeff Layton 
cap_msg_size(struct cap_msg_args * arg)148016be62fcSJeff Layton static inline int cap_msg_size(struct cap_msg_args *arg)
148116be62fcSJeff Layton {
148216be62fcSJeff Layton 	return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
148316be62fcSJeff Layton }
148416be62fcSJeff Layton #else
14852d332d5bSJeff Layton #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
14862d332d5bSJeff Layton 		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
14872d332d5bSJeff Layton 
cap_msg_size(struct cap_msg_args * arg)14882d332d5bSJeff Layton static inline int cap_msg_size(struct cap_msg_args *arg)
14892d332d5bSJeff Layton {
14902d332d5bSJeff Layton 	return CAP_MSG_FIXED_FIELDS;
14912d332d5bSJeff Layton }
14922d332d5bSJeff Layton #endif /* CONFIG_FS_ENCRYPTION */
1493e20d258dSYan, Zheng 
14940a454bddSJeff Layton /*
14950a454bddSJeff Layton  * Send a cap msg on the given inode.
14960a454bddSJeff Layton  *
14970a454bddSJeff Layton  * Caller should hold snap_rwsem (read), s_mutex.
14980a454bddSJeff Layton  */
__send_cap(struct cap_msg_args * arg,struct ceph_inode_info * ci)149952311980SJeff Layton static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
15000a454bddSJeff Layton {
150116d68903SJeff Layton 	struct ceph_msg *msg;
1502874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
1503a8599bd8SSage Weil 
15042d332d5bSJeff Layton 	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
15052d332d5bSJeff Layton 			   false);
150616d68903SJeff Layton 	if (!msg) {
150716d68903SJeff Layton 		pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
15080a454bddSJeff Layton 		       ceph_vinop(inode), ceph_cap_string(arg->dirty),
15090a454bddSJeff Layton 		       arg->flush_tid);
1510a0d93e32SYan, Zheng 		spin_lock(&ci->i_ceph_lock);
151152311980SJeff Layton 		__cap_delay_requeue(arg->session->s_mdsc, ci);
1512a0d93e32SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
151316d68903SJeff Layton 		return;
1514a8599bd8SSage Weil 	}
1515a8599bd8SSage Weil 
151616d68903SJeff Layton 	encode_cap_msg(msg, arg);
151716d68903SJeff Layton 	ceph_con_send(&arg->session->s_con, msg);
15180a454bddSJeff Layton 	ceph_buffer_put(arg->old_xattr_buf);
1519ae20db45SRishabh Dave 	ceph_buffer_put(arg->xattr_buf);
15200a454bddSJeff Layton 	if (arg->wake)
15210a454bddSJeff Layton 		wake_up_all(&ci->i_cap_wq);
1522a8599bd8SSage Weil }
1523a8599bd8SSage Weil 
/*
 * Build and send a CEPH_CAP_OP_FLUSHSNAP message describing @capsnap.
 *
 * Returns 0 once the message has been queued on the session's connection,
 * or -ENOMEM if the message could not be allocated.
 */
static inline int __send_flush_snap(struct inode *inode,
				    struct ceph_mds_session *session,
				    struct ceph_cap_snap *capsnap,
				    u32 mseq, u64 oldest_flush_tid)
{
	struct ceph_msg *msg;
	struct cap_msg_args arg = {
		.session		= session,
		.ino			= ceph_vino(inode).ino,
		.cid			= 0,
		.follows		= capsnap->follows,
		.flush_tid		= capsnap->cap_flush.tid,
		.oldest_flush_tid	= oldest_flush_tid,

		.size			= capsnap->size,
		.max_size		= 0,
		.xattr_version		= capsnap->xattr_version,
		.xattr_buf		= capsnap->xattr_blob,
		.old_xattr_buf		= NULL,

		.atime			= capsnap->atime,
		.mtime			= capsnap->mtime,
		.ctime			= capsnap->ctime,
		.btime			= capsnap->btime,
		.change_attr		= capsnap->change_attr,

		.op			= CEPH_CAP_OP_FLUSHSNAP,
		.caps			= capsnap->issued,
		.wanted			= 0,
		.dirty			= capsnap->dirty,

		.seq			= 0,
		.issue_seq		= 0,
		.mseq			= mseq,
		.time_warp_seq		= capsnap->time_warp_seq,

		.uid			= capsnap->uid,
		.gid			= capsnap->gid,
		.mode			= capsnap->mode,

		.inline_data		= capsnap->inline_data,
		.flags			= 0,
		.wake			= false,
		.encrypted		= IS_ENCRYPTED(inode),

		/* No fscrypt_auth changes from a capsnap.*/
		.fscrypt_auth_len	= 0,
	};

	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
			   GFP_NOFS, false);
	if (!msg)
		return -ENOMEM;

	encode_cap_msg(msg, &arg);
	ceph_con_send(&arg.session->s_con, msg);
	return 0;
}
15820e294387SYan, Zheng 
1583a8599bd8SSage Weil /*
1584a8599bd8SSage Weil  * When a snapshot is taken, clients accumulate dirty metadata on
1585a8599bd8SSage Weil  * inodes with capabilities in ceph_cap_snaps to describe the file
1586a8599bd8SSage Weil  * state at the time the snapshot was taken.  This must be flushed
1587a8599bd8SSage Weil  * asynchronously back to the MDS once sync writes complete and dirty
1588a8599bd8SSage Weil  * data is written out.
1589a8599bd8SSage Weil  *
15907732fe16SJeff Layton  * Called under i_ceph_lock.
1591a8599bd8SSage Weil  */
__ceph_flush_snaps(struct ceph_inode_info * ci,struct ceph_mds_session * session)1592ed9b430cSYan, Zheng static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1593ed9b430cSYan, Zheng 			       struct ceph_mds_session *session)
1594be655596SSage Weil 		__releases(ci->i_ceph_lock)
1595be655596SSage Weil 		__acquires(ci->i_ceph_lock)
1596a8599bd8SSage Weil {
1597874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
1598ed9b430cSYan, Zheng 	struct ceph_mds_client *mdsc = session->s_mdsc;
1599a8599bd8SSage Weil 	struct ceph_cap_snap *capsnap;
1600ed9b430cSYan, Zheng 	u64 oldest_flush_tid = 0;
1601ed9b430cSYan, Zheng 	u64 first_tid = 1, last_tid = 0;
1602a8599bd8SSage Weil 
1603ed9b430cSYan, Zheng 	dout("__flush_snaps %p session %p\n", inode, session);
1604a8599bd8SSage Weil 
1605a8599bd8SSage Weil 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1606a8599bd8SSage Weil 		/*
1607a8599bd8SSage Weil 		 * we need to wait for sync writes to complete and for dirty
1608a8599bd8SSage Weil 		 * pages to be written out.
1609a8599bd8SSage Weil 		 */
1610a8599bd8SSage Weil 		if (capsnap->dirty_pages || capsnap->writing)
1611cfc0bf66SSage Weil 			break;
1612a8599bd8SSage Weil 
161386056090SYan, Zheng 		/* should be removed by ceph_try_drop_cap_snap() */
161486056090SYan, Zheng 		BUG_ON(!capsnap->need_flush);
1615819ccbfaSSage Weil 
1616e835124cSSage Weil 		/* only flush each capsnap once */
16170e294387SYan, Zheng 		if (capsnap->cap_flush.tid > 0) {
1618e835124cSSage Weil 			dout(" already flushed %p, skipping\n", capsnap);
1619e835124cSSage Weil 			continue;
1620e835124cSSage Weil 		}
1621e835124cSSage Weil 
1622ed9b430cSYan, Zheng 		spin_lock(&mdsc->cap_dirty_lock);
1623ed9b430cSYan, Zheng 		capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1624ed9b430cSYan, Zheng 		list_add_tail(&capsnap->cap_flush.g_list,
1625ed9b430cSYan, Zheng 			      &mdsc->cap_flush_list);
1626ed9b430cSYan, Zheng 		if (oldest_flush_tid == 0)
1627ed9b430cSYan, Zheng 			oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1628ed9b430cSYan, Zheng 		if (list_empty(&ci->i_flushing_item)) {
1629ed9b430cSYan, Zheng 			list_add_tail(&ci->i_flushing_item,
1630ed9b430cSYan, Zheng 				      &session->s_cap_flushing);
1631ed9b430cSYan, Zheng 		}
1632ed9b430cSYan, Zheng 		spin_unlock(&mdsc->cap_dirty_lock);
1633ca81f3f6SSage Weil 
1634ed9b430cSYan, Zheng 		list_add_tail(&capsnap->cap_flush.i_list,
1635ed9b430cSYan, Zheng 			      &ci->i_cap_flush_list);
1636ed9b430cSYan, Zheng 
1637ed9b430cSYan, Zheng 		if (first_tid == 1)
1638ed9b430cSYan, Zheng 			first_tid = capsnap->cap_flush.tid;
1639ed9b430cSYan, Zheng 		last_tid = capsnap->cap_flush.tid;
1640ed9b430cSYan, Zheng 	}
1641ed9b430cSYan, Zheng 
1642ed9b430cSYan, Zheng 	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1643ed9b430cSYan, Zheng 
1644ed9b430cSYan, Zheng 	while (first_tid <= last_tid) {
1645ed9b430cSYan, Zheng 		struct ceph_cap *cap = ci->i_auth_cap;
164657a5df0eSJakob Koschel 		struct ceph_cap_flush *cf = NULL, *iter;
1647ed9b430cSYan, Zheng 		int ret;
1648ed9b430cSYan, Zheng 
1649ed9b430cSYan, Zheng 		if (!(cap && cap->session == session)) {
1650ed9b430cSYan, Zheng 			dout("__flush_snaps %p auth cap %p not mds%d, "
1651ed9b430cSYan, Zheng 			     "stop\n", inode, cap, session->s_mds);
1652ed9b430cSYan, Zheng 			break;
1653ed9b430cSYan, Zheng 		}
1654ed9b430cSYan, Zheng 
1655ed9b430cSYan, Zheng 		ret = -ENOENT;
165657a5df0eSJakob Koschel 		list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
165757a5df0eSJakob Koschel 			if (iter->tid >= first_tid) {
165857a5df0eSJakob Koschel 				cf = iter;
1659ed9b430cSYan, Zheng 				ret = 0;
1660ed9b430cSYan, Zheng 				break;
1661ed9b430cSYan, Zheng 			}
1662ed9b430cSYan, Zheng 		}
1663ed9b430cSYan, Zheng 		if (ret < 0)
1664ed9b430cSYan, Zheng 			break;
1665ed9b430cSYan, Zheng 
1666ed9b430cSYan, Zheng 		first_tid = cf->tid + 1;
1667ed9b430cSYan, Zheng 
1668ed9b430cSYan, Zheng 		capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
1669805692d0SElena Reshetova 		refcount_inc(&capsnap->nref);
1670ed9b430cSYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
1671ed9b430cSYan, Zheng 
1672ed9b430cSYan, Zheng 		dout("__flush_snaps %p capsnap %p tid %llu %s\n",
1673ed9b430cSYan, Zheng 		     inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
1674ed9b430cSYan, Zheng 
1675ed9b430cSYan, Zheng 		ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
1676ed9b430cSYan, Zheng 					oldest_flush_tid);
1677ed9b430cSYan, Zheng 		if (ret < 0) {
1678ed9b430cSYan, Zheng 			pr_err("__flush_snaps: error sending cap flushsnap, "
1679ed9b430cSYan, Zheng 			       "ino (%llx.%llx) tid %llu follows %llu\n",
1680ed9b430cSYan, Zheng 				ceph_vinop(inode), cf->tid, capsnap->follows);
1681ed9b430cSYan, Zheng 		}
1682ed9b430cSYan, Zheng 
1683ed9b430cSYan, Zheng 		ceph_put_cap_snap(capsnap);
1684ed9b430cSYan, Zheng 		spin_lock(&ci->i_ceph_lock);
1685ed9b430cSYan, Zheng 	}
1686ed9b430cSYan, Zheng }
1687ed9b430cSYan, Zheng 
ceph_flush_snaps(struct ceph_inode_info * ci,struct ceph_mds_session ** psession)1688ed9b430cSYan, Zheng void ceph_flush_snaps(struct ceph_inode_info *ci,
1689ed9b430cSYan, Zheng 		      struct ceph_mds_session **psession)
1690ed9b430cSYan, Zheng {
1691874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
1692985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
1693e4d2b16aSYan, Zheng 	struct ceph_mds_session *session = NULL;
1694409e873eSXiubo Li 	bool need_put = false;
1695ed9b430cSYan, Zheng 	int mds;
1696e4d2b16aSYan, Zheng 
1697ed9b430cSYan, Zheng 	dout("ceph_flush_snaps %p\n", inode);
1698e4d2b16aSYan, Zheng 	if (psession)
1699e4d2b16aSYan, Zheng 		session = *psession;
1700ed9b430cSYan, Zheng retry:
1701ed9b430cSYan, Zheng 	spin_lock(&ci->i_ceph_lock);
1702ed9b430cSYan, Zheng 	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
1703ed9b430cSYan, Zheng 		dout(" no capsnap needs flush, doing nothing\n");
1704ed9b430cSYan, Zheng 		goto out;
1705ed9b430cSYan, Zheng 	}
1706ed9b430cSYan, Zheng 	if (!ci->i_auth_cap) {
1707ed9b430cSYan, Zheng 		dout(" no auth cap (migrating?), doing nothing\n");
1708ed9b430cSYan, Zheng 		goto out;
1709ed9b430cSYan, Zheng 	}
1710ed9b430cSYan, Zheng 
1711ed9b430cSYan, Zheng 	mds = ci->i_auth_cap->session->s_mds;
1712a8599bd8SSage Weil 	if (session && session->s_mds != mds) {
1713a8599bd8SSage Weil 		dout(" oops, wrong session %p mutex\n", session);
1714a8599bd8SSage Weil 		ceph_put_mds_session(session);
1715a8599bd8SSage Weil 		session = NULL;
1716a8599bd8SSage Weil 	}
1717a8599bd8SSage Weil 	if (!session) {
1718be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
1719a8599bd8SSage Weil 		mutex_lock(&mdsc->mutex);
1720a8599bd8SSage Weil 		session = __ceph_lookup_mds_session(mdsc, mds);
1721a8599bd8SSage Weil 		mutex_unlock(&mdsc->mutex);
1722a8599bd8SSage Weil 		goto retry;
1723a8599bd8SSage Weil 	}
1724a8599bd8SSage Weil 
172524d063acSYan, Zheng 	// make sure flushsnap messages are sent in proper order.
1726054f8d41SYan, Zheng 	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
172724d063acSYan, Zheng 		__kick_flushing_caps(mdsc, session, ci, 0);
172824d063acSYan, Zheng 
1729ed9b430cSYan, Zheng 	__ceph_flush_snaps(ci, session);
1730ed9b430cSYan, Zheng out:
1731be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
1732a8599bd8SSage Weil 
17337732fe16SJeff Layton 	if (psession)
1734ed9b430cSYan, Zheng 		*psession = session;
17357732fe16SJeff Layton 	else
1736ed9b430cSYan, Zheng 		ceph_put_mds_session(session);
1737a8599bd8SSage Weil 	/* we flushed them all; remove this inode from the queue */
1738a8599bd8SSage Weil 	spin_lock(&mdsc->snap_flush_lock);
1739409e873eSXiubo Li 	if (!list_empty(&ci->i_snap_flush_item))
1740409e873eSXiubo Li 		need_put = true;
1741a8599bd8SSage Weil 	list_del_init(&ci->i_snap_flush_item);
1742a8599bd8SSage Weil 	spin_unlock(&mdsc->snap_flush_lock);
1743409e873eSXiubo Li 
1744409e873eSXiubo Li 	if (need_put)
1745409e873eSXiubo Li 		iput(inode);
1746a8599bd8SSage Weil }
1747a8599bd8SSage Weil 
1748a8599bd8SSage Weil /*
1749fca65b4aSSage Weil  * Mark caps dirty.  If inode is newly dirty, return the dirty flags.
1750fca65b4aSSage Weil  * Caller is then responsible for calling __mark_inode_dirty with the
1751fca65b4aSSage Weil  * returned flags value.
175276e3b390SSage Weil  */
__ceph_mark_dirty_caps(struct ceph_inode_info * ci,int mask,struct ceph_cap_flush ** pcf)1753f66fd9f0SYan, Zheng int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
1754f66fd9f0SYan, Zheng 			   struct ceph_cap_flush **pcf)
175576e3b390SSage Weil {
1756640ef79dSCheng Renquan 	struct ceph_mds_client *mdsc =
1757985b9ee8SXiubo Li 		ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
1758874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
175976e3b390SSage Weil 	int was = ci->i_dirty_caps;
176076e3b390SSage Weil 	int dirty = 0;
176176e3b390SSage Weil 
1762c7e4f85cSJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
1763c7e4f85cSJeff Layton 
1764571ade33SYan, Zheng 	if (!ci->i_auth_cap) {
1765571ade33SYan, Zheng 		pr_warn("__mark_dirty_caps %p %llx mask %s, "
1766571ade33SYan, Zheng 			"but no auth cap (session was closed?)\n",
1767571ade33SYan, Zheng 			inode, ceph_ino(inode), ceph_cap_string(mask));
1768571ade33SYan, Zheng 		return 0;
1769571ade33SYan, Zheng 	}
1770571ade33SYan, Zheng 
1771874c8ca1SDavid Howells 	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
177276e3b390SSage Weil 	     ceph_cap_string(mask), ceph_cap_string(was),
177376e3b390SSage Weil 	     ceph_cap_string(was | mask));
177476e3b390SSage Weil 	ci->i_dirty_caps |= mask;
177576e3b390SSage Weil 	if (was == 0) {
17761cf03a68SJeff Layton 		struct ceph_mds_session *session = ci->i_auth_cap->session;
17771cf03a68SJeff Layton 
1778f66fd9f0SYan, Zheng 		WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1779f66fd9f0SYan, Zheng 		swap(ci->i_prealloc_cap_flush, *pcf);
1780f66fd9f0SYan, Zheng 
1781604d1b02SYan, Zheng 		if (!ci->i_head_snapc) {
1782604d1b02SYan, Zheng 			WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
17837d8cb26dSSage Weil 			ci->i_head_snapc = ceph_get_snap_context(
17847d8cb26dSSage Weil 				ci->i_snap_realm->cached_context);
1785604d1b02SYan, Zheng 		}
17860685235fSYan, Zheng 		dout(" inode %p now dirty snapc %p auth cap %p\n",
1787874c8ca1SDavid Howells 		     &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
178876e3b390SSage Weil 		BUG_ON(!list_empty(&ci->i_dirty_item));
178976e3b390SSage Weil 		spin_lock(&mdsc->cap_dirty_lock);
17901cf03a68SJeff Layton 		list_add(&ci->i_dirty_item, &session->s_cap_dirty);
179176e3b390SSage Weil 		spin_unlock(&mdsc->cap_dirty_lock);
179276e3b390SSage Weil 		if (ci->i_flushing_caps == 0) {
17933772d26dSSage Weil 			ihold(inode);
179476e3b390SSage Weil 			dirty |= I_DIRTY_SYNC;
179576e3b390SSage Weil 		}
1796f66fd9f0SYan, Zheng 	} else {
1797f66fd9f0SYan, Zheng 		WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
179876e3b390SSage Weil 	}
179976e3b390SSage Weil 	BUG_ON(list_empty(&ci->i_dirty_item));
180076e3b390SSage Weil 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
180176e3b390SSage Weil 	    (mask & CEPH_CAP_FILE_BUFFER))
180276e3b390SSage Weil 		dirty |= I_DIRTY_DATASYNC;
1803a0d93e32SYan, Zheng 	__cap_delay_requeue(mdsc, ci);
1804fca65b4aSSage Weil 	return dirty;
180576e3b390SSage Weil }
180676e3b390SSage Weil 
ceph_alloc_cap_flush(void)1807f66fd9f0SYan, Zheng struct ceph_cap_flush *ceph_alloc_cap_flush(void)
1808f66fd9f0SYan, Zheng {
1809b2f9fa1fSXiubo Li 	struct ceph_cap_flush *cf;
1810b2f9fa1fSXiubo Li 
1811b2f9fa1fSXiubo Li 	cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
181205a444d3SColin Ian King 	if (!cf)
181305a444d3SColin Ian King 		return NULL;
181405a444d3SColin Ian King 
1815b2f9fa1fSXiubo Li 	cf->is_capsnap = false;
1816b2f9fa1fSXiubo Li 	return cf;
1817f66fd9f0SYan, Zheng }
1818f66fd9f0SYan, Zheng 
ceph_free_cap_flush(struct ceph_cap_flush * cf)1819f66fd9f0SYan, Zheng void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1820f66fd9f0SYan, Zheng {
1821f66fd9f0SYan, Zheng 	if (cf)
1822f66fd9f0SYan, Zheng 		kmem_cache_free(ceph_cap_flush_cachep, cf);
1823f66fd9f0SYan, Zheng }
1824f66fd9f0SYan, Zheng 
__get_oldest_flush_tid(struct ceph_mds_client * mdsc)1825a2971c8cSYan, Zheng static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1826a2971c8cSYan, Zheng {
1827e4500b5eSYan, Zheng 	if (!list_empty(&mdsc->cap_flush_list)) {
1828a2971c8cSYan, Zheng 		struct ceph_cap_flush *cf =
1829e4500b5eSYan, Zheng 			list_first_entry(&mdsc->cap_flush_list,
1830e4500b5eSYan, Zheng 					 struct ceph_cap_flush, g_list);
1831a2971c8cSYan, Zheng 		return cf->tid;
1832a2971c8cSYan, Zheng 	}
1833a2971c8cSYan, Zheng 	return 0;
1834a2971c8cSYan, Zheng }
1835a2971c8cSYan, Zheng 
183676e3b390SSage Weil /*
1837c8799fc4SYan, Zheng  * Remove cap_flush from the mdsc's or inode's flushing cap list.
1838c8799fc4SYan, Zheng  * Return true if caller needs to wake up flush waiters.
1839c8799fc4SYan, Zheng  */
__detach_cap_flush_from_mdsc(struct ceph_mds_client * mdsc,struct ceph_cap_flush * cf)1840681ac634SJeff Layton static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1841c8799fc4SYan, Zheng 					 struct ceph_cap_flush *cf)
1842c8799fc4SYan, Zheng {
1843c8799fc4SYan, Zheng 	struct ceph_cap_flush *prev;
1844c8799fc4SYan, Zheng 	bool wake = cf->wake;
1845681ac634SJeff Layton 
1846c8799fc4SYan, Zheng 	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1847c8799fc4SYan, Zheng 		prev = list_prev_entry(cf, g_list);
1848c8799fc4SYan, Zheng 		prev->wake = true;
1849c8799fc4SYan, Zheng 		wake = false;
1850c8799fc4SYan, Zheng 	}
1851b2f9fa1fSXiubo Li 	list_del_init(&cf->g_list);
1852681ac634SJeff Layton 	return wake;
1853681ac634SJeff Layton }
1854681ac634SJeff Layton 
__detach_cap_flush_from_ci(struct ceph_inode_info * ci,struct ceph_cap_flush * cf)1855681ac634SJeff Layton static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1856681ac634SJeff Layton 				       struct ceph_cap_flush *cf)
1857681ac634SJeff Layton {
1858681ac634SJeff Layton 	struct ceph_cap_flush *prev;
1859681ac634SJeff Layton 	bool wake = cf->wake;
1860681ac634SJeff Layton 
1861c8799fc4SYan, Zheng 	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1862c8799fc4SYan, Zheng 		prev = list_prev_entry(cf, i_list);
1863c8799fc4SYan, Zheng 		prev->wake = true;
1864c8799fc4SYan, Zheng 		wake = false;
1865c8799fc4SYan, Zheng 	}
1866b2f9fa1fSXiubo Li 	list_del_init(&cf->i_list);
1867c8799fc4SYan, Zheng 	return wake;
1868c8799fc4SYan, Zheng }
1869c8799fc4SYan, Zheng 
1870c8799fc4SYan, Zheng /*
1871a8599bd8SSage Weil  * Add dirty inode to the flushing list.  Assigned a seq number so we
1872a8599bd8SSage Weil  * can wait for caps to flush without starving.
1873cdc35f96SSage Weil  *
18749f3345d8SJeff Layton  * Called under i_ceph_lock. Returns the flush tid.
1875a8599bd8SSage Weil  */
__mark_caps_flushing(struct inode * inode,struct ceph_mds_session * session,bool wake,u64 * oldest_flush_tid)18769f3345d8SJeff Layton static u64 __mark_caps_flushing(struct inode *inode,
1877c8799fc4SYan, Zheng 				struct ceph_mds_session *session, bool wake,
18789f3345d8SJeff Layton 				u64 *oldest_flush_tid)
1879a8599bd8SSage Weil {
1880985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
1881a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
1882f66fd9f0SYan, Zheng 	struct ceph_cap_flush *cf = NULL;
1883cdc35f96SSage Weil 	int flushing;
1884a8599bd8SSage Weil 
1885c7e4f85cSJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
1886cdc35f96SSage Weil 	BUG_ON(ci->i_dirty_caps == 0);
1887a8599bd8SSage Weil 	BUG_ON(list_empty(&ci->i_dirty_item));
1888f66fd9f0SYan, Zheng 	BUG_ON(!ci->i_prealloc_cap_flush);
1889cdc35f96SSage Weil 
1890cdc35f96SSage Weil 	flushing = ci->i_dirty_caps;
1891cdc35f96SSage Weil 	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1892cdc35f96SSage Weil 	     ceph_cap_string(flushing),
1893cdc35f96SSage Weil 	     ceph_cap_string(ci->i_flushing_caps),
1894cdc35f96SSage Weil 	     ceph_cap_string(ci->i_flushing_caps | flushing));
1895cdc35f96SSage Weil 	ci->i_flushing_caps |= flushing;
1896cdc35f96SSage Weil 	ci->i_dirty_caps = 0;
1897afcdaea3SSage Weil 	dout(" inode %p now !dirty\n", inode);
1898cdc35f96SSage Weil 
1899f66fd9f0SYan, Zheng 	swap(cf, ci->i_prealloc_cap_flush);
1900553adfd9SYan, Zheng 	cf->caps = flushing;
1901c8799fc4SYan, Zheng 	cf->wake = wake;
1902553adfd9SYan, Zheng 
1903a8599bd8SSage Weil 	spin_lock(&mdsc->cap_dirty_lock);
1904cdc35f96SSage Weil 	list_del_init(&ci->i_dirty_item);
1905afcdaea3SSage Weil 
1906553adfd9SYan, Zheng 	cf->tid = ++mdsc->last_cap_flush_tid;
1907e4500b5eSYan, Zheng 	list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
1908a2971c8cSYan, Zheng 	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1909553adfd9SYan, Zheng 
1910afcdaea3SSage Weil 	if (list_empty(&ci->i_flushing_item)) {
1911a8599bd8SSage Weil 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1912a8599bd8SSage Weil 		mdsc->num_cap_flushing++;
1913a8599bd8SSage Weil 	}
1914a8599bd8SSage Weil 	spin_unlock(&mdsc->cap_dirty_lock);
1915cdc35f96SSage Weil 
1916e4500b5eSYan, Zheng 	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
1917553adfd9SYan, Zheng 
19189f3345d8SJeff Layton 	return cf->tid;
1919a8599bd8SSage Weil }
1920a8599bd8SSage Weil 
1921a8599bd8SSage Weil /*
19225ecad6fdSSage Weil  * try to invalidate mapping pages without blocking.
19235ecad6fdSSage Weil  */
try_nonblocking_invalidate(struct inode * inode)19245ecad6fdSSage Weil static int try_nonblocking_invalidate(struct inode *inode)
19253eaf5aa1SJeff Layton 	__releases(ci->i_ceph_lock)
19263eaf5aa1SJeff Layton 	__acquires(ci->i_ceph_lock)
19275ecad6fdSSage Weil {
19285ecad6fdSSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
19295ecad6fdSSage Weil 	u32 invalidating_gen = ci->i_rdcache_gen;
19305ecad6fdSSage Weil 
1931be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
1932400e1286SJeff Layton 	ceph_fscache_invalidate(inode, false);
19335ecad6fdSSage Weil 	invalidate_mapping_pages(&inode->i_data, 0, -1);
1934be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
19355ecad6fdSSage Weil 
193618a38193SSage Weil 	if (inode->i_data.nrpages == 0 &&
19375ecad6fdSSage Weil 	    invalidating_gen == ci->i_rdcache_gen) {
19385ecad6fdSSage Weil 		/* success. */
19395ecad6fdSSage Weil 		dout("try_nonblocking_invalidate %p success\n", inode);
1940cd045cb4SSage Weil 		/* save any racing async invalidate some trouble */
1941cd045cb4SSage Weil 		ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
19425ecad6fdSSage Weil 		return 0;
19435ecad6fdSSage Weil 	}
19445ecad6fdSSage Weil 	dout("try_nonblocking_invalidate %p failed\n", inode);
19455ecad6fdSSage Weil 	return -1;
19465ecad6fdSSage Weil }
19475ecad6fdSSage Weil 
__ceph_should_report_size(struct ceph_inode_info * ci)1948efb0ca76SYan, Zheng bool __ceph_should_report_size(struct ceph_inode_info *ci)
1949efb0ca76SYan, Zheng {
1950874c8ca1SDavid Howells 	loff_t size = i_size_read(&ci->netfs.inode);
1951efb0ca76SYan, Zheng 	/* mds will adjust max size according to the reported size */
1952efb0ca76SYan, Zheng 	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
1953efb0ca76SYan, Zheng 		return false;
1954efb0ca76SYan, Zheng 	if (size >= ci->i_max_size)
1955efb0ca76SYan, Zheng 		return true;
1956efb0ca76SYan, Zheng 	/* half of previous max_size increment has been used */
1957efb0ca76SYan, Zheng 	if (ci->i_max_size > ci->i_reported_size &&
1958efb0ca76SYan, Zheng 	    (size << 1) >= ci->i_max_size + ci->i_reported_size)
1959efb0ca76SYan, Zheng 		return true;
1960efb0ca76SYan, Zheng 	return false;
1961efb0ca76SYan, Zheng }
1962efb0ca76SYan, Zheng 
19635ecad6fdSSage Weil /*
1964a8599bd8SSage Weil  * Swiss army knife function to examine currently used and wanted
1965a8599bd8SSage Weil  * versus held caps.  Release, flush, ack revoked caps to mds as
1966a8599bd8SSage Weil  * appropriate.
1967a8599bd8SSage Weil  *
1968a8599bd8SSage Weil  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
1969a8599bd8SSage Weil  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1970a8599bd8SSage Weil  *    further delay.
1971a8599bd8SSage Weil  */
ceph_check_caps(struct ceph_inode_info * ci,int flags)1972e4b731ccSXiubo Li void ceph_check_caps(struct ceph_inode_info *ci, int flags)
1973a8599bd8SSage Weil {
1974874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
19752678da88SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
1976a8599bd8SSage Weil 	struct ceph_cap *cap;
1977a2971c8cSYan, Zheng 	u64 flush_tid, oldest_flush_tid;
1978395c312bSYan, Zheng 	int file_wanted, used, cap_used;
1979cbd03635SSage Weil 	int issued, implemented, want, retain, revoking, flushing = 0;
1980a8599bd8SSage Weil 	int mds = -1;   /* keep track of how far we've gone through i_caps list
1981a8599bd8SSage Weil 			   to avoid an infinite loop on retry */
1982a8599bd8SSage Weil 	struct rb_node *p;
19833609404fSYan, Zheng 	bool queue_invalidate = false;
19843609404fSYan, Zheng 	bool tried_invalidate = false;
1985a7437954SXiubo Li 	bool queue_writeback = false;
1986e4b731ccSXiubo Li 	struct ceph_mds_session *session = NULL;
19876a92b08fSJeff Layton 
1988be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
1989fbed7045SJeff Layton 	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
199068c62beeSXiubo Li 		ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
199168c62beeSXiubo Li 
1992fbed7045SJeff Layton 		/* Don't send messages until we get async create reply */
1993fbed7045SJeff Layton 		spin_unlock(&ci->i_ceph_lock);
1994fbed7045SJeff Layton 		return;
1995fbed7045SJeff Layton 	}
1996fbed7045SJeff Layton 
1997a8599bd8SSage Weil 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
1998a8599bd8SSage Weil 		flags |= CHECK_CAPS_FLUSH;
1999a8599bd8SSage Weil retry:
2000c74d79afSJeff Layton 	/* Caps wanted by virtue of active open files. */
2001a8599bd8SSage Weil 	file_wanted = __ceph_caps_file_wanted(ci);
2002c74d79afSJeff Layton 
2003c74d79afSJeff Layton 	/* Caps which have active references against them */
2004a8599bd8SSage Weil 	used = __ceph_caps_used(ci);
2005c74d79afSJeff Layton 
2006c74d79afSJeff Layton 	/*
2007c74d79afSJeff Layton 	 * "issued" represents the current caps that the MDS wants us to have.
2008c74d79afSJeff Layton 	 * "implemented" is the set that we have been granted, and includes the
2009c74d79afSJeff Layton 	 * ones that have not yet been returned to the MDS (the "revoking" set,
2010c74d79afSJeff Layton 	 * usually because they have outstanding references).
2011c74d79afSJeff Layton 	 */
2012cbd03635SSage Weil 	issued = __ceph_caps_issued(ci, &implemented);
2013cbd03635SSage Weil 	revoking = implemented & ~issued;
2014a8599bd8SSage Weil 
201541445999SYan, Zheng 	want = file_wanted;
2016c74d79afSJeff Layton 
2017c74d79afSJeff Layton 	/* The ones we currently want to retain (may be adjusted below) */
201841445999SYan, Zheng 	retain = file_wanted | used | CEPH_CAP_PIN;
2019a8599bd8SSage Weil 	if (!mdsc->stopping && inode->i_nlink > 0) {
202041445999SYan, Zheng 		if (file_wanted) {
2021a8599bd8SSage Weil 			retain |= CEPH_CAP_ANY;       /* be greedy */
202232ec4397SYan, Zheng 		} else if (S_ISDIR(inode->i_mode) &&
202332ec4397SYan, Zheng 			   (issued & CEPH_CAP_FILE_SHARED) &&
202432ec4397SYan, Zheng 			   __ceph_dir_is_complete(ci)) {
202532ec4397SYan, Zheng 			/*
202632ec4397SYan, Zheng 			 * If a directory is complete, we want to keep
202732ec4397SYan, Zheng 			 * the exclusive cap. So that MDS does not end up
202832ec4397SYan, Zheng 			 * revoking the shared cap on every create/unlink
202932ec4397SYan, Zheng 			 * operation.
203032ec4397SYan, Zheng 			 */
2031a25949b9SJeff Layton 			if (IS_RDONLY(inode)) {
20328a2ac3a8SYan, Zheng 				want = CEPH_CAP_ANY_SHARED;
2033a25949b9SJeff Layton 			} else {
2034719a2514SYan, Zheng 				want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
2035a25949b9SJeff Layton 			}
203632ec4397SYan, Zheng 			retain |= want;
2037a8599bd8SSage Weil 		} else {
203832ec4397SYan, Zheng 
2039a8599bd8SSage Weil 			retain |= CEPH_CAP_ANY_SHARED;
2040a8599bd8SSage Weil 			/*
2041a8599bd8SSage Weil 			 * keep RD only if we didn't have the file open RW,
2042a8599bd8SSage Weil 			 * because then the mds would revoke it anyway to
2043a8599bd8SSage Weil 			 * journal max_size=0.
2044a8599bd8SSage Weil 			 */
2045a8599bd8SSage Weil 			if (ci->i_max_size == 0)
2046a8599bd8SSage Weil 				retain |= CEPH_CAP_ANY_RD;
2047a8599bd8SSage Weil 		}
2048a8599bd8SSage Weil 	}
2049a8599bd8SSage Weil 
20506407fbb9SJeff Layton 	dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
2051e027ddb6SXiubo Li 	     " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
2052a8599bd8SSage Weil 	     ceph_cap_string(file_wanted),
2053a8599bd8SSage Weil 	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
2054a8599bd8SSage Weil 	     ceph_cap_string(ci->i_flushing_caps),
2055cbd03635SSage Weil 	     ceph_cap_string(issued), ceph_cap_string(revoking),
2056a8599bd8SSage Weil 	     ceph_cap_string(retain),
2057a8599bd8SSage Weil 	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
2058e027ddb6SXiubo Li 	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
2059e027ddb6SXiubo Li 	     (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
2060a8599bd8SSage Weil 
2061a8599bd8SSage Weil 	/*
2062a8599bd8SSage Weil 	 * If we no longer need to hold onto old our caps, and we may
2063a8599bd8SSage Weil 	 * have cached pages, but don't want them, then try to invalidate.
2064a8599bd8SSage Weil 	 * If we fail, it's because pages are locked.... try again later.
2065a8599bd8SSage Weil 	 */
2066a0d93e32SYan, Zheng 	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
2067525d15e8SYan, Zheng 	    S_ISREG(inode->i_mode) &&
20689abd4db7SYan, Zheng 	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
206993afd449SSage Weil 	    inode->i_data.nrpages &&		/* have cached pages */
20702962507cSSage Weil 	    (revoking & (CEPH_CAP_FILE_CACHE|
20715e804ac4SYan, Zheng 			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
2072a8599bd8SSage Weil 	    !tried_invalidate) {
20736407fbb9SJeff Layton 		dout("check_caps trying to invalidate on %llx.%llx\n",
20746407fbb9SJeff Layton 		     ceph_vinop(inode));
20755ecad6fdSSage Weil 		if (try_nonblocking_invalidate(inode) < 0) {
2076cbd03635SSage Weil 			dout("check_caps queuing invalidate\n");
20773609404fSYan, Zheng 			queue_invalidate = true;
2078cbd03635SSage Weil 			ci->i_rdcache_revoking = ci->i_rdcache_gen;
20795ecad6fdSSage Weil 		}
20803609404fSYan, Zheng 		tried_invalidate = true;
20816a92b08fSJeff Layton 		goto retry;
2082a8599bd8SSage Weil 	}
2083a8599bd8SSage Weil 
2084a8599bd8SSage Weil 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2085d67c72e6SJeff Layton 		int mflags = 0;
20860a454bddSJeff Layton 		struct cap_msg_args arg;
20870a454bddSJeff Layton 
2088a8599bd8SSage Weil 		cap = rb_entry(p, struct ceph_cap, ci_node);
2089a8599bd8SSage Weil 
2090a8599bd8SSage Weil 		/* avoid looping forever */
2091a8599bd8SSage Weil 		if (mds >= cap->mds ||
2092a8599bd8SSage Weil 		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
2093a8599bd8SSage Weil 			continue;
2094a8599bd8SSage Weil 
2095c74d79afSJeff Layton 		/*
2096c74d79afSJeff Layton 		 * If we have an auth cap, we don't need to consider any
2097c74d79afSJeff Layton 		 * overlapping caps as used.
2098c74d79afSJeff Layton 		 */
2099395c312bSYan, Zheng 		cap_used = used;
2100395c312bSYan, Zheng 		if (ci->i_auth_cap && cap != ci->i_auth_cap)
2101395c312bSYan, Zheng 			cap_used &= ~ci->i_auth_cap->issued;
2102395c312bSYan, Zheng 
2103a8599bd8SSage Weil 		revoking = cap->implemented & ~cap->issued;
2104395c312bSYan, Zheng 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
21059abd4db7SYan, Zheng 		     cap->mds, cap, ceph_cap_string(cap_used),
21069abd4db7SYan, Zheng 		     ceph_cap_string(cap->issued),
2107088b3f5eSSage Weil 		     ceph_cap_string(cap->implemented),
2108a8599bd8SSage Weil 		     ceph_cap_string(revoking));
2109a8599bd8SSage Weil 
2110a8599bd8SSage Weil 		if (cap == ci->i_auth_cap &&
2111a8599bd8SSage Weil 		    (cap->issued & CEPH_CAP_FILE_WR)) {
2112a8599bd8SSage Weil 			/* request larger max_size from MDS? */
2113a8599bd8SSage Weil 			if (ci->i_wanted_max_size > ci->i_max_size &&
2114a8599bd8SSage Weil 			    ci->i_wanted_max_size > ci->i_requested_max_size) {
2115a8599bd8SSage Weil 				dout("requesting new max_size\n");
2116a8599bd8SSage Weil 				goto ack;
2117a8599bd8SSage Weil 			}
2118a8599bd8SSage Weil 
2119a8599bd8SSage Weil 			/* approaching file_max? */
2120efb0ca76SYan, Zheng 			if (__ceph_should_report_size(ci)) {
2121a8599bd8SSage Weil 				dout("i_size approaching max_size\n");
2122a8599bd8SSage Weil 				goto ack;
2123a8599bd8SSage Weil 			}
2124a8599bd8SSage Weil 		}
2125a8599bd8SSage Weil 		/* flush anything dirty? */
21267bc00fddSYan, Zheng 		if (cap == ci->i_auth_cap) {
21277bc00fddSYan, Zheng 			if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
2128a8599bd8SSage Weil 				dout("flushing dirty caps\n");
2129a8599bd8SSage Weil 				goto ack;
2130a8599bd8SSage Weil 			}
21317bc00fddSYan, Zheng 			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
21327bc00fddSYan, Zheng 				dout("flushing snap caps\n");
21337bc00fddSYan, Zheng 				goto ack;
21347bc00fddSYan, Zheng 			}
21357bc00fddSYan, Zheng 		}
2136a8599bd8SSage Weil 
2137a8599bd8SSage Weil 		/* completed revocation? going down and there are no caps? */
2138a7437954SXiubo Li 		if (revoking) {
2139a7437954SXiubo Li 			if ((revoking & cap_used) == 0) {
2140a8599bd8SSage Weil 				dout("completed revocation of %s\n",
2141a8599bd8SSage Weil 				      ceph_cap_string(cap->implemented & ~cap->issued));
2142a8599bd8SSage Weil 				goto ack;
2143a8599bd8SSage Weil 			}
2144a8599bd8SSage Weil 
2145a7437954SXiubo Li 			/*
2146a7437954SXiubo Li 			 * If the "i_wrbuffer_ref" was increased by mmap or generic
2147a7437954SXiubo Li 			 * cache write just before the ceph_check_caps() is called,
2148a7437954SXiubo Li 			 * the Fb capability revoking will fail this time. Then we
2149a7437954SXiubo Li 			 * must wait for the BDI's delayed work to flush the dirty
2150a7437954SXiubo Li 			 * pages and to release the "i_wrbuffer_ref", which will cost
2151a7437954SXiubo Li 			 * at most 5 seconds. That means the MDS needs to wait at
2152a7437954SXiubo Li 			 * most 5 seconds to finished the Fb capability's revocation.
2153a7437954SXiubo Li 			 *
2154a7437954SXiubo Li 			 * Let's queue a writeback for it.
2155a7437954SXiubo Li 			 */
2156a7437954SXiubo Li 			if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
2157a7437954SXiubo Li 			    (revoking & CEPH_CAP_FILE_BUFFER))
2158a7437954SXiubo Li 				queue_writeback = true;
2159a7437954SXiubo Li 		}
2160a7437954SXiubo Li 
2161a8599bd8SSage Weil 		/* want more caps from mds? */
21620aa971b6SYan, Zheng 		if (want & ~cap->mds_wanted) {
2163a8599bd8SSage Weil 			if (want & ~(cap->mds_wanted | cap->issued))
2164a8599bd8SSage Weil 				goto ack;
21650aa971b6SYan, Zheng 			if (!__cap_is_valid(cap))
21660aa971b6SYan, Zheng 				goto ack;
21670aa971b6SYan, Zheng 		}
2168a8599bd8SSage Weil 
2169a8599bd8SSage Weil 		/* things we might delay */
2170fdac94faSYan, Zheng 		if ((cap->issued & ~retain) == 0)
2171a8599bd8SSage Weil 			continue;     /* nope, all good */
2172a8599bd8SSage Weil 
2173a8599bd8SSage Weil ack:
2174dc3da046SJeff Layton 		ceph_put_mds_session(session);
21756a92b08fSJeff Layton 		session = ceph_get_mds_session(cap->session);
21767bc00fddSYan, Zheng 
21777bc00fddSYan, Zheng 		/* kick flushing and flush snaps before sending normal
21787bc00fddSYan, Zheng 		 * cap message */
21797bc00fddSYan, Zheng 		if (cap == ci->i_auth_cap &&
21807bc00fddSYan, Zheng 		    (ci->i_ceph_flags &
21817bc00fddSYan, Zheng 		     (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
2182054f8d41SYan, Zheng 			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
218324d063acSYan, Zheng 				__kick_flushing_caps(mdsc, session, ci, 0);
2184ed9b430cSYan, Zheng 			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2185ed9b430cSYan, Zheng 				__ceph_flush_snaps(ci, session);
2186ed9b430cSYan, Zheng 
2187a8599bd8SSage Weil 			goto retry;
2188a8599bd8SSage Weil 		}
2189a8599bd8SSage Weil 
2190553adfd9SYan, Zheng 		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
21919f3345d8SJeff Layton 			flushing = ci->i_dirty_caps;
21929f3345d8SJeff Layton 			flush_tid = __mark_caps_flushing(inode, session, false,
2193a2971c8cSYan, Zheng 							 &oldest_flush_tid);
2194d67c72e6SJeff Layton 			if (flags & CHECK_CAPS_FLUSH &&
2195d67c72e6SJeff Layton 			    list_empty(&session->s_cap_dirty))
2196d67c72e6SJeff Layton 				mflags |= CEPH_CLIENT_CAPS_SYNC;
2197553adfd9SYan, Zheng 		} else {
219824be0c48SSage Weil 			flushing = 0;
2199553adfd9SYan, Zheng 			flush_tid = 0;
2200a2971c8cSYan, Zheng 			spin_lock(&mdsc->cap_dirty_lock);
2201a2971c8cSYan, Zheng 			oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2202a2971c8cSYan, Zheng 			spin_unlock(&mdsc->cap_dirty_lock);
2203553adfd9SYan, Zheng 		}
2204a8599bd8SSage Weil 
2205a8599bd8SSage Weil 		mds = cap->mds;  /* remember mds, so we don't repeat */
2206a8599bd8SSage Weil 
2207d67c72e6SJeff Layton 		__prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
2208d67c72e6SJeff Layton 			   want, retain, flushing, flush_tid, oldest_flush_tid);
22090a454bddSJeff Layton 
22106a92b08fSJeff Layton 		spin_unlock(&ci->i_ceph_lock);
221152311980SJeff Layton 		__send_cap(&arg, ci);
22126a92b08fSJeff Layton 		spin_lock(&ci->i_ceph_lock);
22130a454bddSJeff Layton 
2214be655596SSage Weil 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
2215a8599bd8SSage Weil 	}
2216a8599bd8SSage Weil 
2217a0d93e32SYan, Zheng 	/* periodically re-calculate caps wanted by open files */
2218a0d93e32SYan, Zheng 	if (__ceph_is_any_real_caps(ci) &&
2219a0d93e32SYan, Zheng 	    list_empty(&ci->i_cap_delay_list) &&
2220719a2514SYan, Zheng 	    (file_wanted & ~CEPH_CAP_PIN) &&
2221719a2514SYan, Zheng 	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
2222a0d93e32SYan, Zheng 		__cap_delay_requeue(mdsc, ci);
2223719a2514SYan, Zheng 	}
2224a8599bd8SSage Weil 
2225be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
2226a8599bd8SSage Weil 
22276a92b08fSJeff Layton 	ceph_put_mds_session(session);
2228a7437954SXiubo Li 	if (queue_writeback)
2229a7437954SXiubo Li 		ceph_queue_writeback(inode);
2230cbd03635SSage Weil 	if (queue_invalidate)
22313c6f6b79SSage Weil 		ceph_queue_invalidate(inode);
2232a8599bd8SSage Weil }
2233a8599bd8SSage Weil 
2234a8599bd8SSage Weil /*
2235a8599bd8SSage Weil  * Try to flush dirty caps back to the auth mds.
2236a8599bd8SSage Weil  */
try_flush_caps(struct inode * inode,u64 * ptid)2237553adfd9SYan, Zheng static int try_flush_caps(struct inode *inode, u64 *ptid)
2238a8599bd8SSage Weil {
2239985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
2240a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
224189b52fe1SYan, Zheng 	int flushing = 0;
2242a2971c8cSYan, Zheng 	u64 flush_tid = 0, oldest_flush_tid = 0;
2243a8599bd8SSage Weil 
2244be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
2245d6cee9dbSYan, Zheng retry_locked:
2246a8599bd8SSage Weil 	if (ci->i_dirty_caps && ci->i_auth_cap) {
2247a8599bd8SSage Weil 		struct ceph_cap *cap = ci->i_auth_cap;
22480a454bddSJeff Layton 		struct cap_msg_args arg;
22490449a352SJeff Layton 		struct ceph_mds_session *session = cap->session;
2250a8599bd8SSage Weil 
22510449a352SJeff Layton 		if (session->s_state < CEPH_MDS_SESSION_OPEN) {
22526c2838fbSJeff Layton 			spin_unlock(&ci->i_ceph_lock);
2253a8599bd8SSage Weil 			goto out;
22546c2838fbSJeff Layton 		}
2255a8599bd8SSage Weil 
2256d6cee9dbSYan, Zheng 		if (ci->i_ceph_flags &
2257d6cee9dbSYan, Zheng 		    (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2258d6cee9dbSYan, Zheng 			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2259d6cee9dbSYan, Zheng 				__kick_flushing_caps(mdsc, session, ci, 0);
2260d6cee9dbSYan, Zheng 			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2261d6cee9dbSYan, Zheng 				__ceph_flush_snaps(ci, session);
2262d6cee9dbSYan, Zheng 			goto retry_locked;
2263d6cee9dbSYan, Zheng 		}
2264d6cee9dbSYan, Zheng 
22659f3345d8SJeff Layton 		flushing = ci->i_dirty_caps;
22669f3345d8SJeff Layton 		flush_tid = __mark_caps_flushing(inode, session, true,
22679f3345d8SJeff Layton 						 &oldest_flush_tid);
2268a8599bd8SSage Weil 
22690a454bddSJeff Layton 		__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2270a0d93e32SYan, Zheng 			   __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2271d6cee9dbSYan, Zheng 			   (cap->issued | cap->implemented),
2272a2971c8cSYan, Zheng 			   flushing, flush_tid, oldest_flush_tid);
22730a454bddSJeff Layton 		spin_unlock(&ci->i_ceph_lock);
22740a454bddSJeff Layton 
227552311980SJeff Layton 		__send_cap(&arg, ci);
2276553adfd9SYan, Zheng 	} else {
2277e4500b5eSYan, Zheng 		if (!list_empty(&ci->i_cap_flush_list)) {
2278553adfd9SYan, Zheng 			struct ceph_cap_flush *cf =
2279e4500b5eSYan, Zheng 				list_last_entry(&ci->i_cap_flush_list,
2280e4500b5eSYan, Zheng 						struct ceph_cap_flush, i_list);
2281c8799fc4SYan, Zheng 			cf->wake = true;
2282553adfd9SYan, Zheng 			flush_tid = cf->tid;
2283553adfd9SYan, Zheng 		}
2284553adfd9SYan, Zheng 		flushing = ci->i_flushing_caps;
2285553adfd9SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
2286553adfd9SYan, Zheng 	}
2287553adfd9SYan, Zheng out:
2288553adfd9SYan, Zheng 	*ptid = flush_tid;
2289a8599bd8SSage Weil 	return flushing;
2290a8599bd8SSage Weil }
2291a8599bd8SSage Weil 
2292a8599bd8SSage Weil /*
2293a8599bd8SSage Weil  * Return true if we've flushed caps through the given flush_tid.
2294a8599bd8SSage Weil  */
caps_are_flushed(struct inode * inode,u64 flush_tid)2295553adfd9SYan, Zheng static int caps_are_flushed(struct inode *inode, u64 flush_tid)
2296a8599bd8SSage Weil {
2297a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
2298553adfd9SYan, Zheng 	int ret = 1;
2299a8599bd8SSage Weil 
2300be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
2301e4500b5eSYan, Zheng 	if (!list_empty(&ci->i_cap_flush_list)) {
2302e4500b5eSYan, Zheng 		struct ceph_cap_flush * cf =
2303e4500b5eSYan, Zheng 			list_first_entry(&ci->i_cap_flush_list,
2304e4500b5eSYan, Zheng 					 struct ceph_cap_flush, i_list);
2305553adfd9SYan, Zheng 		if (cf->tid <= flush_tid)
2306a8599bd8SSage Weil 			ret = 0;
230789b52fe1SYan, Zheng 	}
2308be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
2309a8599bd8SSage Weil 	return ret;
2310a8599bd8SSage Weil }
2311a8599bd8SSage Weil 
2312a8599bd8SSage Weil /*
2313ae067063SXiubo Li  * flush the mdlog and wait for any unsafe requests to complete.
2314da819c81SYan, Zheng  */
flush_mdlog_and_wait_inode_unsafe_requests(struct inode * inode)2315ae067063SXiubo Li static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
2316da819c81SYan, Zheng {
2317985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
2318da819c81SYan, Zheng 	struct ceph_inode_info *ci = ceph_inode(inode);
231968cd5b4bSYan, Zheng 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
232068cd5b4bSYan, Zheng 	int ret, err = 0;
2321da819c81SYan, Zheng 
2322da819c81SYan, Zheng 	spin_lock(&ci->i_unsafe_lock);
232368cd5b4bSYan, Zheng 	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
232468cd5b4bSYan, Zheng 		req1 = list_last_entry(&ci->i_unsafe_dirops,
232568cd5b4bSYan, Zheng 					struct ceph_mds_request,
2326da819c81SYan, Zheng 					r_unsafe_dir_item);
232768cd5b4bSYan, Zheng 		ceph_mdsc_get_request(req1);
232868cd5b4bSYan, Zheng 	}
232968cd5b4bSYan, Zheng 	if (!list_empty(&ci->i_unsafe_iops)) {
233068cd5b4bSYan, Zheng 		req2 = list_last_entry(&ci->i_unsafe_iops,
233168cd5b4bSYan, Zheng 					struct ceph_mds_request,
233268cd5b4bSYan, Zheng 					r_unsafe_target_item);
233368cd5b4bSYan, Zheng 		ceph_mdsc_get_request(req2);
233468cd5b4bSYan, Zheng 	}
2335da819c81SYan, Zheng 	spin_unlock(&ci->i_unsafe_lock);
2336da819c81SYan, Zheng 
2337e1a4541eSXiubo Li 	/*
2338e1a4541eSXiubo Li 	 * Trigger to flush the journal logs in all the relevant MDSes
2339e1a4541eSXiubo Li 	 * manually, or in the worst case we must wait at most 5 seconds
2340e1a4541eSXiubo Li 	 * to wait the journal logs to be flushed by the MDSes periodically.
2341e1a4541eSXiubo Li 	 */
23425bd76b8dSXiubo Li 	if (req1 || req2) {
2343e1a4541eSXiubo Li 		struct ceph_mds_request *req;
23445bd76b8dSXiubo Li 		struct ceph_mds_session **sessions;
23455bd76b8dSXiubo Li 		struct ceph_mds_session *s;
23465bd76b8dSXiubo Li 		unsigned int max_sessions;
2347e1a4541eSXiubo Li 		int i;
2348e1a4541eSXiubo Li 
23495bd76b8dSXiubo Li 		mutex_lock(&mdsc->mutex);
23505bd76b8dSXiubo Li 		max_sessions = mdsc->max_sessions;
23515bd76b8dSXiubo Li 
2352aa1d6272SKenneth Lee 		sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
235389d43d05SXiubo Li 		if (!sessions) {
23545bd76b8dSXiubo Li 			mutex_unlock(&mdsc->mutex);
235589d43d05SXiubo Li 			err = -ENOMEM;
235689d43d05SXiubo Li 			goto out;
235789d43d05SXiubo Li 		}
2358e1a4541eSXiubo Li 
2359e1a4541eSXiubo Li 		spin_lock(&ci->i_unsafe_lock);
2360e1a4541eSXiubo Li 		if (req1) {
2361e1a4541eSXiubo Li 			list_for_each_entry(req, &ci->i_unsafe_dirops,
2362e1a4541eSXiubo Li 					    r_unsafe_dir_item) {
2363e1a4541eSXiubo Li 				s = req->r_session;
23647acae618SXiubo Li 				if (!s)
23657acae618SXiubo Li 					continue;
2366e1a4541eSXiubo Li 				if (!sessions[s->s_mds]) {
2367e1a4541eSXiubo Li 					s = ceph_get_mds_session(s);
2368e1a4541eSXiubo Li 					sessions[s->s_mds] = s;
2369e1a4541eSXiubo Li 				}
2370e1a4541eSXiubo Li 			}
2371e1a4541eSXiubo Li 		}
2372e1a4541eSXiubo Li 		if (req2) {
2373e1a4541eSXiubo Li 			list_for_each_entry(req, &ci->i_unsafe_iops,
2374e1a4541eSXiubo Li 					    r_unsafe_target_item) {
2375e1a4541eSXiubo Li 				s = req->r_session;
23767acae618SXiubo Li 				if (!s)
23777acae618SXiubo Li 					continue;
2378e1a4541eSXiubo Li 				if (!sessions[s->s_mds]) {
2379e1a4541eSXiubo Li 					s = ceph_get_mds_session(s);
2380e1a4541eSXiubo Li 					sessions[s->s_mds] = s;
2381e1a4541eSXiubo Li 				}
2382e1a4541eSXiubo Li 			}
2383e1a4541eSXiubo Li 		}
2384e1a4541eSXiubo Li 		spin_unlock(&ci->i_unsafe_lock);
2385e1a4541eSXiubo Li 
2386e1a4541eSXiubo Li 		/* the auth MDS */
2387e1a4541eSXiubo Li 		spin_lock(&ci->i_ceph_lock);
2388e1a4541eSXiubo Li 		if (ci->i_auth_cap) {
2389e1a4541eSXiubo Li 			s = ci->i_auth_cap->session;
2390e1a4541eSXiubo Li 			if (!sessions[s->s_mds])
2391e1a4541eSXiubo Li 				sessions[s->s_mds] = ceph_get_mds_session(s);
2392e1a4541eSXiubo Li 		}
2393e1a4541eSXiubo Li 		spin_unlock(&ci->i_ceph_lock);
23945bd76b8dSXiubo Li 		mutex_unlock(&mdsc->mutex);
2395e1a4541eSXiubo Li 
2396e1a4541eSXiubo Li 		/* send flush mdlog request to MDSes */
239789d43d05SXiubo Li 		for (i = 0; i < max_sessions; i++) {
2398e1a4541eSXiubo Li 			s = sessions[i];
2399e1a4541eSXiubo Li 			if (s) {
2400e1a4541eSXiubo Li 				send_flush_mdlog(s);
2401e1a4541eSXiubo Li 				ceph_put_mds_session(s);
2402e1a4541eSXiubo Li 			}
2403e1a4541eSXiubo Li 		}
2404e1a4541eSXiubo Li 		kfree(sessions);
2405e1a4541eSXiubo Li 	}
2406e1a4541eSXiubo Li 
2407ae067063SXiubo Li 	dout("%s %p wait on tid %llu %llu\n", __func__,
240868cd5b4bSYan, Zheng 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
240968cd5b4bSYan, Zheng 	if (req1) {
241068cd5b4bSYan, Zheng 		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
241168cd5b4bSYan, Zheng 					ceph_timeout_jiffies(req1->r_timeout));
2412da819c81SYan, Zheng 		if (ret)
241368cd5b4bSYan, Zheng 			err = -EIO;
241468cd5b4bSYan, Zheng 	}
241568cd5b4bSYan, Zheng 	if (req2) {
241668cd5b4bSYan, Zheng 		ret = !wait_for_completion_timeout(&req2->r_safe_completion,
241768cd5b4bSYan, Zheng 					ceph_timeout_jiffies(req2->r_timeout));
241868cd5b4bSYan, Zheng 		if (ret)
241968cd5b4bSYan, Zheng 			err = -EIO;
242068cd5b4bSYan, Zheng 	}
242189d43d05SXiubo Li 
242289d43d05SXiubo Li out:
242389d43d05SXiubo Li 	if (req1)
242489d43d05SXiubo Li 		ceph_mdsc_put_request(req1);
242589d43d05SXiubo Li 	if (req2)
242689d43d05SXiubo Li 		ceph_mdsc_put_request(req2);
242768cd5b4bSYan, Zheng 	return err;
2428da819c81SYan, Zheng }
2429da819c81SYan, Zheng 
/*
 * fsync/fdatasync handler.
 *
 * The dirty page range is written back and waited on first; for a
 * datasync request nothing further is done.  Otherwise we wait on any
 * pending async create, call try_flush_caps() and
 * flush_mdlog_and_wait_inode_unsafe_requests(), and then wait for the
 * cap flush identified by flush_tid -- but only when caps outside
 * CEPH_CAP_ANY_FILE_WR were dirty.
 *
 * A negative result from the metadata wait or from
 * file_check_and_advance_wb_err() replaces the returned status.
 */
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	const char *mode = datasync ? " datasync" : "";
	u64 flush_tid;
	int dirty_caps;
	int rc, werr;

	dout("fsync %p%s\n", inode, mode);

	rc = file_write_and_wait_range(file, start, end);
	if (datasync)
		goto done;

	rc = ceph_wait_on_async_create(inode);
	if (rc)
		goto done;

	dirty_caps = try_flush_caps(inode, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty_caps));

	werr = flush_mdlog_and_wait_inode_unsafe_requests(inode);

	/*
	 * The mds can recover size and mtime by itself, so file
	 * metadata writeback is not waited for; anything else is.
	 */
	if (werr == 0 && (dirty_caps & ~CEPH_CAP_ANY_FILE_WR) != 0)
		werr = wait_event_interruptible(ci->i_cap_wq,
					caps_are_flushed(inode, flush_tid));
	if (werr < 0)
		rc = werr;

	werr = file_check_and_advance_wb_err(file);
	if (werr < 0)
		rc = werr;
done:
	dout("fsync %p%s result=%d\n", inode, mode, rc);
	return rc;
}
2473a8599bd8SSage Weil 
2474a8599bd8SSage Weil /*
2475a8599bd8SSage Weil  * Flush any dirty caps back to the mds.  If we aren't asked to wait,
2476a8599bd8SSage Weil  * queue inode for flush but don't do so immediately, because we can
2477a8599bd8SSage Weil  * get by with fewer MDS messages if we wait for data writeback to
2478a8599bd8SSage Weil  * complete first.
2479a8599bd8SSage Weil  */
ceph_write_inode(struct inode * inode,struct writeback_control * wbc)2480f1a3d572SStephen Rothwell int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
2481a8599bd8SSage Weil {
2482a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
2483553adfd9SYan, Zheng 	u64 flush_tid;
2484a8599bd8SSage Weil 	int err = 0;
2485a8599bd8SSage Weil 	int dirty;
248616515a6dSChengguang Xu 	int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
2487a8599bd8SSage Weil 
2488a8599bd8SSage Weil 	dout("write_inode %p wait=%d\n", inode, wait);
2489400e1286SJeff Layton 	ceph_fscache_unpin_writeback(inode, wbc);
2490a8599bd8SSage Weil 	if (wait) {
2491fbed7045SJeff Layton 		err = ceph_wait_on_async_create(inode);
2492fbed7045SJeff Layton 		if (err)
2493fbed7045SJeff Layton 			return err;
2494553adfd9SYan, Zheng 		dirty = try_flush_caps(inode, &flush_tid);
2495a8599bd8SSage Weil 		if (dirty)
2496a8599bd8SSage Weil 			err = wait_event_interruptible(ci->i_cap_wq,
2497a8599bd8SSage Weil 				       caps_are_flushed(inode, flush_tid));
2498a8599bd8SSage Weil 	} else {
2499640ef79dSCheng Renquan 		struct ceph_mds_client *mdsc =
2500985b9ee8SXiubo Li 			ceph_sb_to_fs_client(inode->i_sb)->mdsc;
2501a8599bd8SSage Weil 
2502be655596SSage Weil 		spin_lock(&ci->i_ceph_lock);
2503a8599bd8SSage Weil 		if (__ceph_caps_dirty(ci))
2504a8599bd8SSage Weil 			__cap_delay_requeue_front(mdsc, ci);
2505be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
2506a8599bd8SSage Weil 	}
2507a8599bd8SSage Weil 	return err;
2508a8599bd8SSage Weil }
2509a8599bd8SSage Weil 
/*
 * Re-send every entry on ci->i_cap_flush_list to @session: ordinary cap
 * flushes go out via __prep_cap()/__send_cap(), capsnap flushes via
 * __send_flush_snap().
 *
 * Entered and left with ci->i_ceph_lock held (see the __releases /
 * __acquires annotations); the lock is dropped around each send and
 * retaken before the next list entry is examined.
 *
 * Does nothing while CEPH_I_ASYNC_CREATE is set.  Otherwise clears
 * CEPH_I_KICK_FLUSH before walking the list.  @oldest_flush_tid is
 * passed through unchanged to the send helpers.
 */
__kick_flushing_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session,struct ceph_inode_info * ci,u64 oldest_flush_tid)25100e294387SYan, Zheng static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2511553adfd9SYan, Zheng 				 struct ceph_mds_session *session,
25120e294387SYan, Zheng 				 struct ceph_inode_info *ci,
25130e294387SYan, Zheng 				 u64 oldest_flush_tid)
25140e294387SYan, Zheng 	__releases(ci->i_ceph_lock)
25150e294387SYan, Zheng 	__acquires(ci->i_ceph_lock)
2516553adfd9SYan, Zheng {
2517874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
2518553adfd9SYan, Zheng 	struct ceph_cap *cap;
2519553adfd9SYan, Zheng 	struct ceph_cap_flush *cf;
25200e294387SYan, Zheng 	int ret;
2521553adfd9SYan, Zheng 	u64 first_tid = 0;
252249ada6e8SYan, Zheng 	u64 last_snap_flush = 0;
2523a2971c8cSYan, Zheng 
2524fbed7045SJeff Layton 	/* Don't do anything until create reply comes in */
2525fbed7045SJeff Layton 	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
2526fbed7045SJeff Layton 		return;
2527fbed7045SJeff Layton 
2528054f8d41SYan, Zheng 	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2529054f8d41SYan, Zheng 
	/*
	 * Walk from the tail and remember the tid of the first capsnap
	 * entry met; stays 0 if the list holds no capsnap.
	 */
253049ada6e8SYan, Zheng 	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2531b2f9fa1fSXiubo Li 		if (cf->is_capsnap) {
253249ada6e8SYan, Zheng 			last_snap_flush = cf->tid;
253349ada6e8SYan, Zheng 			break;
253449ada6e8SYan, Zheng 		}
253549ada6e8SYan, Zheng 	}
253649ada6e8SYan, Zheng 
2537e4500b5eSYan, Zheng 	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
		/*
		 * Skip tids at or below the last one already sent.
		 * NOTE(review): presumably guards against the list
		 * changing while i_ceph_lock was dropped -- confirm.
		 */
2538e4500b5eSYan, Zheng 		if (cf->tid < first_tid)
2539e4500b5eSYan, Zheng 			continue;
2540e4500b5eSYan, Zheng 
		/*
		 * The auth cap is re-read on every pass because the lock
		 * was released during the previous send.
		 */
2541553adfd9SYan, Zheng 		cap = ci->i_auth_cap;
2542553adfd9SYan, Zheng 		if (!(cap && cap->session == session)) {
25430e294387SYan, Zheng 			pr_err("%p auth cap %p not mds%d ???\n",
25440e294387SYan, Zheng 			       inode, cap, session->s_mds);
2545553adfd9SYan, Zheng 			break;
2546553adfd9SYan, Zheng 		}
2547553adfd9SYan, Zheng 
2548553adfd9SYan, Zheng 		first_tid = cf->tid + 1;
2549553adfd9SYan, Zheng 
2550b2f9fa1fSXiubo Li 		if (!cf->is_capsnap) {
25510a454bddSJeff Layton 			struct cap_msg_args arg;
25520a454bddSJeff Layton 
25530e294387SYan, Zheng 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
25540e294387SYan, Zheng 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
			/*
			 * Cap flushes with a tid below the newest capsnap
			 * flush are flagged PENDING_CAPSNAP.  The message
			 * is prepared under the lock and sent after it is
			 * dropped.
			 */
25550a454bddSJeff Layton 			__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
255649ada6e8SYan, Zheng 					 (cf->tid < last_snap_flush ?
255749ada6e8SYan, Zheng 					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
255849ada6e8SYan, Zheng 					  __ceph_caps_used(ci),
2559553adfd9SYan, Zheng 					  __ceph_caps_wanted(ci),
256049ada6e8SYan, Zheng 					  (cap->issued | cap->implemented),
2561a2971c8cSYan, Zheng 					  cf->caps, cf->tid, oldest_flush_tid);
25620a454bddSJeff Layton 			spin_unlock(&ci->i_ceph_lock);
256352311980SJeff Layton 			__send_cap(&arg, ci);
25640e294387SYan, Zheng 		} else {
25650e294387SYan, Zheng 			struct ceph_cap_snap *capsnap =
25660e294387SYan, Zheng 					container_of(cf, struct ceph_cap_snap,
25670e294387SYan, Zheng 						    cap_flush);
25680e294387SYan, Zheng 			dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
25690e294387SYan, Zheng 			     inode, capsnap, cf->tid,
25700e294387SYan, Zheng 			     ceph_cap_string(capsnap->dirty));
25710e294387SYan, Zheng 
			/*
			 * Take a capsnap reference before dropping the
			 * lock; it is released by ceph_put_cap_snap()
			 * once the send attempt is done.
			 */
2572805692d0SElena Reshetova 			refcount_inc(&capsnap->nref);
25730e294387SYan, Zheng 			spin_unlock(&ci->i_ceph_lock);
25740e294387SYan, Zheng 
25750e294387SYan, Zheng 			ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
25760e294387SYan, Zheng 						oldest_flush_tid);
			/* a failed send is only logged; the walk goes on */
25770e294387SYan, Zheng 			if (ret < 0) {
25780e294387SYan, Zheng 				pr_err("kick_flushing_caps: error sending "
25790e294387SYan, Zheng 					"cap flushsnap, ino (%llx.%llx) "
25800e294387SYan, Zheng 					"tid %llu follows %llu\n",
25810e294387SYan, Zheng 					ceph_vinop(inode), cf->tid,
25820e294387SYan, Zheng 					capsnap->follows);
25830e294387SYan, Zheng 			}
25840e294387SYan, Zheng 
25850e294387SYan, Zheng 			ceph_put_cap_snap(capsnap);
25860e294387SYan, Zheng 		}
2587e4500b5eSYan, Zheng 
		/*
		 * Both branches dropped the lock; retake it for the next
		 * entry and so the function returns with it held.
		 */
2588e4500b5eSYan, Zheng 		spin_lock(&ci->i_ceph_lock);
2589553adfd9SYan, Zheng 	}
2590553adfd9SYan, Zheng }
2591553adfd9SYan, Zheng 
ceph_early_kick_flushing_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2592e548e9b9SYan, Zheng void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2593e548e9b9SYan, Zheng 				   struct ceph_mds_session *session)
2594e548e9b9SYan, Zheng {
2595e548e9b9SYan, Zheng 	struct ceph_inode_info *ci;
2596e548e9b9SYan, Zheng 	struct ceph_cap *cap;
25970e294387SYan, Zheng 	u64 oldest_flush_tid;
2598e548e9b9SYan, Zheng 
2599e548e9b9SYan, Zheng 	dout("early_kick_flushing_caps mds%d\n", session->s_mds);
26000e294387SYan, Zheng 
26010e294387SYan, Zheng 	spin_lock(&mdsc->cap_dirty_lock);
26020e294387SYan, Zheng 	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
26030e294387SYan, Zheng 	spin_unlock(&mdsc->cap_dirty_lock);
26040e294387SYan, Zheng 
2605e548e9b9SYan, Zheng 	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2606e548e9b9SYan, Zheng 		spin_lock(&ci->i_ceph_lock);
2607e548e9b9SYan, Zheng 		cap = ci->i_auth_cap;
2608e548e9b9SYan, Zheng 		if (!(cap && cap->session == session)) {
2609e548e9b9SYan, Zheng 			pr_err("%p auth cap %p not mds%d ???\n",
2610874c8ca1SDavid Howells 				&ci->netfs.inode, cap, session->s_mds);
2611e548e9b9SYan, Zheng 			spin_unlock(&ci->i_ceph_lock);
2612e548e9b9SYan, Zheng 			continue;
2613e548e9b9SYan, Zheng 		}
2614e548e9b9SYan, Zheng 
2615e548e9b9SYan, Zheng 
2616e548e9b9SYan, Zheng 		/*
2617e548e9b9SYan, Zheng 		 * if flushing caps were revoked, we re-send the cap flush
2618e548e9b9SYan, Zheng 		 * in client reconnect stage. This guarantees MDS * processes
2619e548e9b9SYan, Zheng 		 * the cap flush message before issuing the flushing caps to
2620e548e9b9SYan, Zheng 		 * other client.
2621e548e9b9SYan, Zheng 		 */
2622e548e9b9SYan, Zheng 		if ((cap->issued & ci->i_flushing_caps) !=
2623e548e9b9SYan, Zheng 		    ci->i_flushing_caps) {
262481c5a148SYan, Zheng 			/* encode_caps_cb() also will reset these sequence
262581c5a148SYan, Zheng 			 * numbers. make sure sequence numbers in cap flush
262681c5a148SYan, Zheng 			 * message match later reconnect message */
262781c5a148SYan, Zheng 			cap->seq = 0;
262881c5a148SYan, Zheng 			cap->issue_seq = 0;
262981c5a148SYan, Zheng 			cap->mseq = 0;
26300e294387SYan, Zheng 			__kick_flushing_caps(mdsc, session, ci,
26310e294387SYan, Zheng 					     oldest_flush_tid);
263213c2b57dSYan, Zheng 		} else {
263313c2b57dSYan, Zheng 			ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
2634e548e9b9SYan, Zheng 		}
2635e548e9b9SYan, Zheng 
2636e548e9b9SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
2637e548e9b9SYan, Zheng 	}
2638e548e9b9SYan, Zheng }
2639e548e9b9SYan, Zheng 
ceph_kick_flushing_caps(struct ceph_mds_client * mdsc,struct ceph_mds_session * session)2640a8599bd8SSage Weil void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2641a8599bd8SSage Weil 			     struct ceph_mds_session *session)
2642a8599bd8SSage Weil {
2643a8599bd8SSage Weil 	struct ceph_inode_info *ci;
264413c2b57dSYan, Zheng 	struct ceph_cap *cap;
26450e294387SYan, Zheng 	u64 oldest_flush_tid;
2646a8599bd8SSage Weil 
2647829ad4dbSJeff Layton 	lockdep_assert_held(&session->s_mutex);
2648829ad4dbSJeff Layton 
2649a8599bd8SSage Weil 	dout("kick_flushing_caps mds%d\n", session->s_mds);
26500e294387SYan, Zheng 
26510e294387SYan, Zheng 	spin_lock(&mdsc->cap_dirty_lock);
26520e294387SYan, Zheng 	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
26530e294387SYan, Zheng 	spin_unlock(&mdsc->cap_dirty_lock);
26540e294387SYan, Zheng 
2655a8599bd8SSage Weil 	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2656be655596SSage Weil 		spin_lock(&ci->i_ceph_lock);
265713c2b57dSYan, Zheng 		cap = ci->i_auth_cap;
265813c2b57dSYan, Zheng 		if (!(cap && cap->session == session)) {
265913c2b57dSYan, Zheng 			pr_err("%p auth cap %p not mds%d ???\n",
2660874c8ca1SDavid Howells 				&ci->netfs.inode, cap, session->s_mds);
266113c2b57dSYan, Zheng 			spin_unlock(&ci->i_ceph_lock);
266213c2b57dSYan, Zheng 			continue;
266313c2b57dSYan, Zheng 		}
266413c2b57dSYan, Zheng 		if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
266513c2b57dSYan, Zheng 			__kick_flushing_caps(mdsc, session, ci,
266613c2b57dSYan, Zheng 					     oldest_flush_tid);
266713c2b57dSYan, Zheng 		}
2668be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
2669a8599bd8SSage Weil 	}
2670a8599bd8SSage Weil }
2671a8599bd8SSage Weil 
ceph_kick_flushing_inode_caps(struct ceph_mds_session * session,struct ceph_inode_info * ci)2672e8a4d267SJeff Layton void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2673e8a4d267SJeff Layton 				   struct ceph_inode_info *ci)
2674088b3f5eSSage Weil {
2675e8a4d267SJeff Layton 	struct ceph_mds_client *mdsc = session->s_mdsc;
2676e8a4d267SJeff Layton 	struct ceph_cap *cap = ci->i_auth_cap;
2677088b3f5eSSage Weil 
2678e8a4d267SJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
2679e8a4d267SJeff Layton 
2680874c8ca1SDavid Howells 	dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
26818310b089SYan, Zheng 	     ceph_cap_string(ci->i_flushing_caps));
2682005c4697SYan, Zheng 
26830e294387SYan, Zheng 	if (!list_empty(&ci->i_cap_flush_list)) {
26840e294387SYan, Zheng 		u64 oldest_flush_tid;
2685005c4697SYan, Zheng 		spin_lock(&mdsc->cap_dirty_lock);
2686005c4697SYan, Zheng 		list_move_tail(&ci->i_flushing_item,
2687005c4697SYan, Zheng 			       &cap->session->s_cap_flushing);
26880e294387SYan, Zheng 		oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2689005c4697SYan, Zheng 		spin_unlock(&mdsc->cap_dirty_lock);
2690005c4697SYan, Zheng 
26910e294387SYan, Zheng 		__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2692088b3f5eSSage Weil 	}
2693088b3f5eSSage Weil }
2694088b3f5eSSage Weil 
2695a8599bd8SSage Weil 
2696a8599bd8SSage Weil /*
2697a8599bd8SSage Weil  * Take references to capabilities we hold, so that we don't release
2698a8599bd8SSage Weil  * them to the MDS prematurely.
2699a8599bd8SSage Weil  */
ceph_take_cap_refs(struct ceph_inode_info * ci,int got,bool snap_rwsem_locked)270040dcf75eSJeff Layton void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
27015dda377cSYan, Zheng 			    bool snap_rwsem_locked)
2702a8599bd8SSage Weil {
270340dcf75eSJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
270440dcf75eSJeff Layton 
2705a8599bd8SSage Weil 	if (got & CEPH_CAP_PIN)
2706a8599bd8SSage Weil 		ci->i_pin_ref++;
2707a8599bd8SSage Weil 	if (got & CEPH_CAP_FILE_RD)
2708a8599bd8SSage Weil 		ci->i_rd_ref++;
2709a8599bd8SSage Weil 	if (got & CEPH_CAP_FILE_CACHE)
2710a8599bd8SSage Weil 		ci->i_rdcache_ref++;
2711f85122afSJeff Layton 	if (got & CEPH_CAP_FILE_EXCL)
2712f85122afSJeff Layton 		ci->i_fx_ref++;
27135dda377cSYan, Zheng 	if (got & CEPH_CAP_FILE_WR) {
27145dda377cSYan, Zheng 		if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
27155dda377cSYan, Zheng 			BUG_ON(!snap_rwsem_locked);
27165dda377cSYan, Zheng 			ci->i_head_snapc = ceph_get_snap_context(
27175dda377cSYan, Zheng 					ci->i_snap_realm->cached_context);
27185dda377cSYan, Zheng 		}
2719a8599bd8SSage Weil 		ci->i_wr_ref++;
27205dda377cSYan, Zheng 	}
2721a8599bd8SSage Weil 	if (got & CEPH_CAP_FILE_BUFFER) {
2722d3d0720dSHenry C Chang 		if (ci->i_wb_ref == 0)
2723874c8ca1SDavid Howells 			ihold(&ci->netfs.inode);
2724d3d0720dSHenry C Chang 		ci->i_wb_ref++;
272540dcf75eSJeff Layton 		dout("%s %p wb %d -> %d (?)\n", __func__,
2726874c8ca1SDavid Howells 		     &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
2727a8599bd8SSage Weil 	}
2728a8599bd8SSage Weil }
2729a8599bd8SSage Weil 
2730a8599bd8SSage Weil /*
2731a8599bd8SSage Weil  * Try to grab cap references.  Specify those refs we @want, and the
2732a8599bd8SSage Weil  * minimal set we @need.  Also include the larger offset we are writing
2733a8599bd8SSage Weil  * to (when applicable), and check against max_size here as well.
2734a8599bd8SSage Weil  * Note that caller is responsible for ensuring max_size increases are
2735a8599bd8SSage Weil  * requested from the MDS.
27361199d7daSJeff Layton  *
2737546d4020SYan, Zheng  * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
2738546d4020SYan, Zheng  * or a negative error code. There are 3 special error codes:
2739546d4020SYan, Zheng  *  -EAGAIN:  need to sleep but non-blocking is specified
2740546d4020SYan, Zheng  *  -EFBIG:   ask caller to call check_max_size() and try again.
27418006daffSJeff Layton  *  -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
2742a8599bd8SSage Weil  */
2743ff5d913dSYan, Zheng enum {
2744719a2514SYan, Zheng 	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
	/*
	 * try_get_cap_refs() returns -EAGAIN instead of sleeping in
	 * down_read() on mdsc->snap_rwsem.
	 */
2745719a2514SYan, Zheng 	NON_BLOCKING	= (1 << 8),
	/*
	 * try_get_cap_refs() fails with -EIO when the inode has
	 * CEPH_I_ERROR_FILELOCK set.
	 */
2746719a2514SYan, Zheng 	CHECK_FILELOCK	= (1 << 9),
2747ff5d913dSYan, Zheng };
2748ff5d913dSYan, Zheng 
/*
 * @need:   caps that must all be issued before any reference is taken
 * @want:   additional caps to take references on if they are all issued
 * @endoff: end offset of the intended write; compared with i_max_size
 *          when FILE_WR is both issued and needed (a negative value
 *          skips that comparison)
 * @flags:  NON_BLOCKING / CHECK_FILELOCK bits, also handed to
 *          __ceph_touch_fmode() on every exit path
 * @got:    set to the caps references were taken on when 1 is returned;
 *          not written otherwise, but always read by the final dout()
 */
try_get_cap_refs(struct inode * inode,int need,int want,loff_t endoff,int flags,int * got)27495e3ded1bSYan, Zheng static int try_get_cap_refs(struct inode *inode, int need, int want,
2750ff5d913dSYan, Zheng 			    loff_t endoff, int flags, int *got)
2751a8599bd8SSage Weil {
27525e3ded1bSYan, Zheng 	struct ceph_inode_info *ci = ceph_inode(inode);
2753985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
2754a8599bd8SSage Weil 	int ret = 0;
2755c4d4a582SYan, Zheng 	int have, implemented;
27565dda377cSYan, Zheng 	bool snap_rwsem_locked = false;
2757a8599bd8SSage Weil 
2758a8599bd8SSage Weil 	dout("get_cap_refs %p need %s want %s\n", inode,
2759a8599bd8SSage Weil 	     ceph_cap_string(need), ceph_cap_string(want));
2760c4d4a582SYan, Zheng 
	/*
	 * Restart point: reached again after sleeping on snap_rwsem with
	 * i_ceph_lock dropped, so every check below is redone.
	 */
27615dda377cSYan, Zheng again:
2762be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
2763a8599bd8SSage Weil 
2764ff5d913dSYan, Zheng 	if ((flags & CHECK_FILELOCK) &&
2765ff5d913dSYan, Zheng 	    (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2766ff5d913dSYan, Zheng 		dout("try_get_cap_refs %p error filelock\n", inode);
2767ff5d913dSYan, Zheng 		ret = -EIO;
2768ff5d913dSYan, Zheng 		goto out_unlock;
2769ff5d913dSYan, Zheng 	}
2770ff5d913dSYan, Zheng 
277137505d57SYan, Zheng 	/* finish pending truncate */
	/*
	 * Both i_ceph_lock and (if held) snap_rwsem are released before
	 * calling __ceph_do_pending_vmtruncate().
	 */
277237505d57SYan, Zheng 	while (ci->i_truncate_pending) {
277337505d57SYan, Zheng 		spin_unlock(&ci->i_ceph_lock);
27745dda377cSYan, Zheng 		if (snap_rwsem_locked) {
27755dda377cSYan, Zheng 			up_read(&mdsc->snap_rwsem);
27765dda377cSYan, Zheng 			snap_rwsem_locked = false;
27775dda377cSYan, Zheng 		}
2778b415bf4fSYan, Zheng 		__ceph_do_pending_vmtruncate(inode);
277937505d57SYan, Zheng 		spin_lock(&ci->i_ceph_lock);
278037505d57SYan, Zheng 	}
278137505d57SYan, Zheng 
27823871cbb9SYan, Zheng 	have = __ceph_caps_issued(ci, &implemented);
27833871cbb9SYan, Zheng 
27843871cbb9SYan, Zheng 	if (have & need & CEPH_CAP_FILE_WR) {
		/*
		 * Write would end past max_size: leave with ret == 0
		 * unless the offset also exceeds what has already been
		 * requested, in which case -EFBIG (auth cap present) or
		 * -EUCLEAN (no auth cap) is returned.
		 */
2785a8599bd8SSage Weil 		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2786a8599bd8SSage Weil 			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2787a8599bd8SSage Weil 			     inode, endoff, ci->i_max_size);
27881199d7daSJeff Layton 			if (endoff > ci->i_requested_max_size)
27898006daffSJeff Layton 				ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
27903738daa6SYan, Zheng 			goto out_unlock;
2791a8599bd8SSage Weil 		}
2792a8599bd8SSage Weil 		/*
2793a8599bd8SSage Weil 		 * If a sync write is in progress, we must wait, so that we
2794a8599bd8SSage Weil 		 * can get a final snapshot value for size+mtime.
2795a8599bd8SSage Weil 		 */
2796a8599bd8SSage Weil 		if (__ceph_have_pending_cap_snap(ci)) {
2797a8599bd8SSage Weil 			dout("get_cap_refs %p cap_snap_pending\n", inode);
27983738daa6SYan, Zheng 			goto out_unlock;
2799a8599bd8SSage Weil 		}
2800a8599bd8SSage Weil 	}
2801a8599bd8SSage Weil 
2802a8599bd8SSage Weil 	if ((have & need) == need) {
2803a8599bd8SSage Weil 		/*
2804a8599bd8SSage Weil 		 * Look at (implemented & ~have & not) so that we keep waiting
2805a8599bd8SSage Weil 		 * on transition from wanted -> needed caps.  This is needed
2806a8599bd8SSage Weil 		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2807a8599bd8SSage Weil 		 * going before a prior buffered writeback happens.
28087c3ea987SXiubo Li 		 *
28097c3ea987SXiubo Li 		 * For RDCACHE|RD -> RD, there is no need to wait and we can
28107c3ea987SXiubo Li 		 * just exclude the revoking caps and force to sync read.
2811a8599bd8SSage Weil 		 */
2812a8599bd8SSage Weil 		int not = want & ~(have & need);
2813a8599bd8SSage Weil 		int revoking = implemented & ~have;
28147c3ea987SXiubo Li 		int exclude = revoking & not;
2815a8599bd8SSage Weil 		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2816a8599bd8SSage Weil 		     inode, ceph_cap_string(have), ceph_cap_string(not),
2817a8599bd8SSage Weil 		     ceph_cap_string(revoking));
		/*
		 * Refs are taken unless FILE_BUFFER is among the wanted
		 * caps being revoked; in that case ret stays 0.
		 */
28187c3ea987SXiubo Li 		if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
			/*
			 * Taking a FILE_WR ref with no cached head snap
			 * context requires snap_rwsem: try it without
			 * sleeping first, and if that fails either bail
			 * out (NON_BLOCKING) or drop i_ceph_lock, block
			 * on the rwsem and start over.
			 */
28195dda377cSYan, Zheng 			if (!snap_rwsem_locked &&
28205dda377cSYan, Zheng 			    !ci->i_head_snapc &&
28215dda377cSYan, Zheng 			    (need & CEPH_CAP_FILE_WR)) {
28225dda377cSYan, Zheng 				if (!down_read_trylock(&mdsc->snap_rwsem)) {
28235dda377cSYan, Zheng 					/*
28245dda377cSYan, Zheng 					 * we can not call down_read() when
28255dda377cSYan, Zheng 					 * task isn't in TASK_RUNNING state
28265dda377cSYan, Zheng 					 */
2827ff5d913dSYan, Zheng 					if (flags & NON_BLOCKING) {
28281199d7daSJeff Layton 						ret = -EAGAIN;
28295dda377cSYan, Zheng 						goto out_unlock;
28305dda377cSYan, Zheng 					}
28315dda377cSYan, Zheng 
28325dda377cSYan, Zheng 					spin_unlock(&ci->i_ceph_lock);
28335dda377cSYan, Zheng 					down_read(&mdsc->snap_rwsem);
28345dda377cSYan, Zheng 					snap_rwsem_locked = true;
28355dda377cSYan, Zheng 					goto again;
28365dda377cSYan, Zheng 				}
28375dda377cSYan, Zheng 				snap_rwsem_locked = true;
28385dda377cSYan, Zheng 			}
			/*
			 * Hand out the wanted caps (minus those being
			 * revoked) only when every wanted cap is issued;
			 * otherwise just the needed ones.
			 */
2839173e70e8SYan, Zheng 			if ((have & want) == want)
28407c3ea987SXiubo Li 				*got = need | (want & ~exclude);
2841173e70e8SYan, Zheng 			else
2842173e70e8SYan, Zheng 				*got = need;
284340dcf75eSJeff Layton 			ceph_take_cap_refs(ci, *got, true);
2844a8599bd8SSage Weil 			ret = 1;
2845a8599bd8SSage Weil 		}
2846a8599bd8SSage Weil 	} else {
		/*
		 * Needed caps are not all issued.  Work out whether this
		 * is a hard failure (-EROFS, -ESTALE), a request to renew
		 * caps (-EUCLEAN), or simply "not yet" (ret stays 0).
		 */
284703f4fcb0SYan, Zheng 		int session_readonly = false;
2848c0e385b1SYan, Zheng 		int mds_wanted;
2849525d15e8SYan, Zheng 		if (ci->i_auth_cap &&
2850525d15e8SYan, Zheng 		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
285103f4fcb0SYan, Zheng 			struct ceph_mds_session *s = ci->i_auth_cap->session;
285203f4fcb0SYan, Zheng 			spin_lock(&s->s_cap_lock);
285303f4fcb0SYan, Zheng 			session_readonly = s->s_readonly;
285403f4fcb0SYan, Zheng 			spin_unlock(&s->s_cap_lock);
285503f4fcb0SYan, Zheng 		}
285603f4fcb0SYan, Zheng 		if (session_readonly) {
2857c0e385b1SYan, Zheng 			dout("get_cap_refs %p need %s but mds%d readonly\n",
285803f4fcb0SYan, Zheng 			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
28591199d7daSJeff Layton 			ret = -EROFS;
286003f4fcb0SYan, Zheng 			goto out_unlock;
286103f4fcb0SYan, Zheng 		}
286203f4fcb0SYan, Zheng 
28635d6451b1SJeff Layton 		if (ceph_inode_is_shutdown(inode)) {
28645d6451b1SJeff Layton 			dout("get_cap_refs %p inode is shutdown\n", inode);
28655d6451b1SJeff Layton 			ret = -ESTALE;
286648fec5d0SYan, Zheng 			goto out_unlock;
286748fec5d0SYan, Zheng 		}
2868c1944fedSYan, Zheng 		mds_wanted = __ceph_caps_mds_wanted(ci, false);
2869c0e385b1SYan, Zheng 		if (need & ~mds_wanted) {
2870c0e385b1SYan, Zheng 			dout("get_cap_refs %p need %s > mds_wanted %s\n",
2871c0e385b1SYan, Zheng 			     inode, ceph_cap_string(need),
2872c0e385b1SYan, Zheng 			     ceph_cap_string(mds_wanted));
28738006daffSJeff Layton 			ret = -EUCLEAN;
287477310320SYan, Zheng 			goto out_unlock;
287577310320SYan, Zheng 		}
287648fec5d0SYan, Zheng 
2877c0e385b1SYan, Zheng 		dout("get_cap_refs %p have %s need %s\n", inode,
2878a8599bd8SSage Weil 		     ceph_cap_string(have), ceph_cap_string(need));
2879a8599bd8SSage Weil 	}
	/* Common exit: reached with i_ceph_lock held on every path. */
28803738daa6SYan, Zheng out_unlock:
2881719a2514SYan, Zheng 
2882719a2514SYan, Zheng 	__ceph_touch_fmode(ci, mdsc, flags);
2883719a2514SYan, Zheng 
2884be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
28855dda377cSYan, Zheng 	if (snap_rwsem_locked)
28865dda377cSYan, Zheng 		up_read(&mdsc->snap_rwsem);
28873738daa6SYan, Zheng 
	/* ret == 0 is counted as a cap miss, ret == 1 as a cap hit */
28881af16d54SXiubo Li 	if (!ret)
28891af16d54SXiubo Li 		ceph_update_cap_mis(&mdsc->metric);
28901af16d54SXiubo Li 	else if (ret == 1)
28911af16d54SXiubo Li 		ceph_update_cap_hit(&mdsc->metric);
28921af16d54SXiubo Li 
2893a8599bd8SSage Weil 	dout("get_cap_refs %p ret %d got %s\n", inode,
2894c4d4a582SYan, Zheng 	     ret, ceph_cap_string(*got));
2895a8599bd8SSage Weil 	return ret;
2896a8599bd8SSage Weil }
2897a8599bd8SSage Weil 
2898a8599bd8SSage Weil /*
2899a8599bd8SSage Weil  * Check the offset we are writing up to against our current
2900a8599bd8SSage Weil  * max_size.  If necessary, tell the MDS we want to write to
2901a8599bd8SSage Weil  * a larger offset.
2902a8599bd8SSage Weil  */
check_max_size(struct inode * inode,loff_t endoff)2903a8599bd8SSage Weil static void check_max_size(struct inode *inode, loff_t endoff)
2904a8599bd8SSage Weil {
2905a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
2906a8599bd8SSage Weil 	int check = 0;
2907a8599bd8SSage Weil 
2908a8599bd8SSage Weil 	/* do we need to explicitly request a larger max_size? */
2909be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
29103871cbb9SYan, Zheng 	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
2911a8599bd8SSage Weil 		dout("write %p at large endoff %llu, req max_size\n",
2912a8599bd8SSage Weil 		     inode, endoff);
2913a8599bd8SSage Weil 		ci->i_wanted_max_size = endoff;
2914a8599bd8SSage Weil 	}
29153871cbb9SYan, Zheng 	/* duplicate ceph_check_caps()'s logic */
29163871cbb9SYan, Zheng 	if (ci->i_auth_cap &&
29173871cbb9SYan, Zheng 	    (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
29183871cbb9SYan, Zheng 	    ci->i_wanted_max_size > ci->i_max_size &&
29193871cbb9SYan, Zheng 	    ci->i_wanted_max_size > ci->i_requested_max_size)
29203871cbb9SYan, Zheng 		check = 1;
2921be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
2922a8599bd8SSage Weil 	if (check)
2923e4b731ccSXiubo Li 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
2924a8599bd8SSage Weil }
2925a8599bd8SSage Weil 
get_used_fmode(int caps)2926719a2514SYan, Zheng static inline int get_used_fmode(int caps)
2927719a2514SYan, Zheng {
2928719a2514SYan, Zheng 	int fmode = 0;
2929719a2514SYan, Zheng 	if (caps & CEPH_CAP_FILE_RD)
2930719a2514SYan, Zheng 		fmode |= CEPH_FILE_MODE_RD;
2931719a2514SYan, Zheng 	if (caps & CEPH_CAP_FILE_WR)
2932719a2514SYan, Zheng 		fmode |= CEPH_FILE_MODE_WR;
2933719a2514SYan, Zheng 	return fmode;
2934719a2514SYan, Zheng }
2935719a2514SYan, Zheng 
/*
 * Single attempt at taking cap references.
 *
 * @need may contain at most CEPH_CAP_FILE_RD, and @want only the
 * CACHE / LAZYIO / SHARED / EXCL / dir-op caps; anything else is a
 * BUG.  Pool permissions are checked only when something is needed.
 *
 * Returns whatever try_get_cap_refs() returned, except that its three
 * special error codes (-EAGAIN, -EFBIG, -EUCLEAN) are reported as 0.
 * A negative value from the pool permission check is returned as is.
 */
int ceph_try_get_caps(struct inode *inode, int need, int want,
		      bool nonblock, int *got)
{
	int rc, fl;

	BUG_ON(need & ~CEPH_CAP_FILE_RD);
	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
			CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
			CEPH_CAP_ANY_DIR_OPS));

	if (need) {
		rc = ceph_pool_perm_check(inode, need);
		if (rc < 0)
			return rc;
	}

	fl = get_used_fmode(need | want);
	if (nonblock)
		fl |= NON_BLOCKING;

	rc = try_get_cap_refs(inode, need, want, 0, fl, got);
	switch (rc) {
	case -EAGAIN:
	case -EFBIG:
	case -EUCLEAN:
		/* three special error codes */
		return 0;
	default:
		return rc;
	}
}
29612b1ac852SYan, Zheng 
2962a8599bd8SSage Weil /*
2963a8599bd8SSage Weil  * Wait for caps, and take cap references.  If we can't get a WR cap
2964a8599bd8SSage Weil  * due to a small max_size, make sure we check_max_size (and possibly
2965a8599bd8SSage Weil  * ask the mds) so we don't get hung up indefinitely.
2966a8599bd8SSage Weil  */
__ceph_get_caps(struct inode * inode,struct ceph_file_info * fi,int need,int want,loff_t endoff,int * got)29675c64737dSXiubo Li int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
29685c64737dSXiubo Li 		    int want, loff_t endoff, int *got)
2969a8599bd8SSage Weil {
29705e3ded1bSYan, Zheng 	struct ceph_inode_info *ci = ceph_inode(inode);
2971985b9ee8SXiubo Li 	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2972ff5d913dSYan, Zheng 	int ret, _got, flags;
2973a8599bd8SSage Weil 
29745e3ded1bSYan, Zheng 	ret = ceph_pool_perm_check(inode, need);
297510183a69SYan, Zheng 	if (ret < 0)
297610183a69SYan, Zheng 		return ret;
297710183a69SYan, Zheng 
29785c64737dSXiubo Li 	if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
297981f148a9SYan, Zheng 	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
298081f148a9SYan, Zheng 		return -EBADF;
298181f148a9SYan, Zheng 
2982719a2514SYan, Zheng 	flags = get_used_fmode(need | want);
2983719a2514SYan, Zheng 
29845dda377cSYan, Zheng 	while (true) {
2985719a2514SYan, Zheng 		flags &= CEPH_FILE_MODE_MASK;
2986461ab10eSXiubo Li 		if (vfs_inode_has_locks(inode))
2987719a2514SYan, Zheng 			flags |= CHECK_FILELOCK;
2988c4d4a582SYan, Zheng 		_got = 0;
29895e3ded1bSYan, Zheng 		ret = try_get_cap_refs(inode, need, want, endoff,
2990ff5d913dSYan, Zheng 				       flags, &_got);
2991546d4020SYan, Zheng 		WARN_ON_ONCE(ret == -EAGAIN);
29927b2f936fSYan, Zheng 		if (!ret) {
29933a3430afSJeff Layton 			struct ceph_mds_client *mdsc = fsc->mdsc;
29943a3430afSJeff Layton 			struct cap_wait cw;
29955c341ee3SNikolay Borisov 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
29963a3430afSJeff Layton 
2997ebce3eb2SJeff Layton 			cw.ino = ceph_ino(inode);
29983a3430afSJeff Layton 			cw.tgid = current->tgid;
29993a3430afSJeff Layton 			cw.need = need;
30003a3430afSJeff Layton 			cw.want = want;
30013a3430afSJeff Layton 
30023a3430afSJeff Layton 			spin_lock(&mdsc->caps_list_lock);
30033a3430afSJeff Layton 			list_add(&cw.list, &mdsc->cap_wait_list);
30043a3430afSJeff Layton 			spin_unlock(&mdsc->caps_list_lock);
30053a3430afSJeff Layton 
3006719a2514SYan, Zheng 			/* make sure used fmode not timeout */
3007719a2514SYan, Zheng 			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
30085c341ee3SNikolay Borisov 			add_wait_queue(&ci->i_cap_wq, &wait);
30095c341ee3SNikolay Borisov 
3010ff5d913dSYan, Zheng 			flags |= NON_BLOCKING;
30115e3ded1bSYan, Zheng 			while (!(ret = try_get_cap_refs(inode, need, want,
3012ff5d913dSYan, Zheng 							endoff, flags, &_got))) {
30136e09d0fbSYan, Zheng 				if (signal_pending(current)) {
30146e09d0fbSYan, Zheng 					ret = -ERESTARTSYS;
30156e09d0fbSYan, Zheng 					break;
30166e09d0fbSYan, Zheng 				}
30175c341ee3SNikolay Borisov 				wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
30186e09d0fbSYan, Zheng 			}
30195c341ee3SNikolay Borisov 
30205c341ee3SNikolay Borisov 			remove_wait_queue(&ci->i_cap_wq, &wait);
3021719a2514SYan, Zheng 			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
30223a3430afSJeff Layton 
30233a3430afSJeff Layton 			spin_lock(&mdsc->caps_list_lock);
30243a3430afSJeff Layton 			list_del(&cw.list);
30253a3430afSJeff Layton 			spin_unlock(&mdsc->caps_list_lock);
30263a3430afSJeff Layton 
30277b2f936fSYan, Zheng 			if (ret == -EAGAIN)
30285dda377cSYan, Zheng 				continue;
302977310320SYan, Zheng 		}
303081f148a9SYan, Zheng 
30315c64737dSXiubo Li 		if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
303281f148a9SYan, Zheng 		    fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
303381f148a9SYan, Zheng 			if (ret >= 0 && _got)
303481f148a9SYan, Zheng 				ceph_put_cap_refs(ci, _got);
303581f148a9SYan, Zheng 			return -EBADF;
303681f148a9SYan, Zheng 		}
303781f148a9SYan, Zheng 
30387b2f936fSYan, Zheng 		if (ret < 0) {
30398006daffSJeff Layton 			if (ret == -EFBIG || ret == -EUCLEAN) {
30409bccb765SYan, Zheng 				int ret2 = ceph_wait_on_async_create(inode);
30419bccb765SYan, Zheng 				if (ret2 < 0)
30429bccb765SYan, Zheng 					return ret2;
30439bccb765SYan, Zheng 			}
3044546d4020SYan, Zheng 			if (ret == -EFBIG) {
3045546d4020SYan, Zheng 				check_max_size(inode, endoff);
3046546d4020SYan, Zheng 				continue;
3047546d4020SYan, Zheng 			}
30488006daffSJeff Layton 			if (ret == -EUCLEAN) {
304977310320SYan, Zheng 				/* session was killed, try renew caps */
3050719a2514SYan, Zheng 				ret = ceph_renew_caps(inode, flags);
305177310320SYan, Zheng 				if (ret == 0)
305277310320SYan, Zheng 					continue;
30537b2f936fSYan, Zheng 			}
3054c4d4a582SYan, Zheng 			return ret;
30555dda377cSYan, Zheng 		}
3056c4d4a582SYan, Zheng 
3057874c8ca1SDavid Howells 		if (S_ISREG(ci->netfs.inode.i_mode) &&
305848490776SXiubo Li 		    ceph_has_inline_data(ci) &&
3059c4d4a582SYan, Zheng 		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
30605e3ded1bSYan, Zheng 		    i_size_read(inode) > 0) {
30615dda377cSYan, Zheng 			struct page *page =
30625e3ded1bSYan, Zheng 				find_get_page(inode->i_mapping, 0);
3063c4d4a582SYan, Zheng 			if (page) {
3064e72968e1SJeff Layton 				bool uptodate = PageUptodate(page);
3065e72968e1SJeff Layton 
306609cbfeafSKirill A. Shutemov 				put_page(page);
3067e72968e1SJeff Layton 				if (uptodate)
3068e72968e1SJeff Layton 					break;
3069c4d4a582SYan, Zheng 			}
3070c4d4a582SYan, Zheng 			/*
30715dda377cSYan, Zheng 			 * drop cap refs first because getattr while
30725dda377cSYan, Zheng 			 * holding * caps refs can cause deadlock.
3073c4d4a582SYan, Zheng 			 */
3074c4d4a582SYan, Zheng 			ceph_put_cap_refs(ci, _got);
3075c4d4a582SYan, Zheng 			_got = 0;
3076c4d4a582SYan, Zheng 
30775dda377cSYan, Zheng 			/*
30785dda377cSYan, Zheng 			 * getattr request will bring inline data into
30795dda377cSYan, Zheng 			 * page cache
30805dda377cSYan, Zheng 			 */
30815e3ded1bSYan, Zheng 			ret = __ceph_do_getattr(inode, NULL,
30825dda377cSYan, Zheng 						CEPH_STAT_CAP_INLINE_DATA,
30835dda377cSYan, Zheng 						true);
3084c4d4a582SYan, Zheng 			if (ret < 0)
3085a8599bd8SSage Weil 				return ret;
30865dda377cSYan, Zheng 			continue;
3087c4d4a582SYan, Zheng 		}
30885dda377cSYan, Zheng 		break;
30895dda377cSYan, Zheng 	}
3090c4d4a582SYan, Zheng 	*got = _got;
3091c4d4a582SYan, Zheng 	return 0;
3092a8599bd8SSage Weil }
3093a8599bd8SSage Weil 
/*
 * Acquire cap references on behalf of an open file.
 *
 * Thin wrapper around __ceph_get_caps(): the inode is taken from @filp
 * and the per-open ceph_file_info from @filp->private_data; @need, @want,
 * @endoff and @got are passed through untouched.  The return value is
 * whatever __ceph_get_caps() returns.
 */
int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
		  int *got)
{
	return __ceph_get_caps(file_inode(filp), filp->private_data,
			       need, want, endoff, got);
}
31025c64737dSXiubo Li 
3103a8599bd8SSage Weil /*
3104a8599bd8SSage Weil  * Take cap refs.  Caller must already know we hold at least one ref
3105a8599bd8SSage Weil  * on the caps in question or we don't know this is safe.
3106a8599bd8SSage Weil  */
ceph_get_cap_refs(struct ceph_inode_info * ci,int caps)3107a8599bd8SSage Weil void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
3108a8599bd8SSage Weil {
3109be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
311040dcf75eSJeff Layton 	ceph_take_cap_refs(ci, caps, false);
3111be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
3112a8599bd8SSage Weil }
3113a8599bd8SSage Weil 
311486056090SYan, Zheng 
311586056090SYan, Zheng /*
311686056090SYan, Zheng  * drop cap_snap that is not associated with any snapshot.
311786056090SYan, Zheng  * we don't need to send FLUSHSNAP message for it.
311886056090SYan, Zheng  */
ceph_try_drop_cap_snap(struct ceph_inode_info * ci,struct ceph_cap_snap * capsnap)311970220ac8SYan, Zheng static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
312070220ac8SYan, Zheng 				  struct ceph_cap_snap *capsnap)
312186056090SYan, Zheng {
312286056090SYan, Zheng 	if (!capsnap->need_flush &&
312386056090SYan, Zheng 	    !capsnap->writing && !capsnap->dirty_pages) {
312486056090SYan, Zheng 		dout("dropping cap_snap %p follows %llu\n",
312586056090SYan, Zheng 		     capsnap, capsnap->follows);
31260e294387SYan, Zheng 		BUG_ON(capsnap->cap_flush.tid > 0);
312786056090SYan, Zheng 		ceph_put_snap_context(capsnap->context);
312870220ac8SYan, Zheng 		if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
312970220ac8SYan, Zheng 			ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
313070220ac8SYan, Zheng 
313186056090SYan, Zheng 		list_del(&capsnap->ci_item);
313286056090SYan, Zheng 		ceph_put_cap_snap(capsnap);
313386056090SYan, Zheng 		return 1;
313486056090SYan, Zheng 	}
313586056090SYan, Zheng 	return 0;
313686056090SYan, Zheng }
313786056090SYan, Zheng 
/*
 * How __ceph_put_cap_refs() follows up after dropping references.
 */
enum put_cap_refs_mode {
	/* call ceph_check_caps() / ceph_flush_snaps() directly */
	PUT_CAP_REFS_SYNC	= 0,
	/* drop the refs only; no cap check or snap flush is triggered */
	PUT_CAP_REFS_NO_CHECK	= 1,
	/* defer via ceph_queue_check_caps() / ceph_queue_flush_snaps() */
	PUT_CAP_REFS_ASYNC	= 2,
};
3143a8810cdcSJeff Layton 
3144a8599bd8SSage Weil /*
3145a8599bd8SSage Weil  * Release cap refs.
3146a8599bd8SSage Weil  *
3147a8599bd8SSage Weil  * If we released the last ref on any given cap, call ceph_check_caps
3148a8599bd8SSage Weil  * to release (or schedule a release).
3149a8599bd8SSage Weil  *
3150a8599bd8SSage Weil  * If we are releasing a WR cap (from a sync write), finalize any affected
3151a8599bd8SSage Weil  * cap_snap, and wake up any waiters.
3152a8599bd8SSage Weil  */
__ceph_put_cap_refs(struct ceph_inode_info * ci,int had,enum put_cap_refs_mode mode)3153e64f44a8SXiubo Li static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
3154a8810cdcSJeff Layton 				enum put_cap_refs_mode mode)
3155a8599bd8SSage Weil {
3156874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
3157a8599bd8SSage Weil 	int last = 0, put = 0, flushsnaps = 0, wake = 0;
3158558b4510SXiubo Li 	bool check_flushsnaps = false;
3159a8599bd8SSage Weil 
3160be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
3161a8599bd8SSage Weil 	if (had & CEPH_CAP_PIN)
3162a8599bd8SSage Weil 		--ci->i_pin_ref;
3163a8599bd8SSage Weil 	if (had & CEPH_CAP_FILE_RD)
3164a8599bd8SSage Weil 		if (--ci->i_rd_ref == 0)
3165a8599bd8SSage Weil 			last++;
3166a8599bd8SSage Weil 	if (had & CEPH_CAP_FILE_CACHE)
3167a8599bd8SSage Weil 		if (--ci->i_rdcache_ref == 0)
3168a8599bd8SSage Weil 			last++;
3169f85122afSJeff Layton 	if (had & CEPH_CAP_FILE_EXCL)
3170f85122afSJeff Layton 		if (--ci->i_fx_ref == 0)
3171f85122afSJeff Layton 			last++;
3172a8599bd8SSage Weil 	if (had & CEPH_CAP_FILE_BUFFER) {
3173d3d0720dSHenry C Chang 		if (--ci->i_wb_ref == 0) {
3174a8599bd8SSage Weil 			last++;
3175558b4510SXiubo Li 			/* put the ref held by ceph_take_cap_refs() */
3176a8599bd8SSage Weil 			put++;
3177558b4510SXiubo Li 			check_flushsnaps = true;
3178a8599bd8SSage Weil 		}
3179d3d0720dSHenry C Chang 		dout("put_cap_refs %p wb %d -> %d (?)\n",
3180d3d0720dSHenry C Chang 		     inode, ci->i_wb_ref+1, ci->i_wb_ref);
3181a8599bd8SSage Weil 	}
3182558b4510SXiubo Li 	if (had & CEPH_CAP_FILE_WR) {
3183a8599bd8SSage Weil 		if (--ci->i_wr_ref == 0) {
31842d12ad95SXiubo Li 			/*
31852d12ad95SXiubo Li 			 * The Fb caps will always be took and released
31862d12ad95SXiubo Li 			 * together with the Fw caps.
31872d12ad95SXiubo Li 			 */
31882d12ad95SXiubo Li 			WARN_ON_ONCE(ci->i_wb_ref);
31892d12ad95SXiubo Li 
3190a8599bd8SSage Weil 			last++;
3191558b4510SXiubo Li 			check_flushsnaps = true;
31925dda377cSYan, Zheng 			if (ci->i_wrbuffer_ref_head == 0 &&
31935dda377cSYan, Zheng 			    ci->i_dirty_caps == 0 &&
31945dda377cSYan, Zheng 			    ci->i_flushing_caps == 0) {
31955dda377cSYan, Zheng 				BUG_ON(!ci->i_head_snapc);
31965dda377cSYan, Zheng 				ceph_put_snap_context(ci->i_head_snapc);
31975dda377cSYan, Zheng 				ci->i_head_snapc = NULL;
31985dda377cSYan, Zheng 			}
3199db40cc17SYan, Zheng 			/* see comment in __ceph_remove_cap() */
3200bd84fbcbSXiubo Li 			if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
32010ba92e1cSJeff Layton 				ceph_change_snap_realm(inode, NULL);
3202a8599bd8SSage Weil 		}
3203558b4510SXiubo Li 	}
3204558b4510SXiubo Li 	if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
3205558b4510SXiubo Li 		struct ceph_cap_snap *capsnap =
3206558b4510SXiubo Li 			list_last_entry(&ci->i_cap_snaps,
3207558b4510SXiubo Li 					struct ceph_cap_snap,
3208558b4510SXiubo Li 					ci_item);
3209558b4510SXiubo Li 
3210558b4510SXiubo Li 		capsnap->writing = 0;
3211558b4510SXiubo Li 		if (ceph_try_drop_cap_snap(ci, capsnap))
3212558b4510SXiubo Li 			/* put the ref held by ceph_queue_cap_snap() */
3213558b4510SXiubo Li 			put++;
3214558b4510SXiubo Li 		else if (__ceph_finish_cap_snap(ci, capsnap))
3215558b4510SXiubo Li 			flushsnaps = 1;
3216558b4510SXiubo Li 		wake = 1;
3217558b4510SXiubo Li 	}
3218be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
3219a8599bd8SSage Weil 
3220819ccbfaSSage Weil 	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
3221819ccbfaSSage Weil 	     last ? " last" : "", put ? " put" : "");
3222a8599bd8SSage Weil 
3223a8810cdcSJeff Layton 	switch (mode) {
3224a8810cdcSJeff Layton 	case PUT_CAP_REFS_SYNC:
322564f36da5SJeff Layton 		if (last)
3226e4b731ccSXiubo Li 			ceph_check_caps(ci, 0);
3227a8599bd8SSage Weil 		else if (flushsnaps)
3228ed9b430cSYan, Zheng 			ceph_flush_snaps(ci, NULL);
3229a8810cdcSJeff Layton 		break;
3230a8810cdcSJeff Layton 	case PUT_CAP_REFS_ASYNC:
3231a8810cdcSJeff Layton 		if (last)
3232a8810cdcSJeff Layton 			ceph_queue_check_caps(inode);
3233a8810cdcSJeff Layton 		else if (flushsnaps)
3234a8810cdcSJeff Layton 			ceph_queue_flush_snaps(inode);
3235a8810cdcSJeff Layton 		break;
3236a8810cdcSJeff Layton 	default:
3237a8810cdcSJeff Layton 		break;
323864f36da5SJeff Layton 	}
3239a8599bd8SSage Weil 	if (wake)
324003066f23SYehuda Sadeh 		wake_up_all(&ci->i_cap_wq);
324186056090SYan, Zheng 	while (put-- > 0)
3242a8599bd8SSage Weil 		iput(inode);
3243a8599bd8SSage Weil }
3244a8599bd8SSage Weil 
ceph_put_cap_refs(struct ceph_inode_info * ci,int had)3245e64f44a8SXiubo Li void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3246e64f44a8SXiubo Li {
3247a8810cdcSJeff Layton 	__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
3248a8810cdcSJeff Layton }
3249a8810cdcSJeff Layton 
ceph_put_cap_refs_async(struct ceph_inode_info * ci,int had)3250a8810cdcSJeff Layton void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
3251a8810cdcSJeff Layton {
3252a8810cdcSJeff Layton 	__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
3253e64f44a8SXiubo Li }
3254e64f44a8SXiubo Li 
ceph_put_cap_refs_no_check_caps(struct ceph_inode_info * ci,int had)3255e64f44a8SXiubo Li void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
3256e64f44a8SXiubo Li {
3257a8810cdcSJeff Layton 	__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
3258e64f44a8SXiubo Li }
3259e64f44a8SXiubo Li 
3260a8599bd8SSage Weil /*
3261a8599bd8SSage Weil  * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3262a8599bd8SSage Weil  * context.  Adjust per-snap dirty page accounting as appropriate.
3263a8599bd8SSage Weil  * Once all dirty data for a cap_snap is flushed, flush snapped file
3264a8599bd8SSage Weil  * metadata back to the MDS.  If we dropped the last ref, call
3265a8599bd8SSage Weil  * ceph_check_caps.
3266a8599bd8SSage Weil  */
ceph_put_wrbuffer_cap_refs(struct ceph_inode_info * ci,int nr,struct ceph_snap_context * snapc)3267a8599bd8SSage Weil void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
3268a8599bd8SSage Weil 				struct ceph_snap_context *snapc)
3269a8599bd8SSage Weil {
3270874c8ca1SDavid Howells 	struct inode *inode = &ci->netfs.inode;
32713ffa9d6fSJakob Koschel 	struct ceph_cap_snap *capsnap = NULL, *iter;
327270220ac8SYan, Zheng 	int put = 0;
327370220ac8SYan, Zheng 	bool last = false;
327470220ac8SYan, Zheng 	bool flush_snaps = false;
327570220ac8SYan, Zheng 	bool complete_capsnap = false;
3276a8599bd8SSage Weil 
3277be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
3278a8599bd8SSage Weil 	ci->i_wrbuffer_ref -= nr;
327970220ac8SYan, Zheng 	if (ci->i_wrbuffer_ref == 0) {
328070220ac8SYan, Zheng 		last = true;
328170220ac8SYan, Zheng 		put++;
328270220ac8SYan, Zheng 	}
3283a8599bd8SSage Weil 
3284a8599bd8SSage Weil 	if (ci->i_head_snapc == snapc) {
3285a8599bd8SSage Weil 		ci->i_wrbuffer_ref_head -= nr;
32867d8cb26dSSage Weil 		if (ci->i_wrbuffer_ref_head == 0 &&
32875dda377cSYan, Zheng 		    ci->i_wr_ref == 0 &&
32885dda377cSYan, Zheng 		    ci->i_dirty_caps == 0 &&
32895dda377cSYan, Zheng 		    ci->i_flushing_caps == 0) {
32907d8cb26dSSage Weil 			BUG_ON(!ci->i_head_snapc);
3291a8599bd8SSage Weil 			ceph_put_snap_context(ci->i_head_snapc);
3292a8599bd8SSage Weil 			ci->i_head_snapc = NULL;
3293a8599bd8SSage Weil 		}
3294a8599bd8SSage Weil 		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
3295a8599bd8SSage Weil 		     inode,
3296a8599bd8SSage Weil 		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
3297a8599bd8SSage Weil 		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
3298a8599bd8SSage Weil 		     last ? " LAST" : "");
3299a8599bd8SSage Weil 	} else {
33003ffa9d6fSJakob Koschel 		list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
33013ffa9d6fSJakob Koschel 			if (iter->context == snapc) {
33023ffa9d6fSJakob Koschel 				capsnap = iter;
3303a8599bd8SSage Weil 				break;
3304a8599bd8SSage Weil 			}
3305a8599bd8SSage Weil 		}
3306a6d37ccdSXiubo Li 
33073ffa9d6fSJakob Koschel 		if (!capsnap) {
3308a6d37ccdSXiubo Li 			/*
3309a6d37ccdSXiubo Li 			 * The capsnap should already be removed when removing
3310a6d37ccdSXiubo Li 			 * auth cap in the case of a forced unmount.
3311a6d37ccdSXiubo Li 			 */
3312a6d37ccdSXiubo Li 			WARN_ON_ONCE(ci->i_auth_cap);
3313a6d37ccdSXiubo Li 			goto unlock;
3314a6d37ccdSXiubo Li 		}
3315a6d37ccdSXiubo Li 
3316819ccbfaSSage Weil 		capsnap->dirty_pages -= nr;
3317819ccbfaSSage Weil 		if (capsnap->dirty_pages == 0) {
331870220ac8SYan, Zheng 			complete_capsnap = true;
331970220ac8SYan, Zheng 			if (!capsnap->writing) {
332070220ac8SYan, Zheng 				if (ceph_try_drop_cap_snap(ci, capsnap)) {
332170220ac8SYan, Zheng 					put++;
332270220ac8SYan, Zheng 				} else {
332370220ac8SYan, Zheng 					ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
332470220ac8SYan, Zheng 					flush_snaps = true;
332570220ac8SYan, Zheng 				}
332670220ac8SYan, Zheng 			}
3327819ccbfaSSage Weil 		}
3328a8599bd8SSage Weil 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
332986056090SYan, Zheng 		     " snap %lld %d/%d -> %d/%d %s%s\n",
3330a8599bd8SSage Weil 		     inode, capsnap, capsnap->context->seq,
3331a8599bd8SSage Weil 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3332a8599bd8SSage Weil 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
3333a8599bd8SSage Weil 		     last ? " (wrbuffer last)" : "",
333486056090SYan, Zheng 		     complete_capsnap ? " (complete capsnap)" : "");
3335a8599bd8SSage Weil 	}
3336a8599bd8SSage Weil 
3337a6d37ccdSXiubo Li unlock:
3338be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
3339a8599bd8SSage Weil 
3340a8599bd8SSage Weil 	if (last) {
3341e4b731ccSXiubo Li 		ceph_check_caps(ci, 0);
334270220ac8SYan, Zheng 	} else if (flush_snaps) {
3343ed9b430cSYan, Zheng 		ceph_flush_snaps(ci, NULL);
3344a8599bd8SSage Weil 	}
334570220ac8SYan, Zheng 	if (complete_capsnap)
334670220ac8SYan, Zheng 		wake_up_all(&ci->i_cap_wq);
33473e1d0452SYan, Zheng 	while (put-- > 0) {
334823c2c76eSJeff Layton 		iput(inode);
33493e1d0452SYan, Zheng 	}
3350a8599bd8SSage Weil }
3351a8599bd8SSage Weil 
3352a8599bd8SSage Weil /*
3353ca20c991SYan, Zheng  * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
3354ca20c991SYan, Zheng  */
invalidate_aliases(struct inode * inode)3355ca20c991SYan, Zheng static void invalidate_aliases(struct inode *inode)
3356ca20c991SYan, Zheng {
3357ca20c991SYan, Zheng 	struct dentry *dn, *prev = NULL;
3358ca20c991SYan, Zheng 
3359ca20c991SYan, Zheng 	dout("invalidate_aliases inode %p\n", inode);
3360ca20c991SYan, Zheng 	d_prune_aliases(inode);
3361ca20c991SYan, Zheng 	/*
3362ca20c991SYan, Zheng 	 * For non-directory inode, d_find_alias() only returns
3363fc12c80aSJ. Bruce Fields 	 * hashed dentry. After calling d_invalidate(), the
3364fc12c80aSJ. Bruce Fields 	 * dentry becomes unhashed.
3365ca20c991SYan, Zheng 	 *
3366a8d436f0SYan, Zheng 	 * For directory inode, d_find_alias() can return
3367fc12c80aSJ. Bruce Fields 	 * unhashed dentry. But directory inode should have
3368ca20c991SYan, Zheng 	 * one alias at most.
3369ca20c991SYan, Zheng 	 */
3370ca20c991SYan, Zheng 	while ((dn = d_find_alias(inode))) {
3371ca20c991SYan, Zheng 		if (dn == prev) {
3372ca20c991SYan, Zheng 			dput(dn);
3373ca20c991SYan, Zheng 			break;
3374ca20c991SYan, Zheng 		}
3375a8d436f0SYan, Zheng 		d_invalidate(dn);
3376ca20c991SYan, Zheng 		if (prev)
3377ca20c991SYan, Zheng 			dput(prev);
3378ca20c991SYan, Zheng 		prev = dn;
3379ca20c991SYan, Zheng 	}
3380ca20c991SYan, Zheng 	if (prev)
3381ca20c991SYan, Zheng 		dput(prev);
3382ca20c991SYan, Zheng }
3383ca20c991SYan, Zheng 
3384a1c6b835SYan, Zheng struct cap_extra_info {
3385a1c6b835SYan, Zheng 	struct ceph_string *pool_ns;
3386a1c6b835SYan, Zheng 	/* inline data */
3387a1c6b835SYan, Zheng 	u64 inline_version;
3388a1c6b835SYan, Zheng 	void *inline_data;
3389a1c6b835SYan, Zheng 	u32 inline_len;
33904985d6f9SYan, Zheng 	/* dirstat */
33914985d6f9SYan, Zheng 	bool dirstat_valid;
33924985d6f9SYan, Zheng 	u64 nfiles;
33934985d6f9SYan, Zheng 	u64 nsubdirs;
3394176c77c9SJeff Layton 	u64 change_attr;
3395a1c6b835SYan, Zheng 	/* currently issued */
3396a1c6b835SYan, Zheng 	int issued;
3397ec62b894SJeff Layton 	struct timespec64 btime;
33980d91f0adSJeff Layton 	u8 *fscrypt_auth;
33990d91f0adSJeff Layton 	u32 fscrypt_auth_len;
34000d91f0adSJeff Layton 	u64 fscrypt_file_size;
3401a1c6b835SYan, Zheng };
3402a1c6b835SYan, Zheng 
3403ca20c991SYan, Zheng /*
3404a8599bd8SSage Weil  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
3405a8599bd8SSage Weil  * actually be a revocation if it specifies a smaller cap set.)
3406a8599bd8SSage Weil  *
3407be655596SSage Weil  * caller holds s_mutex and i_ceph_lock, we drop both.
3408a8599bd8SSage Weil  */
handle_cap_grant(struct inode * inode,struct ceph_mds_session * session,struct ceph_cap * cap,struct ceph_mds_caps * grant,struct ceph_buffer * xattr_buf,struct cap_extra_info * extra_info)3409a1c6b835SYan, Zheng static void handle_cap_grant(struct inode *inode,
3410a8599bd8SSage Weil 			     struct ceph_mds_session *session,
3411a1c6b835SYan, Zheng 			     struct ceph_cap *cap,
3412a1c6b835SYan, Zheng 			     struct ceph_mds_caps *grant,
3413a1c6b835SYan, Zheng 			     struct ceph_buffer *xattr_buf,
3414a1c6b835SYan, Zheng 			     struct cap_extra_info *extra_info)
3415be655596SSage Weil 	__releases(ci->i_ceph_lock)
3416a1c6b835SYan, Zheng 	__releases(session->s_mdsc->snap_rwsem)
3417a8599bd8SSage Weil {
3418a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
34192f56f56aSSage Weil 	int seq = le32_to_cpu(grant->seq);
3420a8599bd8SSage Weil 	int newcaps = le32_to_cpu(grant->caps);
34212cd698beSYan, Zheng 	int used, wanted, dirty;
3422a8599bd8SSage Weil 	u64 size = le64_to_cpu(grant->size);
3423a8599bd8SSage Weil 	u64 max_size = le64_to_cpu(grant->max_size);
3424fdac94faSYan, Zheng 	unsigned char check_caps = 0;
342552d60f8eSJeff Layton 	bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
3426ab6c2c3eSFabian Frederick 	bool wake = false;
3427ab6c2c3eSFabian Frederick 	bool writeback = false;
3428ab6c2c3eSFabian Frederick 	bool queue_trunc = false;
3429ab6c2c3eSFabian Frederick 	bool queue_invalidate = false;
3430ab6c2c3eSFabian Frederick 	bool deleted_inode = false;
343131c542a1SYan, Zheng 	bool fill_inline = false;
3432a8599bd8SSage Weil 
34330d91f0adSJeff Layton 	/*
34340d91f0adSJeff Layton 	 * If there is at least one crypto block then we'll trust
34350d91f0adSJeff Layton 	 * fscrypt_file_size. If the real length of the file is 0, then
34360d91f0adSJeff Layton 	 * ignore it (it has probably been truncated down to 0 by the MDS).
34370d91f0adSJeff Layton 	 */
34380d91f0adSJeff Layton 	if (IS_ENCRYPTED(inode) && size)
34390d91f0adSJeff Layton 		size = extra_info->fscrypt_file_size;
34400d91f0adSJeff Layton 
34412f56f56aSSage Weil 	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
3442a1c6b835SYan, Zheng 	     inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
3443a8599bd8SSage Weil 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
34442d6795fbSJeff Layton 		i_size_read(inode));
3445a8599bd8SSage Weil 
344611df2dfbSYan, Zheng 
344711df2dfbSYan, Zheng 	/*
3448a8599bd8SSage Weil 	 * If CACHE is being revoked, and we have no dirty buffers,
3449a8599bd8SSage Weil 	 * try to invalidate (once).  (If there are dirty buffers, we
3450a8599bd8SSage Weil 	 * will invalidate _after_ writeback.)
3451a8599bd8SSage Weil 	 */
3452525d15e8SYan, Zheng 	if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
3453fdd4e158SYan, Zheng 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
34543b454c49SSage Weil 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
34559abd4db7SYan, Zheng 	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3456e9075743SLi Wang 		if (try_nonblocking_invalidate(inode)) {
3457a8599bd8SSage Weil 			/* there were locked pages.. invalidate later
3458a8599bd8SSage Weil 			   in a separate thread. */
3459a8599bd8SSage Weil 			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3460ab6c2c3eSFabian Frederick 				queue_invalidate = true;
3461a8599bd8SSage Weil 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
3462a8599bd8SSage Weil 			}
3463a8599bd8SSage Weil 		}
3464a8599bd8SSage Weil 	}
3465a8599bd8SSage Weil 
3466d2f8bb27SYan, Zheng 	if (was_stale)
3467d2f8bb27SYan, Zheng 		cap->issued = cap->implemented = CEPH_CAP_PIN;
3468d2f8bb27SYan, Zheng 
3469d2f8bb27SYan, Zheng 	/*
3470d2f8bb27SYan, Zheng 	 * auth mds of the inode changed. we received the cap export message,
3471d2f8bb27SYan, Zheng 	 * but still haven't received the cap import message. handle_cap_export
3472d2f8bb27SYan, Zheng 	 * updated the new auth MDS' cap.
3473d2f8bb27SYan, Zheng 	 *
3474d2f8bb27SYan, Zheng 	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3475d2f8bb27SYan, Zheng 	 * that was sent before the cap import message. So don't remove caps.
3476d2f8bb27SYan, Zheng 	 */
3477d2f8bb27SYan, Zheng 	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3478d2f8bb27SYan, Zheng 		WARN_ON(cap != ci->i_auth_cap);
3479d2f8bb27SYan, Zheng 		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3480d2f8bb27SYan, Zheng 		seq = cap->seq;
3481d2f8bb27SYan, Zheng 		newcaps |= cap->issued;
3482d2f8bb27SYan, Zheng 	}
3483d2f8bb27SYan, Zheng 
3484a8599bd8SSage Weil 	/* side effects now are allowed */
348552d60f8eSJeff Layton 	cap->cap_gen = atomic_read(&session->s_cap_gen);
348611df2dfbSYan, Zheng 	cap->seq = seq;
3487a8599bd8SSage Weil 
3488a8599bd8SSage Weil 	__check_cap_issue(ci, cap, newcaps);
3489a8599bd8SSage Weil 
3490176c77c9SJeff Layton 	inode_set_max_iversion_raw(inode, extra_info->change_attr);
3491176c77c9SJeff Layton 
3492f98a128aSYan, Zheng 	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
3493a1c6b835SYan, Zheng 	    (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
3494ed94f87cSJeff Layton 		umode_t mode = le32_to_cpu(grant->mode);
3495ed94f87cSJeff Layton 
3496ed94f87cSJeff Layton 		if (inode_wrong_type(inode, mode))
3497ed94f87cSJeff Layton 			pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
3498ed94f87cSJeff Layton 				     ceph_vinop(inode), inode->i_mode, mode);
3499ed94f87cSJeff Layton 		else
3500ed94f87cSJeff Layton 			inode->i_mode = mode;
350105cb11c1SEric W. Biederman 		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
350205cb11c1SEric W. Biederman 		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
3503ec62b894SJeff Layton 		ci->i_btime = extra_info->btime;
3504a8599bd8SSage Weil 		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
3505bd2bae6aSEric W. Biederman 		     from_kuid(&init_user_ns, inode->i_uid),
3506bd2bae6aSEric W. Biederman 		     from_kgid(&init_user_ns, inode->i_gid));
35070d91f0adSJeff Layton #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
35080d91f0adSJeff Layton 		if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
35090d91f0adSJeff Layton 		    memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
35100d91f0adSJeff Layton 			   ci->fscrypt_auth_len))
35110d91f0adSJeff Layton 			pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
35120d91f0adSJeff Layton 				__func__, ci->fscrypt_auth_len,
35130d91f0adSJeff Layton 				extra_info->fscrypt_auth_len);
35140d91f0adSJeff Layton #endif
3515a8599bd8SSage Weil 	}
3516a8599bd8SSage Weil 
3517fa466743SYan, Zheng 	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
3518a1c6b835SYan, Zheng 	    (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
3519bfe86848SMiklos Szeredi 		set_nlink(inode, le32_to_cpu(grant->nlink));
352076bdbc7aSXiubo Li 		if (inode->i_nlink == 0)
3521ab6c2c3eSFabian Frederick 			deleted_inode = true;
3522ca20c991SYan, Zheng 	}
3523a8599bd8SSage Weil 
3524a1c6b835SYan, Zheng 	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
3525a1c6b835SYan, Zheng 	    grant->xattr_len) {
3526a8599bd8SSage Weil 		int len = le32_to_cpu(grant->xattr_len);
3527a8599bd8SSage Weil 		u64 version = le64_to_cpu(grant->xattr_version);
3528a8599bd8SSage Weil 
3529a8599bd8SSage Weil 		if (version > ci->i_xattrs.version) {
3530a8599bd8SSage Weil 			dout(" got new xattrs v%llu on %p len %d\n",
3531a8599bd8SSage Weil 			     version, inode, len);
3532a8599bd8SSage Weil 			if (ci->i_xattrs.blob)
3533a8599bd8SSage Weil 				ceph_buffer_put(ci->i_xattrs.blob);
3534a8599bd8SSage Weil 			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
3535a8599bd8SSage Weil 			ci->i_xattrs.version = version;
35367221fe4cSGuangliang Zhao 			ceph_forget_all_cached_acls(inode);
3537ac6713ccSYan, Zheng 			ceph_security_invalidate_secctx(inode);
3538a8599bd8SSage Weil 		}
3539a8599bd8SSage Weil 	}
3540a8599bd8SSage Weil 
3541f98a128aSYan, Zheng 	if (newcaps & CEPH_CAP_ANY_RD) {
35429bbeab41SArnd Bergmann 		struct timespec64 mtime, atime, ctime;
3543f98a128aSYan, Zheng 		/* ctime/mtime/atime? */
35449bbeab41SArnd Bergmann 		ceph_decode_timespec64(&mtime, &grant->mtime);
35459bbeab41SArnd Bergmann 		ceph_decode_timespec64(&atime, &grant->atime);
35469bbeab41SArnd Bergmann 		ceph_decode_timespec64(&ctime, &grant->ctime);
3547a1c6b835SYan, Zheng 		ceph_fill_file_time(inode, extra_info->issued,
3548f98a128aSYan, Zheng 				    le32_to_cpu(grant->time_warp_seq),
3549f98a128aSYan, Zheng 				    &ctime, &mtime, &atime);
3550f98a128aSYan, Zheng 	}
3551a8599bd8SSage Weil 
35524985d6f9SYan, Zheng 	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
35534985d6f9SYan, Zheng 		ci->i_files = extra_info->nfiles;
35544985d6f9SYan, Zheng 		ci->i_subdirs = extra_info->nsubdirs;
35554985d6f9SYan, Zheng 	}
35564985d6f9SYan, Zheng 
3557f98a128aSYan, Zheng 	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
355811df2dfbSYan, Zheng 		/* file layout may have changed */
35597627151eSYan, Zheng 		s64 old_pool = ci->i_layout.pool_id;
3560779fe0fbSYan, Zheng 		struct ceph_string *old_ns;
3561779fe0fbSYan, Zheng 
35627627151eSYan, Zheng 		ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
3563779fe0fbSYan, Zheng 		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3564779fe0fbSYan, Zheng 					lockdep_is_held(&ci->i_ceph_lock));
3565a1c6b835SYan, Zheng 		rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
3566779fe0fbSYan, Zheng 
3567a1c6b835SYan, Zheng 		if (ci->i_layout.pool_id != old_pool ||
3568a1c6b835SYan, Zheng 		    extra_info->pool_ns != old_ns)
35697627151eSYan, Zheng 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
35705ea5c5e0SYan, Zheng 
3571a1c6b835SYan, Zheng 		extra_info->pool_ns = old_ns;
3572779fe0fbSYan, Zheng 
3573f98a128aSYan, Zheng 		/* size/truncate_seq? */
3574a1c6b835SYan, Zheng 		queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
3575f98a128aSYan, Zheng 					le32_to_cpu(grant->truncate_seq),
3576f98a128aSYan, Zheng 					le64_to_cpu(grant->truncate_size),
3577f98a128aSYan, Zheng 					size);
357884eea8c7SYan, Zheng 	}
357984eea8c7SYan, Zheng 
358084eea8c7SYan, Zheng 	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
358184eea8c7SYan, Zheng 		if (max_size != ci->i_max_size) {
3582f98a128aSYan, Zheng 			dout("max_size %lld -> %llu\n",
3583f98a128aSYan, Zheng 			     ci->i_max_size, max_size);
3584a8599bd8SSage Weil 			ci->i_max_size = max_size;
3585a8599bd8SSage Weil 			if (max_size >= ci->i_wanted_max_size) {
3586a8599bd8SSage Weil 				ci->i_wanted_max_size = 0;  /* reset */
3587a8599bd8SSage Weil 				ci->i_requested_max_size = 0;
3588a8599bd8SSage Weil 			}
3589ab6c2c3eSFabian Frederick 			wake = true;
3590a8599bd8SSage Weil 		}
3591f98a128aSYan, Zheng 	}
3592a8599bd8SSage Weil 
3593a8599bd8SSage Weil 	/* check cap bits */
3594a8599bd8SSage Weil 	wanted = __ceph_caps_wanted(ci);
3595a8599bd8SSage Weil 	used = __ceph_caps_used(ci);
3596a8599bd8SSage Weil 	dirty = __ceph_caps_dirty(ci);
3597a8599bd8SSage Weil 	dout(" my wanted = %s, used = %s, dirty %s\n",
3598a8599bd8SSage Weil 	     ceph_cap_string(wanted),
3599a8599bd8SSage Weil 	     ceph_cap_string(used),
3600a8599bd8SSage Weil 	     ceph_cap_string(dirty));
3601fdac94faSYan, Zheng 
3602fdac94faSYan, Zheng 	if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3603fdac94faSYan, Zheng 	    (wanted & ~(cap->mds_wanted | newcaps))) {
3604fdac94faSYan, Zheng 		/*
3605fdac94faSYan, Zheng 		 * If mds is importing cap, prior cap messages that update
3606fdac94faSYan, Zheng 		 * 'wanted' may get dropped by mds (migrate seq mismatch).
3607fdac94faSYan, Zheng 		 *
3608fdac94faSYan, Zheng 		 * We don't send cap message to update 'wanted' if what we
3609fdac94faSYan, Zheng 		 * want are already issued. If mds revokes caps, cap message
3610fdac94faSYan, Zheng 		 * that releases caps also tells mds what we want. But if
3611fdac94faSYan, Zheng 		 * caps got revoked by mds forcedly (session stale). We may
3612fdac94faSYan, Zheng 		 * haven't told mds what we want.
3613fdac94faSYan, Zheng 		 */
3614390306c3SYan, Zheng 		check_caps = 1;
3615a8599bd8SSage Weil 	}
3616a8599bd8SSage Weil 
3617a8599bd8SSage Weil 	/* revocation, grant, or no-op? */
3618a8599bd8SSage Weil 	if (cap->issued & ~newcaps) {
36193b454c49SSage Weil 		int revoking = cap->issued & ~newcaps;
36203b454c49SSage Weil 
36213b454c49SSage Weil 		dout("revocation: %s -> %s (revoking %s)\n",
36223b454c49SSage Weil 		     ceph_cap_string(cap->issued),
36233b454c49SSage Weil 		     ceph_cap_string(newcaps),
36243b454c49SSage Weil 		     ceph_cap_string(revoking));
3625525d15e8SYan, Zheng 		if (S_ISREG(inode->i_mode) &&
3626525d15e8SYan, Zheng 		    (revoking & used & CEPH_CAP_FILE_BUFFER))
3627ab6c2c3eSFabian Frederick 			writeback = true;  /* initiate writeback; will delay ack */
3628525d15e8SYan, Zheng 		else if (queue_invalidate &&
3629525d15e8SYan, Zheng 			 revoking == CEPH_CAP_FILE_CACHE &&
3630525d15e8SYan, Zheng 			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
36313b454c49SSage Weil 			; /* do nothing yet, invalidation will be queued */
36323b454c49SSage Weil 		else if (cap == ci->i_auth_cap)
36333b454c49SSage Weil 			check_caps = 1; /* check auth cap only */
36343b454c49SSage Weil 		else
36353b454c49SSage Weil 			check_caps = 2; /* check all caps */
3636f7913573SXiubo Li 		/* If there is new caps, try to wake up the waiters */
3637f7913573SXiubo Li 		if (~cap->issued & newcaps)
3638f7913573SXiubo Li 			wake = true;
3639a8599bd8SSage Weil 		cap->issued = newcaps;
3640978097c9SSage Weil 		cap->implemented |= newcaps;
3641a8599bd8SSage Weil 	} else if (cap->issued == newcaps) {
3642a8599bd8SSage Weil 		dout("caps unchanged: %s -> %s\n",
3643a8599bd8SSage Weil 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
3644a8599bd8SSage Weil 	} else {
3645a8599bd8SSage Weil 		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
3646a8599bd8SSage Weil 		     ceph_cap_string(newcaps));
36476ee6b953SYan, Zheng 		/* non-auth MDS is revoking the newly grant caps ? */
36486ee6b953SYan, Zheng 		if (cap == ci->i_auth_cap &&
36496ee6b953SYan, Zheng 		    __ceph_caps_revoking_other(ci, cap, newcaps))
36506ee6b953SYan, Zheng 		    check_caps = 2;
36516ee6b953SYan, Zheng 
3652a8599bd8SSage Weil 		cap->issued = newcaps;
3653a8599bd8SSage Weil 		cap->implemented |= newcaps; /* add bits only, to
3654a8599bd8SSage Weil 					      * avoid stepping on a
3655a8599bd8SSage Weil 					      * pending revocation */
3656ab6c2c3eSFabian Frederick 		wake = true;
3657a8599bd8SSage Weil 	}
3658978097c9SSage Weil 	BUG_ON(cap->issued & ~cap->implemented);
3659a8599bd8SSage Weil 
3660257e6172SXiubo Li 	/* don't let check_caps skip sending a response to MDS for revoke msgs */
3661257e6172SXiubo Li 	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
3662257e6172SXiubo Li 		cap->mds_wanted = 0;
3663257e6172SXiubo Li 		if (cap == ci->i_auth_cap)
3664257e6172SXiubo Li 			check_caps = 1; /* check auth cap only */
3665257e6172SXiubo Li 		else
3666257e6172SXiubo Li 			check_caps = 2; /* check all caps */
3667257e6172SXiubo Li 	}
3668257e6172SXiubo Li 
3669a1c6b835SYan, Zheng 	if (extra_info->inline_version > 0 &&
3670a1c6b835SYan, Zheng 	    extra_info->inline_version >= ci->i_inline_version) {
3671a1c6b835SYan, Zheng 		ci->i_inline_version = extra_info->inline_version;
367231c542a1SYan, Zheng 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
367331c542a1SYan, Zheng 		    (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
367431c542a1SYan, Zheng 			fill_inline = true;
367531c542a1SYan, Zheng 	}
367631c542a1SYan, Zheng 
367758dd4385SJeff Layton 	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
367858dd4385SJeff Layton 		if (ci->i_auth_cap == cap) {
3679a1c6b835SYan, Zheng 			if (newcaps & ~extra_info->issued)
3680ab6c2c3eSFabian Frederick 				wake = true;
36816f05b30eSYan, Zheng 
36826f05b30eSYan, Zheng 			if (ci->i_requested_max_size > max_size ||
36836f05b30eSYan, Zheng 			    !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
36846f05b30eSYan, Zheng 				/* re-request max_size if necessary */
36856f05b30eSYan, Zheng 				ci->i_requested_max_size = 0;
36866f05b30eSYan, Zheng 				wake = true;
36876f05b30eSYan, Zheng 			}
36886f05b30eSYan, Zheng 
3689e8a4d267SJeff Layton 			ceph_kick_flushing_inode_caps(session, ci);
36902cd698beSYan, Zheng 		}
369158dd4385SJeff Layton 		up_read(&session->s_mdsc->snap_rwsem);
369258dd4385SJeff Layton 	}
369358dd4385SJeff Layton 	spin_unlock(&ci->i_ceph_lock);
36942cd698beSYan, Zheng 
369531c542a1SYan, Zheng 	if (fill_inline)
3696a1c6b835SYan, Zheng 		ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
3697a1c6b835SYan, Zheng 				      extra_info->inline_len);
369831c542a1SYan, Zheng 
369914649758SYan, Zheng 	if (queue_trunc)
3700c6bcda6fSYan, Zheng 		ceph_queue_vmtruncate(inode);
3701c6bcda6fSYan, Zheng 
37023c6f6b79SSage Weil 	if (writeback)
3703a8599bd8SSage Weil 		/*
3704a8599bd8SSage Weil 		 * queue inode for writeback: we can't actually call
3705a8599bd8SSage Weil 		 * filemap_write_and_wait, etc. from message handler
3706a8599bd8SSage Weil 		 * context.
3707a8599bd8SSage Weil 		 */
37083c6f6b79SSage Weil 		ceph_queue_writeback(inode);
37093c6f6b79SSage Weil 	if (queue_invalidate)
37103c6f6b79SSage Weil 		ceph_queue_invalidate(inode);
3711ca20c991SYan, Zheng 	if (deleted_inode)
3712ca20c991SYan, Zheng 		invalidate_aliases(inode);
3713a8599bd8SSage Weil 	if (wake)
371403066f23SYehuda Sadeh 		wake_up_all(&ci->i_cap_wq);
371515637c8bSSage Weil 
37166a92b08fSJeff Layton 	mutex_unlock(&session->s_mutex);
371715637c8bSSage Weil 	if (check_caps == 1)
3718e4b731ccSXiubo Li 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
371915637c8bSSage Weil 	else if (check_caps == 2)
3720e4b731ccSXiubo Li 		ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
3721a8599bd8SSage Weil }
3722a8599bd8SSage Weil 
3723a8599bd8SSage Weil /*
3724a8599bd8SSage Weil  * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3725a8599bd8SSage Weil  * MDS has been safely committed.
3726a8599bd8SSage Weil  */
handle_cap_flush_ack(struct inode * inode,u64 flush_tid,struct ceph_mds_caps * m,struct ceph_mds_session * session,struct ceph_cap * cap)37276df058c0SSage Weil static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3728a8599bd8SSage Weil 				 struct ceph_mds_caps *m,
3729a8599bd8SSage Weil 				 struct ceph_mds_session *session,
3730a8599bd8SSage Weil 				 struct ceph_cap *cap)
3731be655596SSage Weil 	__releases(ci->i_ceph_lock)
3732a8599bd8SSage Weil {
3733a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
3734985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
3735e4500b5eSYan, Zheng 	struct ceph_cap_flush *cf, *tmp_cf;
3736553adfd9SYan, Zheng 	LIST_HEAD(to_remove);
3737a8599bd8SSage Weil 	unsigned seq = le32_to_cpu(m->seq);
3738a8599bd8SSage Weil 	int dirty = le32_to_cpu(m->dirty);
3739a8599bd8SSage Weil 	int cleaned = 0;
3740c8799fc4SYan, Zheng 	bool drop = false;
37417271efa7SThomas Meyer 	bool wake_ci = false;
37427271efa7SThomas Meyer 	bool wake_mdsc = false;
3743a8599bd8SSage Weil 
3744e4500b5eSYan, Zheng 	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3745d7dbfb4fSJeff Layton 		/* Is this the one that was flushed? */
3746553adfd9SYan, Zheng 		if (cf->tid == flush_tid)
3747553adfd9SYan, Zheng 			cleaned = cf->caps;
3748d7dbfb4fSJeff Layton 
3749d7dbfb4fSJeff Layton 		/* Is this a capsnap? */
3750b2f9fa1fSXiubo Li 		if (cf->is_capsnap)
37510e294387SYan, Zheng 			continue;
3752d7dbfb4fSJeff Layton 
3753553adfd9SYan, Zheng 		if (cf->tid <= flush_tid) {
3754d7dbfb4fSJeff Layton 			/*
3755d7dbfb4fSJeff Layton 			 * An earlier or current tid. The FLUSH_ACK should
3756d7dbfb4fSJeff Layton 			 * represent a superset of this flush's caps.
3757d7dbfb4fSJeff Layton 			 */
3758681ac634SJeff Layton 			wake_ci |= __detach_cap_flush_from_ci(ci, cf);
3759e4500b5eSYan, Zheng 			list_add_tail(&cf->i_list, &to_remove);
3760553adfd9SYan, Zheng 		} else {
3761d7dbfb4fSJeff Layton 			/*
3762d7dbfb4fSJeff Layton 			 * This is a later one. Any caps in it are still dirty
3763d7dbfb4fSJeff Layton 			 * so don't count them as cleaned.
3764d7dbfb4fSJeff Layton 			 */
3765553adfd9SYan, Zheng 			cleaned &= ~cf->caps;
3766553adfd9SYan, Zheng 			if (!cleaned)
3767553adfd9SYan, Zheng 				break;
3768553adfd9SYan, Zheng 		}
3769553adfd9SYan, Zheng 	}
3770a8599bd8SSage Weil 
3771a8599bd8SSage Weil 	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
3772a8599bd8SSage Weil 	     " flushing %s -> %s\n",
3773a8599bd8SSage Weil 	     inode, session->s_mds, seq, ceph_cap_string(dirty),
3774a8599bd8SSage Weil 	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
3775a8599bd8SSage Weil 	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
3776a8599bd8SSage Weil 
37778310b089SYan, Zheng 	if (list_empty(&to_remove) && !cleaned)
3778a8599bd8SSage Weil 		goto out;
3779a8599bd8SSage Weil 
3780a8599bd8SSage Weil 	ci->i_flushing_caps &= ~cleaned;
3781a8599bd8SSage Weil 
3782a8599bd8SSage Weil 	spin_lock(&mdsc->cap_dirty_lock);
37838310b089SYan, Zheng 
3784681ac634SJeff Layton 	list_for_each_entry(cf, &to_remove, i_list)
3785681ac634SJeff Layton 		wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
37868310b089SYan, Zheng 
3787a8599bd8SSage Weil 	if (ci->i_flushing_caps == 0) {
37880e294387SYan, Zheng 		if (list_empty(&ci->i_cap_flush_list)) {
3789a8599bd8SSage Weil 			list_del_init(&ci->i_flushing_item);
37900e294387SYan, Zheng 			if (!list_empty(&session->s_cap_flushing)) {
3791a8599bd8SSage Weil 				dout(" mds%d still flushing cap on %p\n",
3792a8599bd8SSage Weil 				     session->s_mds,
37930e294387SYan, Zheng 				     &list_first_entry(&session->s_cap_flushing,
3794a8599bd8SSage Weil 						struct ceph_inode_info,
3795874c8ca1SDavid Howells 						i_flushing_item)->netfs.inode);
37960e294387SYan, Zheng 			}
37970e294387SYan, Zheng 		}
3798a8599bd8SSage Weil 		mdsc->num_cap_flushing--;
3799a8599bd8SSage Weil 		dout(" inode %p now !flushing\n", inode);
3800afcdaea3SSage Weil 
3801afcdaea3SSage Weil 		if (ci->i_dirty_caps == 0) {
3802a8599bd8SSage Weil 			dout(" inode %p now clean\n", inode);
3803afcdaea3SSage Weil 			BUG_ON(!list_empty(&ci->i_dirty_item));
3804c8799fc4SYan, Zheng 			drop = true;
38055dda377cSYan, Zheng 			if (ci->i_wr_ref == 0 &&
38065dda377cSYan, Zheng 			    ci->i_wrbuffer_ref_head == 0) {
38077d8cb26dSSage Weil 				BUG_ON(!ci->i_head_snapc);
38087d8cb26dSSage Weil 				ceph_put_snap_context(ci->i_head_snapc);
38097d8cb26dSSage Weil 				ci->i_head_snapc = NULL;
38107d8cb26dSSage Weil 			}
381176e3b390SSage Weil 		} else {
381276e3b390SSage Weil 			BUG_ON(list_empty(&ci->i_dirty_item));
3813afcdaea3SSage Weil 		}
3814a8599bd8SSage Weil 	}
3815a8599bd8SSage Weil 	spin_unlock(&mdsc->cap_dirty_lock);
3816a8599bd8SSage Weil 
3817a8599bd8SSage Weil out:
3818be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
3819553adfd9SYan, Zheng 
3820553adfd9SYan, Zheng 	while (!list_empty(&to_remove)) {
3821553adfd9SYan, Zheng 		cf = list_first_entry(&to_remove,
3822e4500b5eSYan, Zheng 				      struct ceph_cap_flush, i_list);
3823b2f9fa1fSXiubo Li 		list_del_init(&cf->i_list);
3824b2f9fa1fSXiubo Li 		if (!cf->is_capsnap)
3825f66fd9f0SYan, Zheng 			ceph_free_cap_flush(cf);
3826553adfd9SYan, Zheng 	}
3827c8799fc4SYan, Zheng 
3828c8799fc4SYan, Zheng 	if (wake_ci)
3829c8799fc4SYan, Zheng 		wake_up_all(&ci->i_cap_wq);
3830c8799fc4SYan, Zheng 	if (wake_mdsc)
3831c8799fc4SYan, Zheng 		wake_up_all(&mdsc->cap_flushing_wq);
3832afcdaea3SSage Weil 	if (drop)
3833a8599bd8SSage Weil 		iput(inode);
3834a8599bd8SSage Weil }
3835a8599bd8SSage Weil 
/*
 * Unlink @capsnap from the inode's cap_snap list and detach its embedded
 * cap flush from both the inode and the mds client.
 *
 * Caller must hold ci->i_ceph_lock; mdsc->cap_dirty_lock is taken
 * internally.  @wake_ci and @wake_mdsc may be NULL; when non-NULL they
 * receive the return value of the corresponding detach helper.
 */
void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
			   bool *wake_ci, bool *wake_mdsc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
	struct ceph_cap_flush *cf = &capsnap->cap_flush;
	bool detached;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);

	list_del_init(&capsnap->ci_item);

	detached = __detach_cap_flush_from_ci(ci, cf);
	if (wake_ci)
		*wake_ci = detached;

	spin_lock(&mdsc->cap_dirty_lock);

	/* no flushes left on this inode: take it off the flushing list */
	if (list_empty(&ci->i_cap_flush_list))
		list_del_init(&ci->i_flushing_item);

	detached = __detach_cap_flush_from_mdsc(mdsc, cf);
	if (wake_mdsc)
		*wake_mdsc = detached;

	spin_unlock(&mdsc->cap_dirty_lock);
}
3861a6d37ccdSXiubo Li 
/*
 * Checked wrapper around __ceph_remove_capsnap(): warns (once) if the
 * capsnap still has dirty pages or is still being written, then removes it.
 *
 * Caller must hold the inode's i_ceph_lock.
 */
void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
			 bool *wake_ci, bool *wake_mdsc)
{
	lockdep_assert_held(&ceph_inode(inode)->i_ceph_lock);

	WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
}
3872a6d37ccdSXiubo Li 
3873a8599bd8SSage Weil /*
3874a8599bd8SSage Weil  * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
3875a8599bd8SSage Weil  * throw away our cap_snap.
3876a8599bd8SSage Weil  *
3877a8599bd8SSage Weil  * Caller hold s_mutex.
3878a8599bd8SSage Weil  */
handle_cap_flushsnap_ack(struct inode * inode,u64 flush_tid,struct ceph_mds_caps * m,struct ceph_mds_session * session)38796df058c0SSage Weil static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3880a8599bd8SSage Weil 				     struct ceph_mds_caps *m,
3881a8599bd8SSage Weil 				     struct ceph_mds_session *session)
3882a8599bd8SSage Weil {
3883a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
3884985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
3885a8599bd8SSage Weil 	u64 follows = le64_to_cpu(m->snap_follows);
38863ffa9d6fSJakob Koschel 	struct ceph_cap_snap *capsnap = NULL, *iter;
3887c8799fc4SYan, Zheng 	bool wake_ci = false;
3888c8799fc4SYan, Zheng 	bool wake_mdsc = false;
3889a8599bd8SSage Weil 
3890a8599bd8SSage Weil 	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3891a8599bd8SSage Weil 	     inode, ci, session->s_mds, follows);
3892a8599bd8SSage Weil 
3893be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
38943ffa9d6fSJakob Koschel 	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
38953ffa9d6fSJakob Koschel 		if (iter->follows == follows) {
38963ffa9d6fSJakob Koschel 			if (iter->cap_flush.tid != flush_tid) {
3897a8599bd8SSage Weil 				dout(" cap_snap %p follows %lld tid %lld !="
38983ffa9d6fSJakob Koschel 				     " %lld\n", iter, follows,
38993ffa9d6fSJakob Koschel 				     flush_tid, iter->cap_flush.tid);
3900a8599bd8SSage Weil 				break;
3901a8599bd8SSage Weil 			}
39023ffa9d6fSJakob Koschel 			capsnap = iter;
3903a8599bd8SSage Weil 			break;
3904a8599bd8SSage Weil 		} else {
3905a8599bd8SSage Weil 			dout(" skipping cap_snap %p follows %lld\n",
39063ffa9d6fSJakob Koschel 			     iter, iter->follows);
3907a8599bd8SSage Weil 		}
3908a8599bd8SSage Weil 	}
39093ffa9d6fSJakob Koschel 	if (capsnap)
3910a6d37ccdSXiubo Li 		ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
3911be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
3912a6d37ccdSXiubo Li 
39133ffa9d6fSJakob Koschel 	if (capsnap) {
39140e294387SYan, Zheng 		ceph_put_snap_context(capsnap->context);
39150e294387SYan, Zheng 		ceph_put_cap_snap(capsnap);
3916c8799fc4SYan, Zheng 		if (wake_ci)
3917c8799fc4SYan, Zheng 			wake_up_all(&ci->i_cap_wq);
3918c8799fc4SYan, Zheng 		if (wake_mdsc)
3919c8799fc4SYan, Zheng 			wake_up_all(&mdsc->cap_flushing_wq);
3920a8599bd8SSage Weil 		iput(inode);
3921a8599bd8SSage Weil 	}
39220e294387SYan, Zheng }
3923a8599bd8SSage Weil 
3924a8599bd8SSage Weil /*
3925a8599bd8SSage Weil  * Handle TRUNC from MDS, indicating file truncation.
3926a8599bd8SSage Weil  *
3927a8599bd8SSage Weil  * caller hold s_mutex.
3928a8599bd8SSage Weil  */
handle_cap_trunc(struct inode * inode,struct ceph_mds_caps * trunc,struct ceph_mds_session * session,struct cap_extra_info * extra_info)39297391fba2SJeff Layton static bool handle_cap_trunc(struct inode *inode,
3930a8599bd8SSage Weil 			     struct ceph_mds_caps *trunc,
39310d91f0adSJeff Layton 			     struct ceph_mds_session *session,
39320d91f0adSJeff Layton 			     struct cap_extra_info *extra_info)
3933a8599bd8SSage Weil {
3934a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
3935a8599bd8SSage Weil 	int mds = session->s_mds;
3936a8599bd8SSage Weil 	int seq = le32_to_cpu(trunc->seq);
3937a8599bd8SSage Weil 	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
3938a8599bd8SSage Weil 	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
3939a8599bd8SSage Weil 	u64 size = le64_to_cpu(trunc->size);
3940a8599bd8SSage Weil 	int implemented = 0;
3941a8599bd8SSage Weil 	int dirty = __ceph_caps_dirty(ci);
3942a8599bd8SSage Weil 	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
39437391fba2SJeff Layton 	bool queue_trunc = false;
39447391fba2SJeff Layton 
39457391fba2SJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
3946a8599bd8SSage Weil 
3947a8599bd8SSage Weil 	issued |= implemented | dirty;
3948a8599bd8SSage Weil 
39490d91f0adSJeff Layton 	/*
39500d91f0adSJeff Layton 	 * If there is at least one crypto block then we'll trust
39510d91f0adSJeff Layton 	 * fscrypt_file_size. If the real length of the file is 0, then
39520d91f0adSJeff Layton 	 * ignore it (it has probably been truncated down to 0 by the MDS).
39530d91f0adSJeff Layton 	 */
39540d91f0adSJeff Layton 	if (IS_ENCRYPTED(inode) && size)
39550d91f0adSJeff Layton 		size = extra_info->fscrypt_file_size;
39560d91f0adSJeff Layton 
3957295fc4aaSXiubo Li 	dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
3958295fc4aaSXiubo Li 	     __func__, inode, mds, seq, truncate_size, truncate_seq);
3959a8599bd8SSage Weil 	queue_trunc = ceph_fill_file_size(inode, issued,
3960a8599bd8SSage Weil 					  truncate_seq, truncate_size, size);
39617391fba2SJeff Layton 	return queue_trunc;
3962a8599bd8SSage Weil }
3963a8599bd8SSage Weil 
3964a8599bd8SSage Weil /*
3965a8599bd8SSage Weil  * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
3966a8599bd8SSage Weil  * different one.  If we are the most recent migration we've seen (as
3967a8599bd8SSage Weil  * indicated by mseq), make note of the migrating cap bits for the
3968a8599bd8SSage Weil  * duration (until we see the corresponding IMPORT).
3969a8599bd8SSage Weil  *
3970a8599bd8SSage Weil  * caller holds s_mutex
3971a8599bd8SSage Weil  */
handle_cap_export(struct inode * inode,struct ceph_mds_caps * ex,struct ceph_mds_cap_peer * ph,struct ceph_mds_session * session)3972a8599bd8SSage Weil static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
397311df2dfbSYan, Zheng 			      struct ceph_mds_cap_peer *ph,
397411df2dfbSYan, Zheng 			      struct ceph_mds_session *session)
3975a8599bd8SSage Weil {
3976985b9ee8SXiubo Li 	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
397711df2dfbSYan, Zheng 	struct ceph_mds_session *tsession = NULL;
3978d9df2783SYan, Zheng 	struct ceph_cap *cap, *tcap, *new_cap = NULL;
3979a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
398011df2dfbSYan, Zheng 	u64 t_cap_id;
3981a8599bd8SSage Weil 	unsigned mseq = le32_to_cpu(ex->migrate_seq);
398211df2dfbSYan, Zheng 	unsigned t_seq, t_mseq;
398311df2dfbSYan, Zheng 	int target, issued;
398411df2dfbSYan, Zheng 	int mds = session->s_mds;
3985a8599bd8SSage Weil 
398611df2dfbSYan, Zheng 	if (ph) {
398711df2dfbSYan, Zheng 		t_cap_id = le64_to_cpu(ph->cap_id);
398811df2dfbSYan, Zheng 		t_seq = le32_to_cpu(ph->seq);
398911df2dfbSYan, Zheng 		t_mseq = le32_to_cpu(ph->mseq);
399011df2dfbSYan, Zheng 		target = le32_to_cpu(ph->mds);
399111df2dfbSYan, Zheng 	} else {
399211df2dfbSYan, Zheng 		t_cap_id = t_seq = t_mseq = 0;
399311df2dfbSYan, Zheng 		target = -1;
399411df2dfbSYan, Zheng 	}
3995a8599bd8SSage Weil 
399611df2dfbSYan, Zheng 	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
399711df2dfbSYan, Zheng 	     inode, ci, mds, mseq, target);
399811df2dfbSYan, Zheng retry:
39997f47f7f3SNiels Dossche 	down_read(&mdsc->snap_rwsem);
4000be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
400111df2dfbSYan, Zheng 	cap = __get_cap_for_mds(ci, mds);
4002ca665e02SYan, Zheng 	if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
400311df2dfbSYan, Zheng 		goto out_unlock;
4004a8599bd8SSage Weil 
400511df2dfbSYan, Zheng 	if (target < 0) {
40062e2023e9SXiubo Li 		ceph_remove_cap(mdsc, cap, false);
400711df2dfbSYan, Zheng 		goto out_unlock;
4008a8599bd8SSage Weil 	}
4009a8599bd8SSage Weil 
4010154f42c2SSage Weil 	/*
401111df2dfbSYan, Zheng 	 * now we know we haven't received the cap import message yet
401211df2dfbSYan, Zheng 	 * because the exported cap still exist.
4013154f42c2SSage Weil 	 */
4014db354052SSage Weil 
401511df2dfbSYan, Zheng 	issued = cap->issued;
4016d84b37f9SYan, Zheng 	if (issued != cap->implemented)
4017d84b37f9SYan, Zheng 		pr_err_ratelimited("handle_cap_export: issued != implemented: "
4018d84b37f9SYan, Zheng 				"ino (%llx.%llx) mds%d seq %d mseq %d "
4019d84b37f9SYan, Zheng 				"issued %s implemented %s\n",
4020d84b37f9SYan, Zheng 				ceph_vinop(inode), mds, cap->seq, cap->mseq,
4021d84b37f9SYan, Zheng 				ceph_cap_string(issued),
4022d84b37f9SYan, Zheng 				ceph_cap_string(cap->implemented));
4023d84b37f9SYan, Zheng 
402411df2dfbSYan, Zheng 
402511df2dfbSYan, Zheng 	tcap = __get_cap_for_mds(ci, target);
402611df2dfbSYan, Zheng 	if (tcap) {
402711df2dfbSYan, Zheng 		/* already have caps from the target */
4028fa0aa3b8SYan, Zheng 		if (tcap->cap_id == t_cap_id &&
402911df2dfbSYan, Zheng 		    ceph_seq_cmp(tcap->seq, t_seq) < 0) {
403011df2dfbSYan, Zheng 			dout(" updating import cap %p mds%d\n", tcap, target);
403111df2dfbSYan, Zheng 			tcap->cap_id = t_cap_id;
403211df2dfbSYan, Zheng 			tcap->seq = t_seq - 1;
403311df2dfbSYan, Zheng 			tcap->issue_seq = t_seq - 1;
403411df2dfbSYan, Zheng 			tcap->issued |= issued;
403511df2dfbSYan, Zheng 			tcap->implemented |= issued;
40361cf03a68SJeff Layton 			if (cap == ci->i_auth_cap) {
403711df2dfbSYan, Zheng 				ci->i_auth_cap = tcap;
40381cf03a68SJeff Layton 				change_auth_cap_ses(ci, tcap->session);
4039a8599bd8SSage Weil 			}
4040a8599bd8SSage Weil 		}
40412e2023e9SXiubo Li 		ceph_remove_cap(mdsc, cap, false);
404211df2dfbSYan, Zheng 		goto out_unlock;
4043d9df2783SYan, Zheng 	} else if (tsession) {
404411df2dfbSYan, Zheng 		/* add placeholder for the export tagert */
4045d9df2783SYan, Zheng 		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
404600f06cbaSYan, Zheng 		tcap = new_cap;
4047135e671eSYan, Zheng 		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
4048d9df2783SYan, Zheng 			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
4049d9df2783SYan, Zheng 
405000f06cbaSYan, Zheng 		if (!list_empty(&ci->i_cap_flush_list) &&
405100f06cbaSYan, Zheng 		    ci->i_auth_cap == tcap) {
405200f06cbaSYan, Zheng 			spin_lock(&mdsc->cap_dirty_lock);
405300f06cbaSYan, Zheng 			list_move_tail(&ci->i_flushing_item,
405400f06cbaSYan, Zheng 				       &tcap->session->s_cap_flushing);
405500f06cbaSYan, Zheng 			spin_unlock(&mdsc->cap_dirty_lock);
405600f06cbaSYan, Zheng 		}
405700f06cbaSYan, Zheng 
40582e2023e9SXiubo Li 		ceph_remove_cap(mdsc, cap, false);
4059d9df2783SYan, Zheng 		goto out_unlock;
406011df2dfbSYan, Zheng 	}
4061a8599bd8SSage Weil 
4062be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
40637f47f7f3SNiels Dossche 	up_read(&mdsc->snap_rwsem);
406411df2dfbSYan, Zheng 	mutex_unlock(&session->s_mutex);
406511df2dfbSYan, Zheng 
406611df2dfbSYan, Zheng 	/* open target session */
406711df2dfbSYan, Zheng 	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
406811df2dfbSYan, Zheng 	if (!IS_ERR(tsession)) {
406911df2dfbSYan, Zheng 		if (mds > target) {
407011df2dfbSYan, Zheng 			mutex_lock(&session->s_mutex);
407111df2dfbSYan, Zheng 			mutex_lock_nested(&tsession->s_mutex,
407211df2dfbSYan, Zheng 					  SINGLE_DEPTH_NESTING);
407311df2dfbSYan, Zheng 		} else {
407411df2dfbSYan, Zheng 			mutex_lock(&tsession->s_mutex);
407511df2dfbSYan, Zheng 			mutex_lock_nested(&session->s_mutex,
407611df2dfbSYan, Zheng 					  SINGLE_DEPTH_NESTING);
407711df2dfbSYan, Zheng 		}
4078d9df2783SYan, Zheng 		new_cap = ceph_get_cap(mdsc, NULL);
407911df2dfbSYan, Zheng 	} else {
408011df2dfbSYan, Zheng 		WARN_ON(1);
408111df2dfbSYan, Zheng 		tsession = NULL;
408211df2dfbSYan, Zheng 		target = -1;
40834d8e28ffSWu Bo 		mutex_lock(&session->s_mutex);
408411df2dfbSYan, Zheng 	}
408511df2dfbSYan, Zheng 	goto retry;
408611df2dfbSYan, Zheng 
408711df2dfbSYan, Zheng out_unlock:
408811df2dfbSYan, Zheng 	spin_unlock(&ci->i_ceph_lock);
40897f47f7f3SNiels Dossche 	up_read(&mdsc->snap_rwsem);
409011df2dfbSYan, Zheng 	mutex_unlock(&session->s_mutex);
409111df2dfbSYan, Zheng 	if (tsession) {
409211df2dfbSYan, Zheng 		mutex_unlock(&tsession->s_mutex);
409311df2dfbSYan, Zheng 		ceph_put_mds_session(tsession);
409411df2dfbSYan, Zheng 	}
4095d9df2783SYan, Zheng 	if (new_cap)
4096d9df2783SYan, Zheng 		ceph_put_cap(mdsc, new_cap);
4097a8599bd8SSage Weil }
4098a8599bd8SSage Weil 
4099a8599bd8SSage Weil /*
41002cd698beSYan, Zheng  * Handle cap IMPORT.
4101a8599bd8SSage Weil  *
41022cd698beSYan, Zheng  * caller holds s_mutex. acquires i_ceph_lock
4103a8599bd8SSage Weil  */
handle_cap_import(struct ceph_mds_client * mdsc,struct inode * inode,struct ceph_mds_caps * im,struct ceph_mds_cap_peer * ph,struct ceph_mds_session * session,struct ceph_cap ** target_cap,int * old_issued)4104a8599bd8SSage Weil static void handle_cap_import(struct ceph_mds_client *mdsc,
4105a8599bd8SSage Weil 			      struct inode *inode, struct ceph_mds_caps *im,
41064ee6a914SYan, Zheng 			      struct ceph_mds_cap_peer *ph,
4107a8599bd8SSage Weil 			      struct ceph_mds_session *session,
41082cd698beSYan, Zheng 			      struct ceph_cap **target_cap, int *old_issued)
4109a8599bd8SSage Weil {
4110a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
41112cd698beSYan, Zheng 	struct ceph_cap *cap, *ocap, *new_cap = NULL;
4112a8599bd8SSage Weil 	int mds = session->s_mds;
41132cd698beSYan, Zheng 	int issued;
41142cd698beSYan, Zheng 	unsigned caps = le32_to_cpu(im->caps);
4115a8599bd8SSage Weil 	unsigned wanted = le32_to_cpu(im->wanted);
4116a8599bd8SSage Weil 	unsigned seq = le32_to_cpu(im->seq);
4117a8599bd8SSage Weil 	unsigned mseq = le32_to_cpu(im->migrate_seq);
4118a8599bd8SSage Weil 	u64 realmino = le64_to_cpu(im->realm);
4119a8599bd8SSage Weil 	u64 cap_id = le64_to_cpu(im->cap_id);
41204ee6a914SYan, Zheng 	u64 p_cap_id;
41214ee6a914SYan, Zheng 	int peer;
4122a8599bd8SSage Weil 
41234ee6a914SYan, Zheng 	if (ph) {
41244ee6a914SYan, Zheng 		p_cap_id = le64_to_cpu(ph->cap_id);
41254ee6a914SYan, Zheng 		peer = le32_to_cpu(ph->mds);
4126a8599bd8SSage Weil 	} else {
41274ee6a914SYan, Zheng 		p_cap_id = 0;
41284ee6a914SYan, Zheng 		peer = -1;
4129a8599bd8SSage Weil 	}
4130a8599bd8SSage Weil 
41314ee6a914SYan, Zheng 	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
41324ee6a914SYan, Zheng 	     inode, ci, mds, mseq, peer);
4133d9df2783SYan, Zheng retry:
4134d9df2783SYan, Zheng 	cap = __get_cap_for_mds(ci, mds);
4135d9df2783SYan, Zheng 	if (!cap) {
4136d9df2783SYan, Zheng 		if (!new_cap) {
4137d9df2783SYan, Zheng 			spin_unlock(&ci->i_ceph_lock);
4138d9df2783SYan, Zheng 			new_cap = ceph_get_cap(mdsc, NULL);
413978333233SJeff Layton 			spin_lock(&ci->i_ceph_lock);
4140d9df2783SYan, Zheng 			goto retry;
4141d9df2783SYan, Zheng 		}
41422cd698beSYan, Zheng 		cap = new_cap;
41432cd698beSYan, Zheng 	} else {
41442cd698beSYan, Zheng 		if (new_cap) {
41452cd698beSYan, Zheng 			ceph_put_cap(mdsc, new_cap);
41462cd698beSYan, Zheng 			new_cap = NULL;
41472cd698beSYan, Zheng 		}
4148d9df2783SYan, Zheng 	}
4149d9df2783SYan, Zheng 
41502cd698beSYan, Zheng 	__ceph_caps_issued(ci, &issued);
41512cd698beSYan, Zheng 	issued |= __ceph_caps_dirty(ci);
41522cd698beSYan, Zheng 
4153135e671eSYan, Zheng 	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
4154d9df2783SYan, Zheng 		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
4155d9df2783SYan, Zheng 
41562cd698beSYan, Zheng 	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
41572cd698beSYan, Zheng 	if (ocap && ocap->cap_id == p_cap_id) {
41584ee6a914SYan, Zheng 		dout(" remove export cap %p mds%d flags %d\n",
41592cd698beSYan, Zheng 		     ocap, peer, ph->flags);
41604ee6a914SYan, Zheng 		if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
41612cd698beSYan, Zheng 		    (ocap->seq != le32_to_cpu(ph->seq) ||
41622cd698beSYan, Zheng 		     ocap->mseq != le32_to_cpu(ph->mseq))) {
4163d84b37f9SYan, Zheng 			pr_err_ratelimited("handle_cap_import: "
4164d84b37f9SYan, Zheng 					"mismatched seq/mseq: ino (%llx.%llx) "
4165d84b37f9SYan, Zheng 					"mds%d seq %d mseq %d importer mds%d "
4166d84b37f9SYan, Zheng 					"has peer seq %d mseq %d\n",
41672cd698beSYan, Zheng 					ceph_vinop(inode), peer, ocap->seq,
41682cd698beSYan, Zheng 					ocap->mseq, mds, le32_to_cpu(ph->seq),
41694ee6a914SYan, Zheng 					le32_to_cpu(ph->mseq));
41704ee6a914SYan, Zheng 		}
41712e2023e9SXiubo Li 		ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
41724ee6a914SYan, Zheng 	}
41734ee6a914SYan, Zheng 
41742cd698beSYan, Zheng 	*old_issued = issued;
41752cd698beSYan, Zheng 	*target_cap = cap;
4176a8599bd8SSage Weil }
4177a8599bd8SSage Weil 
41780d91f0adSJeff Layton #ifdef CONFIG_FS_ENCRYPTION
parse_fscrypt_fields(void ** p,void * end,struct cap_extra_info * extra)41790d91f0adSJeff Layton static int parse_fscrypt_fields(void **p, void *end,
41800d91f0adSJeff Layton 				struct cap_extra_info *extra)
41810d91f0adSJeff Layton {
41820d91f0adSJeff Layton 	u32 len;
41830d91f0adSJeff Layton 
41840d91f0adSJeff Layton 	ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
41850d91f0adSJeff Layton 	if (extra->fscrypt_auth_len) {
41860d91f0adSJeff Layton 		ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
41870d91f0adSJeff Layton 		extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len,
41880d91f0adSJeff Layton 					      GFP_KERNEL);
41890d91f0adSJeff Layton 		if (!extra->fscrypt_auth)
41900d91f0adSJeff Layton 			return -ENOMEM;
41910d91f0adSJeff Layton 		ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
41920d91f0adSJeff Layton 					extra->fscrypt_auth_len, bad);
41930d91f0adSJeff Layton 	}
41940d91f0adSJeff Layton 
41950d91f0adSJeff Layton 	ceph_decode_32_safe(p, end, len, bad);
41960d91f0adSJeff Layton 	if (len >= sizeof(u64)) {
41970d91f0adSJeff Layton 		ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
41980d91f0adSJeff Layton 		len -= sizeof(u64);
41990d91f0adSJeff Layton 	}
42000d91f0adSJeff Layton 	ceph_decode_skip_n(p, end, len, bad);
42010d91f0adSJeff Layton 	return 0;
42020d91f0adSJeff Layton bad:
42030d91f0adSJeff Layton 	return -EIO;
42040d91f0adSJeff Layton }
42050d91f0adSJeff Layton #else
parse_fscrypt_fields(void ** p,void * end,struct cap_extra_info * extra)42060d91f0adSJeff Layton static int parse_fscrypt_fields(void **p, void *end,
42070d91f0adSJeff Layton 				struct cap_extra_info *extra)
42080d91f0adSJeff Layton {
42090d91f0adSJeff Layton 	u32 len;
42100d91f0adSJeff Layton 
42110d91f0adSJeff Layton 	/* Don't care about these fields unless we're encryption-capable */
42120d91f0adSJeff Layton 	ceph_decode_32_safe(p, end, len, bad);
42130d91f0adSJeff Layton 	if (len)
42140d91f0adSJeff Layton 		ceph_decode_skip_n(p, end, len, bad);
42150d91f0adSJeff Layton 	ceph_decode_32_safe(p, end, len, bad);
42160d91f0adSJeff Layton 	if (len)
42170d91f0adSJeff Layton 		ceph_decode_skip_n(p, end, len, bad);
42180d91f0adSJeff Layton 	return 0;
42190d91f0adSJeff Layton bad:
42200d91f0adSJeff Layton 	return -EIO;
42210d91f0adSJeff Layton }
42220d91f0adSJeff Layton #endif
42230d91f0adSJeff Layton 
4224a8599bd8SSage Weil /*
4225a8599bd8SSage Weil  * Handle a caps message from the MDS.
4226a8599bd8SSage Weil  *
4227a8599bd8SSage Weil  * Identify the appropriate session, inode, and call the right handler
4228a8599bd8SSage Weil  * based on the cap op.
4229a8599bd8SSage Weil  */
ceph_handle_caps(struct ceph_mds_session * session,struct ceph_msg * msg)4230a8599bd8SSage Weil void ceph_handle_caps(struct ceph_mds_session *session,
4231a8599bd8SSage Weil 		      struct ceph_msg *msg)
4232a8599bd8SSage Weil {
4233a8599bd8SSage Weil 	struct ceph_mds_client *mdsc = session->s_mdsc;
4234a8599bd8SSage Weil 	struct inode *inode;
4235be655596SSage Weil 	struct ceph_inode_info *ci;
4236a8599bd8SSage Weil 	struct ceph_cap *cap;
4237a8599bd8SSage Weil 	struct ceph_mds_caps *h;
42384ee6a914SYan, Zheng 	struct ceph_mds_cap_peer *peer = NULL;
4239779fe0fbSYan, Zheng 	struct ceph_snap_realm *realm = NULL;
4240a1c6b835SYan, Zheng 	int op;
42414985d6f9SYan, Zheng 	int msg_version = le16_to_cpu(msg->hdr.version);
42423d7ded4dSSage Weil 	u32 seq, mseq;
4243a8599bd8SSage Weil 	struct ceph_vino vino;
424470edb55bSSage Weil 	void *snaptrace;
4245ce1fbc8dSSage Weil 	size_t snaptrace_len;
4246fb01d1f8SYan, Zheng 	void *p, *end;
4247a1c6b835SYan, Zheng 	struct cap_extra_info extra_info = {};
42487391fba2SJeff Layton 	bool queue_trunc;
4249a68e564aSXiubo Li 	bool close_sessions = false;
4250ce72d4e0SXiubo Li 	bool do_cap_release = false;
4251a8599bd8SSage Weil 
4252a1c6b835SYan, Zheng 	dout("handle_caps from mds%d\n", session->s_mds);
4253a8599bd8SSage Weil 
4254e3dfcab2SXiubo Li 	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4255e3dfcab2SXiubo Li 		return;
4256e3dfcab2SXiubo Li 
4257a8599bd8SSage Weil 	/* decode */
42584ee6a914SYan, Zheng 	end = msg->front.iov_base + msg->front.iov_len;
4259a8599bd8SSage Weil 	if (msg->front.iov_len < sizeof(*h))
4260a8599bd8SSage Weil 		goto bad;
4261a8599bd8SSage Weil 	h = msg->front.iov_base;
4262a8599bd8SSage Weil 	op = le32_to_cpu(h->op);
4263a8599bd8SSage Weil 	vino.ino = le64_to_cpu(h->ino);
4264a8599bd8SSage Weil 	vino.snap = CEPH_NOSNAP;
4265a8599bd8SSage Weil 	seq = le32_to_cpu(h->seq);
42663d7ded4dSSage Weil 	mseq = le32_to_cpu(h->migrate_seq);
4267a8599bd8SSage Weil 
4268ce1fbc8dSSage Weil 	snaptrace = h + 1;
4269ce1fbc8dSSage Weil 	snaptrace_len = le32_to_cpu(h->snap_trace_len);
4270fb01d1f8SYan, Zheng 	p = snaptrace + snaptrace_len;
4271ce1fbc8dSSage Weil 
42724985d6f9SYan, Zheng 	if (msg_version >= 2) {
4273fb01d1f8SYan, Zheng 		u32 flock_len;
4274ce1fbc8dSSage Weil 		ceph_decode_32_safe(&p, end, flock_len, bad);
42754ee6a914SYan, Zheng 		if (p + flock_len > end)
42764ee6a914SYan, Zheng 			goto bad;
4277fb01d1f8SYan, Zheng 		p += flock_len;
4278ce1fbc8dSSage Weil 	}
4279ce1fbc8dSSage Weil 
42804985d6f9SYan, Zheng 	if (msg_version >= 3) {
42814ee6a914SYan, Zheng 		if (op == CEPH_CAP_OP_IMPORT) {
42824ee6a914SYan, Zheng 			if (p + sizeof(*peer) > end)
42834ee6a914SYan, Zheng 				goto bad;
42844ee6a914SYan, Zheng 			peer = p;
4285fb01d1f8SYan, Zheng 			p += sizeof(*peer);
428611df2dfbSYan, Zheng 		} else if (op == CEPH_CAP_OP_EXPORT) {
428711df2dfbSYan, Zheng 			/* recorded in unused fields */
428811df2dfbSYan, Zheng 			peer = (void *)&h->size;
42894ee6a914SYan, Zheng 		}
42904ee6a914SYan, Zheng 	}
42914ee6a914SYan, Zheng 
42924985d6f9SYan, Zheng 	if (msg_version >= 4) {
4293a1c6b835SYan, Zheng 		ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
4294a1c6b835SYan, Zheng 		ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
4295a1c6b835SYan, Zheng 		if (p + extra_info.inline_len > end)
4296fb01d1f8SYan, Zheng 			goto bad;
4297a1c6b835SYan, Zheng 		extra_info.inline_data = p;
4298a1c6b835SYan, Zheng 		p += extra_info.inline_len;
4299fb01d1f8SYan, Zheng 	}
4300fb01d1f8SYan, Zheng 
43014985d6f9SYan, Zheng 	if (msg_version >= 5) {
430292475f05SJeff Layton 		struct ceph_osd_client	*osdc = &mdsc->fsc->client->osdc;
430392475f05SJeff Layton 		u32			epoch_barrier;
430492475f05SJeff Layton 
430592475f05SJeff Layton 		ceph_decode_32_safe(&p, end, epoch_barrier, bad);
430692475f05SJeff Layton 		ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
430792475f05SJeff Layton 	}
430892475f05SJeff Layton 
43094985d6f9SYan, Zheng 	if (msg_version >= 8) {
4310779fe0fbSYan, Zheng 		u32 pool_ns_len;
431192475f05SJeff Layton 
43125ea5c5e0SYan, Zheng 		/* version >= 6 */
431306a1ad43SJeff Layton 		ceph_decode_skip_64(&p, end, bad);	// flush_tid
43145ea5c5e0SYan, Zheng 		/* version >= 7 */
431506a1ad43SJeff Layton 		ceph_decode_skip_32(&p, end, bad);	// caller_uid
431606a1ad43SJeff Layton 		ceph_decode_skip_32(&p, end, bad);	// caller_gid
43175ea5c5e0SYan, Zheng 		/* version >= 8 */
43185ea5c5e0SYan, Zheng 		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
4319779fe0fbSYan, Zheng 		if (pool_ns_len > 0) {
4320779fe0fbSYan, Zheng 			ceph_decode_need(&p, end, pool_ns_len, bad);
4321a1c6b835SYan, Zheng 			extra_info.pool_ns =
4322a1c6b835SYan, Zheng 				ceph_find_or_create_string(p, pool_ns_len);
4323779fe0fbSYan, Zheng 			p += pool_ns_len;
4324779fe0fbSYan, Zheng 		}
43255ea5c5e0SYan, Zheng 	}
43265ea5c5e0SYan, Zheng 
4327ec62b894SJeff Layton 	if (msg_version >= 9) {
43284985d6f9SYan, Zheng 		struct ceph_timespec *btime;
43294985d6f9SYan, Zheng 
43304985d6f9SYan, Zheng 		if (p + sizeof(*btime) > end)
43314985d6f9SYan, Zheng 			goto bad;
43324985d6f9SYan, Zheng 		btime = p;
4333ec62b894SJeff Layton 		ceph_decode_timespec64(&extra_info.btime, btime);
43344985d6f9SYan, Zheng 		p += sizeof(*btime);
4335176c77c9SJeff Layton 		ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4336ec62b894SJeff Layton 	}
4337ec62b894SJeff Layton 
4338ec62b894SJeff Layton 	if (msg_version >= 11) {
43394985d6f9SYan, Zheng 		/* version >= 10 */
434006a1ad43SJeff Layton 		ceph_decode_skip_32(&p, end, bad); // flags
43414985d6f9SYan, Zheng 		/* version >= 11 */
43424985d6f9SYan, Zheng 		extra_info.dirstat_valid = true;
43434985d6f9SYan, Zheng 		ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
43444985d6f9SYan, Zheng 		ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
43454985d6f9SYan, Zheng 	}
43464985d6f9SYan, Zheng 
43470d91f0adSJeff Layton 	if (msg_version >= 12) {
43480d91f0adSJeff Layton 		if (parse_fscrypt_fields(&p, end, &extra_info))
43490d91f0adSJeff Layton 			goto bad;
43500d91f0adSJeff Layton 	}
43510d91f0adSJeff Layton 
43526cd3bcadSYan, Zheng 	/* lookup ino */
4353a1c6b835SYan, Zheng 	inode = ceph_find_inode(mdsc->fsc->sb, vino);
43546cd3bcadSYan, Zheng 	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
43556cd3bcadSYan, Zheng 	     vino.snap, inode);
43566cd3bcadSYan, Zheng 
4357a8599bd8SSage Weil 	mutex_lock(&session->s_mutex);
4358a8599bd8SSage Weil 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
4359a8599bd8SSage Weil 	     (unsigned)seq);
4360a8599bd8SSage Weil 
4361a8599bd8SSage Weil 	if (!inode) {
4362a8599bd8SSage Weil 		dout(" i don't have ino %llx\n", vino.ino);
43633d7ded4dSSage Weil 
4364ce72d4e0SXiubo Li 		switch (op) {
4365ce72d4e0SXiubo Li 		case CEPH_CAP_OP_IMPORT:
4366ce72d4e0SXiubo Li 		case CEPH_CAP_OP_REVOKE:
4367ce72d4e0SXiubo Li 		case CEPH_CAP_OP_GRANT:
4368ce72d4e0SXiubo Li 			do_cap_release = true;
4369ce72d4e0SXiubo Li 			break;
4370ce72d4e0SXiubo Li 		default:
4371ce72d4e0SXiubo Li 			break;
4372a096b09aSYan, Zheng 		}
4373fb33c114SJeff Layton 		goto flush_cap_releases;
4374a8599bd8SSage Weil 	}
43751ad3bb28SXiubo Li 	ci = ceph_inode(inode);
4376a8599bd8SSage Weil 
4377a8599bd8SSage Weil 	/* these will work even if we don't have a cap yet */
4378a8599bd8SSage Weil 	switch (op) {
4379a8599bd8SSage Weil 	case CEPH_CAP_OP_FLUSHSNAP_ACK:
4380a1c6b835SYan, Zheng 		handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4381a1c6b835SYan, Zheng 					 h, session);
4382a8599bd8SSage Weil 		goto done;
4383a8599bd8SSage Weil 
4384a8599bd8SSage Weil 	case CEPH_CAP_OP_EXPORT:
438511df2dfbSYan, Zheng 		handle_cap_export(inode, h, peer, session);
438611df2dfbSYan, Zheng 		goto done_unlocked;
4387a8599bd8SSage Weil 
4388a8599bd8SSage Weil 	case CEPH_CAP_OP_IMPORT:
4389982d6011SYan, Zheng 		realm = NULL;
4390982d6011SYan, Zheng 		if (snaptrace_len) {
4391982d6011SYan, Zheng 			down_write(&mdsc->snap_rwsem);
4392a68e564aSXiubo Li 			if (ceph_update_snap_trace(mdsc, snaptrace,
4393982d6011SYan, Zheng 						   snaptrace + snaptrace_len,
4394a68e564aSXiubo Li 						   false, &realm)) {
4395a68e564aSXiubo Li 				up_write(&mdsc->snap_rwsem);
4396a68e564aSXiubo Li 				close_sessions = true;
4397a68e564aSXiubo Li 				goto done;
4398a68e564aSXiubo Li 			}
4399982d6011SYan, Zheng 			downgrade_write(&mdsc->snap_rwsem);
4400982d6011SYan, Zheng 		} else {
4401982d6011SYan, Zheng 			down_read(&mdsc->snap_rwsem);
4402982d6011SYan, Zheng 		}
440378333233SJeff Layton 		spin_lock(&ci->i_ceph_lock);
44044ee6a914SYan, Zheng 		handle_cap_import(mdsc, inode, h, peer, session,
4405a1c6b835SYan, Zheng 				  &cap, &extra_info.issued);
4406a1c6b835SYan, Zheng 		handle_cap_grant(inode, session, cap,
4407a1c6b835SYan, Zheng 				 h, msg->middle, &extra_info);
4408982d6011SYan, Zheng 		if (realm)
4409982d6011SYan, Zheng 			ceph_put_snap_realm(mdsc, realm);
44102cd698beSYan, Zheng 		goto done_unlocked;
4411a8599bd8SSage Weil 	}
4412a8599bd8SSage Weil 
4413a8599bd8SSage Weil 	/* the rest require a cap */
4414be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
4415a1c6b835SYan, Zheng 	cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
4416a8599bd8SSage Weil 	if (!cap) {
44179dbd412fSSage Weil 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
4418a1c6b835SYan, Zheng 		     inode, ceph_ino(inode), ceph_snap(inode),
4419a1c6b835SYan, Zheng 		     session->s_mds);
4420be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
4421ce72d4e0SXiubo Li 		switch (op) {
4422ce72d4e0SXiubo Li 		case CEPH_CAP_OP_REVOKE:
4423ce72d4e0SXiubo Li 		case CEPH_CAP_OP_GRANT:
4424ce72d4e0SXiubo Li 			do_cap_release = true;
4425ce72d4e0SXiubo Li 			break;
4426ce72d4e0SXiubo Li 		default:
4427ce72d4e0SXiubo Li 			break;
4428ce72d4e0SXiubo Li 		}
442921b559deSGreg Farnum 		goto flush_cap_releases;
4430a8599bd8SSage Weil 	}
4431a8599bd8SSage Weil 
4432be655596SSage Weil 	/* note that each of these drops i_ceph_lock for us */
4433a8599bd8SSage Weil 	switch (op) {
4434a8599bd8SSage Weil 	case CEPH_CAP_OP_REVOKE:
4435a8599bd8SSage Weil 	case CEPH_CAP_OP_GRANT:
4436a1c6b835SYan, Zheng 		__ceph_caps_issued(ci, &extra_info.issued);
4437a1c6b835SYan, Zheng 		extra_info.issued |= __ceph_caps_dirty(ci);
4438a1c6b835SYan, Zheng 		handle_cap_grant(inode, session, cap,
4439a1c6b835SYan, Zheng 				 h, msg->middle, &extra_info);
444015637c8bSSage Weil 		goto done_unlocked;
4441a8599bd8SSage Weil 
4442a8599bd8SSage Weil 	case CEPH_CAP_OP_FLUSH_ACK:
4443a1c6b835SYan, Zheng 		handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4444a1c6b835SYan, Zheng 				     h, session, cap);
4445a8599bd8SSage Weil 		break;
4446a8599bd8SSage Weil 
4447a8599bd8SSage Weil 	case CEPH_CAP_OP_TRUNC:
44480d91f0adSJeff Layton 		queue_trunc = handle_cap_trunc(inode, h, session,
44490d91f0adSJeff Layton 						&extra_info);
44507391fba2SJeff Layton 		spin_unlock(&ci->i_ceph_lock);
44517391fba2SJeff Layton 		if (queue_trunc)
44527391fba2SJeff Layton 			ceph_queue_vmtruncate(inode);
4453a8599bd8SSage Weil 		break;
4454a8599bd8SSage Weil 
4455a8599bd8SSage Weil 	default:
4456be655596SSage Weil 		spin_unlock(&ci->i_ceph_lock);
4457a8599bd8SSage Weil 		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
4458a8599bd8SSage Weil 		       ceph_cap_op_name(op));
4459a8599bd8SSage Weil 	}
4460a8599bd8SSage Weil 
4461e3ec8d68SYan, Zheng done:
4462e3ec8d68SYan, Zheng 	mutex_unlock(&session->s_mutex);
4463e3ec8d68SYan, Zheng done_unlocked:
446423c2c76eSJeff Layton 	iput(inode);
44652ad32cf0SJeff Layton out:
4466e3dfcab2SXiubo Li 	ceph_dec_mds_stopping_blocker(mdsc);
4467e3dfcab2SXiubo Li 
44682ad32cf0SJeff Layton 	ceph_put_string(extra_info.pool_ns);
4469a68e564aSXiubo Li 
4470a68e564aSXiubo Li 	/* Defer closing the sessions after s_mutex lock being released */
4471a68e564aSXiubo Li 	if (close_sessions)
4472a68e564aSXiubo Li 		ceph_mdsc_close_sessions(mdsc);
4473a68e564aSXiubo Li 
44740d91f0adSJeff Layton 	kfree(extra_info.fscrypt_auth);
4475e3ec8d68SYan, Zheng 	return;
447621b559deSGreg Farnum 
447721b559deSGreg Farnum flush_cap_releases:
447821b559deSGreg Farnum 	/*
4479745a8e3bSYan, Zheng 	 * send any cap release message to try to move things
448021b559deSGreg Farnum 	 * along for the mds (who clearly thinks we still have this
448121b559deSGreg Farnum 	 * cap).
448221b559deSGreg Farnum 	 */
4483ce72d4e0SXiubo Li 	if (do_cap_release) {
4484ce72d4e0SXiubo Li 		cap = ceph_get_cap(mdsc, NULL);
4485ce72d4e0SXiubo Li 		cap->cap_ino = vino.ino;
4486ce72d4e0SXiubo Li 		cap->queue_release = 1;
4487ce72d4e0SXiubo Li 		cap->cap_id = le64_to_cpu(h->cap_id);
4488ce72d4e0SXiubo Li 		cap->mseq = mseq;
4489ce72d4e0SXiubo Li 		cap->seq = seq;
4490ce72d4e0SXiubo Li 		cap->issue_seq = seq;
4491ce72d4e0SXiubo Li 		spin_lock(&session->s_cap_lock);
4492ce72d4e0SXiubo Li 		__ceph_queue_cap_release(session, cap);
4493ce72d4e0SXiubo Li 		spin_unlock(&session->s_cap_lock);
4494ce72d4e0SXiubo Li 	}
4495e3ec8d68SYan, Zheng 	ceph_flush_cap_releases(mdsc, session);
4496e3ec8d68SYan, Zheng 	goto done;
4497a8599bd8SSage Weil 
4498a8599bd8SSage Weil bad:
4499a8599bd8SSage Weil 	pr_err("ceph_handle_caps: corrupt message\n");
45009ec7cab1SSage Weil 	ceph_msg_dump(msg);
45012ad32cf0SJeff Layton 	goto out;
4502a8599bd8SSage Weil }
4503a8599bd8SSage Weil 
4504a8599bd8SSage Weil /*
4505a8599bd8SSage Weil  * Delayed work handler to process end of delayed cap release LRU list.
4506bf2ba432SLuis Henriques  *
4507bf2ba432SLuis Henriques  * If new caps are added to the list while processing it, these won't get
4508bf2ba432SLuis Henriques  * processed in this run.  In this case, the ci->i_hold_caps_max will be
4509bf2ba432SLuis Henriques  * returned so that the work can be scheduled accordingly.
4510a8599bd8SSage Weil  */
ceph_check_delayed_caps(struct ceph_mds_client * mdsc)4511bf2ba432SLuis Henriques unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4512a8599bd8SSage Weil {
45134b9f2042SYan, Zheng 	struct inode *inode;
4514a8599bd8SSage Weil 	struct ceph_inode_info *ci;
4515bf2ba432SLuis Henriques 	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4516bf2ba432SLuis Henriques 	unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4517bf2ba432SLuis Henriques 	unsigned long loop_start = jiffies;
4518bf2ba432SLuis Henriques 	unsigned long delay = 0;
4519a8599bd8SSage Weil 
4520a8599bd8SSage Weil 	dout("check_delayed_caps\n");
4521a8599bd8SSage Weil 	spin_lock(&mdsc->cap_delay_lock);
4522585d72f3SJeff Layton 	while (!list_empty(&mdsc->cap_delay_list)) {
4523a8599bd8SSage Weil 		ci = list_first_entry(&mdsc->cap_delay_list,
4524a8599bd8SSage Weil 				      struct ceph_inode_info,
4525a8599bd8SSage Weil 				      i_cap_delay_list);
4526bf2ba432SLuis Henriques 		if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4527bf2ba432SLuis Henriques 			dout("%s caps added recently.  Exiting loop", __func__);
4528bf2ba432SLuis Henriques 			delay = ci->i_hold_caps_max;
4529bf2ba432SLuis Henriques 			break;
4530bf2ba432SLuis Henriques 		}
4531a8599bd8SSage Weil 		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
4532a8599bd8SSage Weil 		    time_before(jiffies, ci->i_hold_caps_max))
4533a8599bd8SSage Weil 			break;
4534a8599bd8SSage Weil 		list_del_init(&ci->i_cap_delay_list);
45354b9f2042SYan, Zheng 
4536874c8ca1SDavid Howells 		inode = igrab(&ci->netfs.inode);
45374b9f2042SYan, Zheng 		if (inode) {
4538585d72f3SJeff Layton 			spin_unlock(&mdsc->cap_delay_lock);
45394b9f2042SYan, Zheng 			dout("check_delayed_caps on %p\n", inode);
4540e4b731ccSXiubo Li 			ceph_check_caps(ci, 0);
454123c2c76eSJeff Layton 			iput(inode);
4542585d72f3SJeff Layton 			spin_lock(&mdsc->cap_delay_lock);
45434b9f2042SYan, Zheng 		}
4544a8599bd8SSage Weil 	}
4545a8599bd8SSage Weil 	spin_unlock(&mdsc->cap_delay_lock);
4546bf2ba432SLuis Henriques 
4547bf2ba432SLuis Henriques 	return delay;
4548a8599bd8SSage Weil }
4549a8599bd8SSage Weil 
4550a8599bd8SSage Weil /*
4551afcdaea3SSage Weil  * Flush all dirty caps to the mds
4552afcdaea3SSage Weil  */
flush_dirty_session_caps(struct ceph_mds_session * s)45531cf03a68SJeff Layton static void flush_dirty_session_caps(struct ceph_mds_session *s)
4554afcdaea3SSage Weil {
45551cf03a68SJeff Layton 	struct ceph_mds_client *mdsc = s->s_mdsc;
4556db354052SSage Weil 	struct ceph_inode_info *ci;
4557db354052SSage Weil 	struct inode *inode;
4558afcdaea3SSage Weil 
4559afcdaea3SSage Weil 	dout("flush_dirty_caps\n");
4560afcdaea3SSage Weil 	spin_lock(&mdsc->cap_dirty_lock);
45611cf03a68SJeff Layton 	while (!list_empty(&s->s_cap_dirty)) {
45621cf03a68SJeff Layton 		ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
4563afcdaea3SSage Weil 				      i_dirty_item);
4564874c8ca1SDavid Howells 		inode = &ci->netfs.inode;
456570b666c3SSage Weil 		ihold(inode);
45666407fbb9SJeff Layton 		dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
4567afcdaea3SSage Weil 		spin_unlock(&mdsc->cap_dirty_lock);
45688692969eSJeff Layton 		ceph_wait_on_async_create(inode);
4569e4b731ccSXiubo Li 		ceph_check_caps(ci, CHECK_CAPS_FLUSH);
4570afcdaea3SSage Weil 		iput(inode);
4571afcdaea3SSage Weil 		spin_lock(&mdsc->cap_dirty_lock);
4572afcdaea3SSage Weil 	}
4573afcdaea3SSage Weil 	spin_unlock(&mdsc->cap_dirty_lock);
4574db354052SSage Weil 	dout("flush_dirty_caps done\n");
4575afcdaea3SSage Weil }
4576afcdaea3SSage Weil 
ceph_flush_dirty_caps(struct ceph_mds_client * mdsc)45771cf03a68SJeff Layton void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
45781cf03a68SJeff Layton {
457959b312f3SXiubo Li 	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
45801cf03a68SJeff Layton }
45811cf03a68SJeff Layton 
__ceph_touch_fmode(struct ceph_inode_info * ci,struct ceph_mds_client * mdsc,int fmode)4582719a2514SYan, Zheng void __ceph_touch_fmode(struct ceph_inode_info *ci,
4583719a2514SYan, Zheng 			struct ceph_mds_client *mdsc, int fmode)
4584719a2514SYan, Zheng {
4585719a2514SYan, Zheng 	unsigned long now = jiffies;
4586719a2514SYan, Zheng 	if (fmode & CEPH_FILE_MODE_RD)
4587719a2514SYan, Zheng 		ci->i_last_rd = now;
4588719a2514SYan, Zheng 	if (fmode & CEPH_FILE_MODE_WR)
4589719a2514SYan, Zheng 		ci->i_last_wr = now;
4590719a2514SYan, Zheng 	/* queue periodic check */
4591719a2514SYan, Zheng 	if (fmode &&
4592719a2514SYan, Zheng 	    __ceph_is_any_real_caps(ci) &&
4593719a2514SYan, Zheng 	    list_empty(&ci->i_cap_delay_list))
4594a0d93e32SYan, Zheng 		__cap_delay_requeue(mdsc, ci);
4595719a2514SYan, Zheng }
4596719a2514SYan, Zheng 
ceph_get_fmode(struct ceph_inode_info * ci,int fmode,int count)4597719a2514SYan, Zheng void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4598719a2514SYan, Zheng {
4599874c8ca1SDavid Howells 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
4600719a2514SYan, Zheng 	int bits = (fmode << 1) | 1;
4601973e5245SHu Weiwen 	bool already_opened = false;
46021dd8d470SXiubo Li 	int i;
46031dd8d470SXiubo Li 
46041dd8d470SXiubo Li 	if (count == 1)
46051dd8d470SXiubo Li 		atomic64_inc(&mdsc->metric.opened_files);
46061dd8d470SXiubo Li 
4607719a2514SYan, Zheng 	spin_lock(&ci->i_ceph_lock);
4608719a2514SYan, Zheng 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
46091dd8d470SXiubo Li 		/*
4610973e5245SHu Weiwen 		 * If any of the mode ref is larger than 0,
46111dd8d470SXiubo Li 		 * that means it has been already opened by
46121dd8d470SXiubo Li 		 * others. Just skip checking the PIN ref.
46131dd8d470SXiubo Li 		 */
4614973e5245SHu Weiwen 		if (i && ci->i_nr_by_mode[i])
4615973e5245SHu Weiwen 			already_opened = true;
4616973e5245SHu Weiwen 
4617973e5245SHu Weiwen 		if (bits & (1 << i))
4618973e5245SHu Weiwen 			ci->i_nr_by_mode[i] += count;
4619719a2514SYan, Zheng 	}
46201dd8d470SXiubo Li 
4621973e5245SHu Weiwen 	if (!already_opened)
46221dd8d470SXiubo Li 		percpu_counter_inc(&mdsc->metric.opened_inodes);
4623719a2514SYan, Zheng 	spin_unlock(&ci->i_ceph_lock);
4624719a2514SYan, Zheng }
4625719a2514SYan, Zheng 
4626afcdaea3SSage Weil /*
4627a8599bd8SSage Weil  * Drop open file reference.  If we were the last open file,
4628a8599bd8SSage Weil  * we may need to release capabilities to the MDS (or schedule
4629a8599bd8SSage Weil  * their delayed release).
4630a8599bd8SSage Weil  */
ceph_put_fmode(struct ceph_inode_info * ci,int fmode,int count)4631719a2514SYan, Zheng void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
4632a8599bd8SSage Weil {
4633874c8ca1SDavid Howells 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
4634774a6a11SYan, Zheng 	int bits = (fmode << 1) | 1;
46351dd8d470SXiubo Li 	bool is_closed = true;
46361dd8d470SXiubo Li 	int i;
46371dd8d470SXiubo Li 
46381dd8d470SXiubo Li 	if (count == 1)
46391dd8d470SXiubo Li 		atomic64_dec(&mdsc->metric.opened_files);
46401dd8d470SXiubo Li 
4641be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
4642774a6a11SYan, Zheng 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4643774a6a11SYan, Zheng 		if (bits & (1 << i)) {
4644719a2514SYan, Zheng 			BUG_ON(ci->i_nr_by_mode[i] < count);
4645719a2514SYan, Zheng 			ci->i_nr_by_mode[i] -= count;
4646774a6a11SYan, Zheng 		}
46471dd8d470SXiubo Li 
46481dd8d470SXiubo Li 		/*
46491dd8d470SXiubo Li 		 * If any of the mode ref is not 0 after
46501dd8d470SXiubo Li 		 * decreased, that means it is still opened
46511dd8d470SXiubo Li 		 * by others. Just skip checking the PIN ref.
46521dd8d470SXiubo Li 		 */
46531dd8d470SXiubo Li 		if (i && ci->i_nr_by_mode[i])
46541dd8d470SXiubo Li 			is_closed = false;
4655774a6a11SYan, Zheng 	}
46561dd8d470SXiubo Li 
46571dd8d470SXiubo Li 	if (is_closed)
46581dd8d470SXiubo Li 		percpu_counter_dec(&mdsc->metric.opened_inodes);
4659be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
4660a8599bd8SSage Weil }
4661a8599bd8SSage Weil 
4662a8599bd8SSage Weil /*
4663a452bc06SJeff Layton  * For a soon-to-be unlinked file, drop the LINK caps. If it
46646ef0bc6dSZhi Zhang  * looks like the link count will hit 0, drop any other caps (other
46656ef0bc6dSZhi Zhang  * than PIN) we don't specifically want (due to the file still being
46666ef0bc6dSZhi Zhang  * open).
46676ef0bc6dSZhi Zhang  */
ceph_drop_caps_for_unlink(struct inode * inode)46686ef0bc6dSZhi Zhang int ceph_drop_caps_for_unlink(struct inode *inode)
46696ef0bc6dSZhi Zhang {
46706ef0bc6dSZhi Zhang 	struct ceph_inode_info *ci = ceph_inode(inode);
46716ef0bc6dSZhi Zhang 	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
46726ef0bc6dSZhi Zhang 
46736ef0bc6dSZhi Zhang 	spin_lock(&ci->i_ceph_lock);
46746ef0bc6dSZhi Zhang 	if (inode->i_nlink == 1) {
46756ef0bc6dSZhi Zhang 		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
46766ef0bc6dSZhi Zhang 
46776ef0bc6dSZhi Zhang 		if (__ceph_caps_dirty(ci)) {
46786ef0bc6dSZhi Zhang 			struct ceph_mds_client *mdsc =
4679985b9ee8SXiubo Li 				ceph_inode_to_fs_client(inode)->mdsc;
46806ef0bc6dSZhi Zhang 			__cap_delay_requeue_front(mdsc, ci);
46816ef0bc6dSZhi Zhang 		}
46826ef0bc6dSZhi Zhang 	}
46836ef0bc6dSZhi Zhang 	spin_unlock(&ci->i_ceph_lock);
46846ef0bc6dSZhi Zhang 	return drop;
46856ef0bc6dSZhi Zhang }
46866ef0bc6dSZhi Zhang 
46876ef0bc6dSZhi Zhang /*
4688a8599bd8SSage Weil  * Helpers for embedding cap and dentry lease releases into mds
4689a8599bd8SSage Weil  * requests.
4690a8599bd8SSage Weil  *
4691a8599bd8SSage Weil  * @force is used by dentry_release (below) to force inclusion of a
4692a8599bd8SSage Weil  * record for the directory inode, even when there aren't any caps to
4693a8599bd8SSage Weil  * drop.
4694a8599bd8SSage Weil  */
ceph_encode_inode_release(void ** p,struct inode * inode,int mds,int drop,int unless,int force)4695a8599bd8SSage Weil int ceph_encode_inode_release(void **p, struct inode *inode,
4696a8599bd8SSage Weil 			      int mds, int drop, int unless, int force)
4697a8599bd8SSage Weil {
4698a8599bd8SSage Weil 	struct ceph_inode_info *ci = ceph_inode(inode);
4699a8599bd8SSage Weil 	struct ceph_cap *cap;
4700a8599bd8SSage Weil 	struct ceph_mds_request_release *rel = *p;
4701ec97f88bSSage Weil 	int used, dirty;
4702a8599bd8SSage Weil 	int ret = 0;
4703a8599bd8SSage Weil 
4704be655596SSage Weil 	spin_lock(&ci->i_ceph_lock);
4705916623daSSage Weil 	used = __ceph_caps_used(ci);
4706ec97f88bSSage Weil 	dirty = __ceph_caps_dirty(ci);
4707916623daSSage Weil 
4708ec97f88bSSage Weil 	dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
4709ec97f88bSSage Weil 	     inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
4710916623daSSage Weil 	     ceph_cap_string(unless));
4711916623daSSage Weil 
4712ec97f88bSSage Weil 	/* only drop unused, clean caps */
4713ec97f88bSSage Weil 	drop &= ~(used | dirty);
4714916623daSSage Weil 
4715a8599bd8SSage Weil 	cap = __get_cap_for_mds(ci, mds);
4716a8599bd8SSage Weil 	if (cap && __cap_is_valid(cap)) {
4717222b7f90SYan, Zheng 		unless &= cap->issued;
4718222b7f90SYan, Zheng 		if (unless) {
4719222b7f90SYan, Zheng 			if (unless & CEPH_CAP_AUTH_EXCL)
4720222b7f90SYan, Zheng 				drop &= ~CEPH_CAP_AUTH_SHARED;
4721222b7f90SYan, Zheng 			if (unless & CEPH_CAP_LINK_EXCL)
4722222b7f90SYan, Zheng 				drop &= ~CEPH_CAP_LINK_SHARED;
4723222b7f90SYan, Zheng 			if (unless & CEPH_CAP_XATTR_EXCL)
4724222b7f90SYan, Zheng 				drop &= ~CEPH_CAP_XATTR_SHARED;
4725222b7f90SYan, Zheng 			if (unless & CEPH_CAP_FILE_EXCL)
4726222b7f90SYan, Zheng 				drop &= ~CEPH_CAP_FILE_SHARED;
4727222b7f90SYan, Zheng 		}
4728222b7f90SYan, Zheng 
4729222b7f90SYan, Zheng 		if (force || (cap->issued & drop)) {
4730222b7f90SYan, Zheng 			if (cap->issued & drop) {
4731bb137f84SYan, Zheng 				int wanted = __ceph_caps_wanted(ci);
4732bb137f84SYan, Zheng 				dout("encode_inode_release %p cap %p "
4733bb137f84SYan, Zheng 				     "%s -> %s, wanted %s -> %s\n", inode, cap,
4734a8599bd8SSage Weil 				     ceph_cap_string(cap->issued),
4735bb137f84SYan, Zheng 				     ceph_cap_string(cap->issued & ~drop),
4736bb137f84SYan, Zheng 				     ceph_cap_string(cap->mds_wanted),
4737bb137f84SYan, Zheng 				     ceph_cap_string(wanted));
4738bb137f84SYan, Zheng 
4739a8599bd8SSage Weil 				cap->issued &= ~drop;
4740a8599bd8SSage Weil 				cap->implemented &= ~drop;
4741bb137f84SYan, Zheng 				cap->mds_wanted = wanted;
47426f05b30eSYan, Zheng 				if (cap == ci->i_auth_cap &&
47436f05b30eSYan, Zheng 				    !(wanted & CEPH_CAP_ANY_FILE_WR))
47446f05b30eSYan, Zheng 					ci->i_requested_max_size = 0;
4745a8599bd8SSage Weil 			} else {
4746a8599bd8SSage Weil 				dout("encode_inode_release %p cap %p %s"
4747a8599bd8SSage Weil 				     " (force)\n", inode, cap,
4748a8599bd8SSage Weil 				     ceph_cap_string(cap->issued));
4749a8599bd8SSage Weil 			}
4750a8599bd8SSage Weil 
4751a8599bd8SSage Weil 			rel->ino = cpu_to_le64(ceph_ino(inode));
4752a8599bd8SSage Weil 			rel->cap_id = cpu_to_le64(cap->cap_id);
4753a8599bd8SSage Weil 			rel->seq = cpu_to_le32(cap->seq);
475408a0f24eSHimangi Saraogi 			rel->issue_seq = cpu_to_le32(cap->issue_seq);
4755a8599bd8SSage Weil 			rel->mseq = cpu_to_le32(cap->mseq);
4756fd7b95cdSYan, Zheng 			rel->caps = cpu_to_le32(cap->implemented);
4757a8599bd8SSage Weil 			rel->wanted = cpu_to_le32(cap->mds_wanted);
4758a8599bd8SSage Weil 			rel->dname_len = 0;
4759a8599bd8SSage Weil 			rel->dname_seq = 0;
4760a8599bd8SSage Weil 			*p += sizeof(*rel);
4761a8599bd8SSage Weil 			ret = 1;
4762a8599bd8SSage Weil 		} else {
4763222b7f90SYan, Zheng 			dout("encode_inode_release %p cap %p %s (noop)\n",
4764a8599bd8SSage Weil 			     inode, cap, ceph_cap_string(cap->issued));
4765a8599bd8SSage Weil 		}
4766a8599bd8SSage Weil 	}
4767be655596SSage Weil 	spin_unlock(&ci->i_ceph_lock);
4768a8599bd8SSage Weil 	return ret;
4769a8599bd8SSage Weil }
4770a8599bd8SSage Weil 
47713fd945a7SJeff Layton /**
47723fd945a7SJeff Layton  * ceph_encode_dentry_release - encode a dentry release into an outgoing request
47733fd945a7SJeff Layton  * @p: outgoing request buffer
47743fd945a7SJeff Layton  * @dentry: dentry to release
47753fd945a7SJeff Layton  * @dir: dir to release it from
47763fd945a7SJeff Layton  * @mds: mds that we're speaking to
47773fd945a7SJeff Layton  * @drop: caps being dropped
47783fd945a7SJeff Layton  * @unless: unless we have these caps
47793fd945a7SJeff Layton  *
47803fd945a7SJeff Layton  * Encode a dentry release into an outgoing request buffer. Returns 1 if the
47813fd945a7SJeff Layton  * thing was released, or a negative error code otherwise.
47823fd945a7SJeff Layton  */
ceph_encode_dentry_release(void ** p,struct dentry * dentry,struct inode * dir,int mds,int drop,int unless)4783a8599bd8SSage Weil int ceph_encode_dentry_release(void **p, struct dentry *dentry,
4784ca6c8ae0SJeff Layton 			       struct inode *dir,
4785a8599bd8SSage Weil 			       int mds, int drop, int unless)
4786a8599bd8SSage Weil {
4787a8599bd8SSage Weil 	struct ceph_mds_request_release *rel = *p;
4788a8599bd8SSage Weil 	struct ceph_dentry_info *di = ceph_dentry(dentry);
4789a8599bd8SSage Weil 	int force = 0;
4790a8599bd8SSage Weil 	int ret;
4791a8599bd8SSage Weil 
4792196b87e5SXiubo Li 	/* This shouldn't happen */
4793196b87e5SXiubo Li 	BUG_ON(!dir);
4794196b87e5SXiubo Li 
4795a8599bd8SSage Weil 	/*
4796a8599bd8SSage Weil 	 * force an record for the directory caps if we have a dentry lease.
4797be655596SSage Weil 	 * this is racy (can't take i_ceph_lock and d_lock together), but it
4798a8599bd8SSage Weil 	 * doesn't have to be perfect; the mds will revoke anything we don't
4799a8599bd8SSage Weil 	 * release.
4800a8599bd8SSage Weil 	 */
4801a8599bd8SSage Weil 	spin_lock(&dentry->d_lock);
4802a8599bd8SSage Weil 	if (di->lease_session && di->lease_session->s_mds == mds)
4803a8599bd8SSage Weil 		force = 1;
4804a8599bd8SSage Weil 	spin_unlock(&dentry->d_lock);
4805a8599bd8SSage Weil 
4806ca6c8ae0SJeff Layton 	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
4807a8599bd8SSage Weil 
4808a8599bd8SSage Weil 	spin_lock(&dentry->d_lock);
4809a8599bd8SSage Weil 	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
4810a8599bd8SSage Weil 		dout("encode_dentry_release %p mds%d seq %d\n",
4811a8599bd8SSage Weil 		     dentry, mds, (int)di->lease_seq);
48123fd945a7SJeff Layton 		rel->dname_seq = cpu_to_le32(di->lease_seq);
48133fd945a7SJeff Layton 		__ceph_mdsc_drop_dentry_lease(dentry);
48143fd945a7SJeff Layton 		spin_unlock(&dentry->d_lock);
48153fd945a7SJeff Layton 		if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
48163fd945a7SJeff Layton 			int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
48173fd945a7SJeff Layton 
48183fd945a7SJeff Layton 			if (ret2 < 0)
48193fd945a7SJeff Layton 				return ret2;
48203fd945a7SJeff Layton 
48213fd945a7SJeff Layton 			rel->dname_len = cpu_to_le32(ret2);
48223fd945a7SJeff Layton 			*p += ret2;
48233fd945a7SJeff Layton 		} else {
4824a8599bd8SSage Weil 			rel->dname_len = cpu_to_le32(dentry->d_name.len);
4825a8599bd8SSage Weil 			memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4826a8599bd8SSage Weil 			*p += dentry->d_name.len;
4827a8599bd8SSage Weil 		}
48283fd945a7SJeff Layton 	} else {
4829a8599bd8SSage Weil 		spin_unlock(&dentry->d_lock);
48303fd945a7SJeff Layton 	}
4831a8599bd8SSage Weil 	return ret;
4832a8599bd8SSage Weil }
483336e6da98SJeff Layton 
remove_capsnaps(struct ceph_mds_client * mdsc,struct inode * inode)483436e6da98SJeff Layton static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
483536e6da98SJeff Layton {
483636e6da98SJeff Layton 	struct ceph_inode_info *ci = ceph_inode(inode);
483736e6da98SJeff Layton 	struct ceph_cap_snap *capsnap;
483836e6da98SJeff Layton 	int capsnap_release = 0;
483936e6da98SJeff Layton 
484036e6da98SJeff Layton 	lockdep_assert_held(&ci->i_ceph_lock);
484136e6da98SJeff Layton 
484236e6da98SJeff Layton 	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
484336e6da98SJeff Layton 
484436e6da98SJeff Layton 	while (!list_empty(&ci->i_cap_snaps)) {
484536e6da98SJeff Layton 		capsnap = list_first_entry(&ci->i_cap_snaps,
484636e6da98SJeff Layton 					   struct ceph_cap_snap, ci_item);
484736e6da98SJeff Layton 		__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
484836e6da98SJeff Layton 		ceph_put_snap_context(capsnap->context);
484936e6da98SJeff Layton 		ceph_put_cap_snap(capsnap);
485036e6da98SJeff Layton 		capsnap_release++;
485136e6da98SJeff Layton 	}
485236e6da98SJeff Layton 	wake_up_all(&ci->i_cap_wq);
485336e6da98SJeff Layton 	wake_up_all(&mdsc->cap_flushing_wq);
485436e6da98SJeff Layton 	return capsnap_release;
485536e6da98SJeff Layton }
485636e6da98SJeff Layton 
/*
 * Remove @cap from @inode and, if it was the inode's auth cap, throw away
 * the client-side state that hung off it.
 *
 * The caller must hold ci->i_ceph_lock (checked with lockdep).
 *
 * For a non-auth cap only __ceph_remove_cap() is called and 0 is returned.
 * For the auth cap this additionally:
 *   - on a shut-down inode, sets *@invalidate when the mapping still has
 *     pages and records -EIO on the mapping when i_wrbuffer_ref is
 *     positive (*@invalidate is never cleared here);
 *   - under mdsc->cap_dirty_lock, unlinks every entry of
 *     i_cap_flush_list (freeing the non-capsnap ones) and drops any
 *     dirty / flushing cap state, warning about what is lost;
 *   - records -EIO on the mapping if dirty state was dropped, and
 *     releases i_head_snapc once nothing references it any more;
 *   - flags the inode with CEPH_I_ERROR_FILELOCK if file locks are held;
 *   - frees an unused preallocated cap flush;
 *   - removes all remaining capsnaps.
 *
 * Returns the number of capsnaps removed, plus one if dirty or flushing
 * state was dropped.  NOTE(review): judging by the name "iputs" this is
 * presumably the number of inode references the caller must drop --
 * confirm against the callers.
 */
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap_flush *flush;
	bool dirty_dropped = false;
	bool was_auth;
	int iputs = 0;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("removing cap %p, ci is %p, inode is %p\n",
	     cap, ci, &ci->netfs.inode);

	/* Sample the auth relationship before the cap is torn down. */
	was_auth = (cap == ci->i_auth_cap);
	__ceph_remove_cap(cap, false);
	if (!was_auth)
		return 0;

	if (ceph_inode_is_shutdown(inode)) {
		if (inode->i_data.nrpages > 0)
			*invalidate = true;
		if (ci->i_wrbuffer_ref > 0)
			mapping_set_error(&inode->i_data, -EIO);
	}

	spin_lock(&mdsc->cap_dirty_lock);

	/* trash all of the cap flushes for this inode */
	while (!list_empty(&ci->i_cap_flush_list)) {
		flush = list_first_entry(&ci->i_cap_flush_list,
					 struct ceph_cap_flush, i_list);
		list_del_init(&flush->g_list);
		list_del_init(&flush->i_list);
		/* capsnap flushes are not freed here */
		if (!flush->is_capsnap)
			ceph_free_cap_flush(flush);
	}

	if (!list_empty(&ci->i_dirty_item)) {
		pr_warn_ratelimited(
			" dropping dirty %s state for %p %lld\n",
			ceph_cap_string(ci->i_dirty_caps),
			inode, ceph_ino(inode));
		ci->i_dirty_caps = 0;
		list_del_init(&ci->i_dirty_item);
		dirty_dropped = true;
	}

	if (!list_empty(&ci->i_flushing_item)) {
		pr_warn_ratelimited(
			" dropping dirty+flushing %s state for %p %lld\n",
			ceph_cap_string(ci->i_flushing_caps),
			inode, ceph_ino(inode));
		ci->i_flushing_caps = 0;
		list_del_init(&ci->i_flushing_item);
		mdsc->num_cap_flushing--;
		dirty_dropped = true;
	}

	spin_unlock(&mdsc->cap_dirty_lock);

	if (dirty_dropped) {
		mapping_set_error(inode->i_mapping, -EIO);

		/* Release the head snap context once nothing uses it. */
		if (!(ci->i_wrbuffer_ref_head || ci->i_wr_ref ||
		      ci->i_dirty_caps || ci->i_flushing_caps)) {
			ceph_put_snap_context(ci->i_head_snapc);
			ci->i_head_snapc = NULL;
		}
	}

	if (atomic_read(&ci->i_filelock_ref) > 0) {
		/* make further file lock syscall return -EIO */
		ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
		pr_warn_ratelimited(" dropping file locks for %p %lld\n",
				    inode, ceph_ino(inode));
	}

	/* A preallocated flush is pointless once no caps are dirty. */
	if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
		flush = ci->i_prealloc_cap_flush;
		ci->i_prealloc_cap_flush = NULL;
		if (!flush->is_capsnap)
			ceph_free_cap_flush(flush);
	}

	if (!list_empty(&ci->i_cap_snaps))
		iputs = remove_capsnaps(mdsc, inode);

	return dirty_dropped ? iputs + 1 : iputs;
}
4949