xref: /openbmc/linux/drivers/block/rbd.c (revision 3302ffd4)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
3543df3d35SIlya Dryomov #include <linux/ceph/striper.h>
36602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3782995cc6SDavid Howells #include <linux/fs_parser.h>
3830d1cff8SAlex Elder #include <linux/bsearch.h>
39602adf40SYehuda Sadeh 
40602adf40SYehuda Sadeh #include <linux/kernel.h>
41602adf40SYehuda Sadeh #include <linux/device.h>
42602adf40SYehuda Sadeh #include <linux/module.h>
437ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
44602adf40SYehuda Sadeh #include <linux/fs.h>
45602adf40SYehuda Sadeh #include <linux/blkdev.h>
461c2a9dfeSAlex Elder #include <linux/slab.h>
47f8a22fc2SIlya Dryomov #include <linux/idr.h>
48bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
49602adf40SYehuda Sadeh 
50602adf40SYehuda Sadeh #include "rbd_types.h"
51602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() checks */
#define RBD_DEBUG
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
#define RBD_DRV_NAME	"rbd"

#define RBD_MINORS_PER_MAJOR		256
/* shift used to convert between a device id and a minor number */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* A snapshot name plus the "snap_" prefix must fit in NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN						\
	(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* allows max snapc to fit in 4KB */
#define RBD_MAX_SNAP_COUNT		510

#define RBD_SNAP_HEAD_NAME		"-"

/* invalid index into snap array */
#define BAD_SNAP_INDEX			U32_MAX

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64

/* in seconds */
#define RBD_NOTIFY_TIMEOUT		5
#define RBD_RETRY_DELAY			msecs_to_jiffies(1000)
11299d16943SIlya Dryomov 
/* Feature bits (bit 6 is not defined here) */

#define RBD_FEATURE_LAYERING		(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2		(1ULL << 1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL << 2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL << 3)
#define RBD_FEATURE_FAST_DIFF		(1ULL << 4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL << 5)
#define RBD_FEATURE_DATA_POOL		(1ULL << 7)
#define RBD_FEATURE_OPERATIONS		(1ULL << 8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL						\
	(RBD_FEATURE_LAYERING		|				\
	 RBD_FEATURE_STRIPINGV2		|				\
	 RBD_FEATURE_EXCLUSIVE_LOCK	|				\
	 RBD_FEATURE_OBJECT_MAP		|				\
	 RBD_FEATURE_FAST_DIFF		|				\
	 RBD_FEATURE_DEEP_FLATTEN	|				\
	 RBD_FEATURE_DATA_POOL		|				\
	 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136d889140cSAlex Elder 
/*
 * Size of the buffer holding an RBD device name.  The name has the
 * form "rbd#": "rbd" is RBD_DRV_NAME and # is a unique integer
 * identifier.
 */
#define DEV_NAME_LEN	32
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh  */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
148849b4260SAlex Elder 	char *object_prefix;
149602adf40SYehuda Sadeh 	__u8 obj_order;
150f35a4deeSAlex Elder 	u64 stripe_unit;
151f35a4deeSAlex Elder 	u64 stripe_count;
1527e97332eSIlya Dryomov 	s64 data_pool_id;
153f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
154602adf40SYehuda Sadeh 
155f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder 	u64 image_size;
157f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
159f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh 
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder  * An rbd image specification.
1640d7dbfceSAlex Elder  *
1650d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
170c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
172c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder  * is shared between the parent and child).
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder  *
184c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder  * could be a null pointer).
1860d7dbfceSAlex Elder  */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder 	u64		pool_id;
189ecb4dc22SAlex Elder 	const char	*pool_name;
190b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1910d7dbfceSAlex Elder 
192ecb4dc22SAlex Elder 	const char	*image_id;
193ecb4dc22SAlex Elder 	const char	*image_name;
1940d7dbfceSAlex Elder 
1950d7dbfceSAlex Elder 	u64		snap_id;
196ecb4dc22SAlex Elder 	const char	*snap_name;
1970d7dbfceSAlex Elder 
1980d7dbfceSAlex Elder 	struct kref	kref;
1990d7dbfceSAlex Elder };
2000d7dbfceSAlex Elder 
201602adf40SYehuda Sadeh /*
202f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
203602adf40SYehuda Sadeh  */
204602adf40SYehuda Sadeh struct rbd_client {
205602adf40SYehuda Sadeh 	struct ceph_client	*client;
206602adf40SYehuda Sadeh 	struct kref		kref;
207602adf40SYehuda Sadeh 	struct list_head	node;
208602adf40SYehuda Sadeh };
209602adf40SYehuda Sadeh 
/* Tracks outstanding sub-operations and the first error among them */
struct pending_result {
	int	result;		/* first nonzero result */
	int	num_pending;
};
2140192ce2eSIlya Dryomov 
struct rbd_img_request;

/* Kind of data buffer attached to a request */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	/* pointer into provided bio (list) */
	OBJ_REQUEST_BIO,
	/* pointer into provided bio_vec array */
	OBJ_REQUEST_BVECS,
	/* private bio_vec array, doesn't own pages */
	OBJ_REQUEST_OWN_BVECS,
};
223bf0d5f50SAlex Elder 
/* Operation carried out by a request; numbering starts at 1 */
enum obj_operation_type {
	OBJ_OP_READ	= 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};
2306d2940c8SGuangliang Zhao 
/* Bit values for the flags field of struct rbd_obj_request */
#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
2360ad5d953SIlya Dryomov 
/* States of an object request that performs a read */
enum rbd_obj_read_state {
	RBD_OBJ_READ_START	= 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};
242a9b67e69SIlya Dryomov 
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 *
 * NOTE(review): none of the state names used in the diagram above
 * (RBD_OBJ_WRITE_GUARD, _READ_FROM_PARENT, _COPYUP_EMPTY_SNAPC,
 * _COPYUP_OPS, _FLAT) is a member of the enum below, so the diagram
 * appears to describe an earlier set of states -- confirm against the
 * write state handling code and refresh the diagram.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START		= 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};
276793333a3SIlya Dryomov 
/* States of the copyup portion of an object request */
enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START		= 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
285926f9b3fSAlex Elder 
286bf0d5f50SAlex Elder struct rbd_obj_request {
28743df3d35SIlya Dryomov 	struct ceph_object_extent ex;
2880ad5d953SIlya Dryomov 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
289c5b5ef6cSAlex Elder 	union {
290a9b67e69SIlya Dryomov 		enum rbd_obj_read_state	 read_state;	/* for reads */
2913da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2923da691bfSIlya Dryomov 	};
293bf0d5f50SAlex Elder 
294bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
29586bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
29686bd7998SIlya Dryomov 	u32			num_img_extents;
297bf0d5f50SAlex Elder 
298788e2df3SAlex Elder 	union {
2995359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
300788e2df3SAlex Elder 		struct {
3017e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
3027e07efb1SIlya Dryomov 			u32			bvec_count;
303afb97888SIlya Dryomov 			u32			bvec_idx;
304788e2df3SAlex Elder 		};
305788e2df3SAlex Elder 	};
306793333a3SIlya Dryomov 
307793333a3SIlya Dryomov 	enum rbd_obj_copyup_state copyup_state;
3087e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
3097e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
310bf0d5f50SAlex Elder 
311bcbab1dbSIlya Dryomov 	struct list_head	osd_reqs;	/* w/ r_private_item */
312bf0d5f50SAlex Elder 
31385b5e6d1SIlya Dryomov 	struct mutex		state_mutex;
314793333a3SIlya Dryomov 	struct pending_result	pending;
315bf0d5f50SAlex Elder 	struct kref		kref;
316bf0d5f50SAlex Elder };
317bf0d5f50SAlex Elder 
/* Bit numbers used in the flags word of struct rbd_img_request */
enum img_req_flags {
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
3220c425248SAlex Elder 
/* States of an image request */
enum rbd_img_state {
	RBD_IMG_START			= 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};
3290192ce2eSIlya Dryomov 
330bf0d5f50SAlex Elder struct rbd_img_request {
331bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
3329bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
333ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
3340c425248SAlex Elder 	unsigned long		flags;
3350192ce2eSIlya Dryomov 	enum rbd_img_state	state;
336bf0d5f50SAlex Elder 	union {
337bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3389849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3399849e986SAlex Elder 	};
3409849e986SAlex Elder 	struct rbd_obj_request	*obj_request;	/* obj req initiator */
341bf0d5f50SAlex Elder 
342e1fddc8fSIlya Dryomov 	struct list_head	lock_item;
34343df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
344bf0d5f50SAlex Elder 
3450192ce2eSIlya Dryomov 	struct mutex		state_mutex;
3460192ce2eSIlya Dryomov 	struct pending_result	pending;
3470192ce2eSIlya Dryomov 	struct work_struct	work;
3480192ce2eSIlya Dryomov 	int			work_result;
349bf0d5f50SAlex Elder };
350bf0d5f50SAlex Elder 
/* Iterate over the object requests on an image request's object_extents */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)

/* Same walk using list_for_each_entry_safe(), with @n as the spare cursor */
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
355bf0d5f50SAlex Elder 
/* Values of rbd_device.watch_state; numbering starts at 0 */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
36199d16943SIlya Dryomov 
/* Values of rbd_device.lock_state; numbering starts at 0 */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
367ed95b21aSIlya Dryomov 
368ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
369ed95b21aSIlya Dryomov struct rbd_client_id {
370ed95b21aSIlya Dryomov 	u64 gid;
371ed95b21aSIlya Dryomov 	u64 handle;
372ed95b21aSIlya Dryomov };
373ed95b21aSIlya Dryomov 
374f84344f3SAlex Elder struct rbd_mapping {
37599c1f08fSAlex Elder 	u64                     size;
376f84344f3SAlex Elder };
377f84344f3SAlex Elder 
378602adf40SYehuda Sadeh /*
379602adf40SYehuda Sadeh  * a single device
380602adf40SYehuda Sadeh  */
381602adf40SYehuda Sadeh struct rbd_device {
382de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
383602adf40SYehuda Sadeh 
384602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
385dd82fff1SIlya Dryomov 	int			minor;
386602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
387602adf40SYehuda Sadeh 
388a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
389602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
390602adf40SYehuda Sadeh 
391602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
392602adf40SYehuda Sadeh 
393b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
394602adf40SYehuda Sadeh 
395602adf40SYehuda Sadeh 	struct rbd_image_header	header;
396b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3970d7dbfceSAlex Elder 	struct rbd_spec		*spec;
398d147543dSIlya Dryomov 	struct rbd_options	*opts;
3990d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
400602adf40SYehuda Sadeh 
401c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
402922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
403971f839aSAlex Elder 
4041643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
4050903e875SAlex Elder 
40699d16943SIlya Dryomov 	struct mutex		watch_mutex;
40799d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
408922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
40999d16943SIlya Dryomov 	u64			watch_cookie;
41099d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
41159c2be1eSYehuda Sadeh 
412ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
413ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
414cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
415ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
416ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
417ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
418ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
419ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
420e1fddc8fSIlya Dryomov 	spinlock_t		lock_lists_lock;
421637cd060SIlya Dryomov 	struct list_head	acquiring_list;
422e1fddc8fSIlya Dryomov 	struct list_head	running_list;
423637cd060SIlya Dryomov 	struct completion	acquire_wait;
424637cd060SIlya Dryomov 	int			acquire_err;
425e1fddc8fSIlya Dryomov 	struct completion	releasing_wait;
426ed95b21aSIlya Dryomov 
42722e8bd51SIlya Dryomov 	spinlock_t		object_map_lock;
42822e8bd51SIlya Dryomov 	u8			*object_map;
42922e8bd51SIlya Dryomov 	u64			object_map_size;	/* in objects */
43022e8bd51SIlya Dryomov 	u64			object_map_flags;
431602adf40SYehuda Sadeh 
4321643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
433602adf40SYehuda Sadeh 
43486b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
43586b00e0dSAlex Elder 	u64			parent_overlap;
436a2acd00eSAlex Elder 	atomic_t		parent_ref;
4372f82ee54SAlex Elder 	struct rbd_device	*parent;
43886b00e0dSAlex Elder 
4397ad18afaSChristoph Hellwig 	/* Block layer tags. */
4407ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4417ad18afaSChristoph Hellwig 
442c666601aSJosh Durgin 	/* protects updating the header */
443c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
444f84344f3SAlex Elder 
445f84344f3SAlex Elder 	struct rbd_mapping	mapping;
446602adf40SYehuda Sadeh 
447602adf40SYehuda Sadeh 	struct list_head	node;
448dfc5606dSYehuda Sadeh 
449dfc5606dSYehuda Sadeh 	/* sysfs related */
450dfc5606dSYehuda Sadeh 	struct device		dev;
451b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
452dfc5606dSYehuda Sadeh };
453dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.
 *
 * REMOVING is coupled with rbd_dev->open_count and is protected by
 * rbd_dev->lock.
 */
enum rbd_dev_flags {
	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
	/* -o ro or snapshot */
	RBD_DEV_FLAG_READONLY,
};
4646d292906SAlex Elder 
465cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
466e124a82fSAlex Elder 
467602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
468e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
469e124a82fSAlex Elder 
470602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
471432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
472602adf40SYehuda Sadeh 
47378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
47478c2a44aSAlex Elder 
4751c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
476868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4771c2a9dfeSAlex Elder 
4789b60e70bSIlya Dryomov static int rbd_major;
479f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
480f8a22fc2SIlya Dryomov 
481f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
482f5ee37bdSIlya Dryomov 
48389a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
48489a59c1cSIlya Dryomov 	.nref = REFCOUNT_INIT(1),
48589a59c1cSIlya Dryomov };
48689a59c1cSIlya Dryomov 
4879b60e70bSIlya Dryomov /*
4883cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4899b60e70bSIlya Dryomov  */
4903cfa3b16SIlya Dryomov static bool single_major = true;
4915657a819SJoe Perches module_param(single_major, bool, 0444);
4923cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4939b60e70bSIlya Dryomov 
4947e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
4957e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf,
496f0f8cef5SAlex Elder 			    size_t count);
4977e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
498f0f8cef5SAlex Elder 				      size_t count);
4997e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
5009b60e70bSIlya Dryomov 					 size_t count);
5016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
502f0f8cef5SAlex Elder 
5039b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
5049b60e70bSIlya Dryomov {
5057e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
5069b60e70bSIlya Dryomov }
5079b60e70bSIlya Dryomov 
5089b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
5099b60e70bSIlya Dryomov {
5107e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
5119b60e70bSIlya Dryomov }
5129b60e70bSIlya Dryomov 
51339258aa2SIlya Dryomov static bool rbd_is_ro(struct rbd_device *rbd_dev)
51439258aa2SIlya Dryomov {
51539258aa2SIlya Dryomov 	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
51639258aa2SIlya Dryomov }
51739258aa2SIlya Dryomov 
518f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev)
519f3c0e459SIlya Dryomov {
520f3c0e459SIlya Dryomov 	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
521f3c0e459SIlya Dryomov }
522f3c0e459SIlya Dryomov 
523ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
524ed95b21aSIlya Dryomov {
525637cd060SIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
526637cd060SIlya Dryomov 
527ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
528ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
529ed95b21aSIlya Dryomov }
530ed95b21aSIlya Dryomov 
531ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
532ed95b21aSIlya Dryomov {
533ed95b21aSIlya Dryomov 	bool is_lock_owner;
534ed95b21aSIlya Dryomov 
535ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
536ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
537ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
538ed95b21aSIlya Dryomov 	return is_lock_owner;
539ed95b21aSIlya Dryomov }
540ed95b21aSIlya Dryomov 
5417e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf)
5428767b293SIlya Dryomov {
5438767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5448767b293SIlya Dryomov }
5458767b293SIlya Dryomov 
5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add);
5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove);
5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major);
5497e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major);
5507e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features);
551b15a21ddSGreg Kroah-Hartman 
552b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
553b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
554b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5559b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5569b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5578767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
558b15a21ddSGreg Kroah-Hartman 	NULL,
559f0f8cef5SAlex Elder };
56092c76dc0SIlya Dryomov 
56192c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
56292c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
56392c76dc0SIlya Dryomov {
5649b60e70bSIlya Dryomov 	if (!single_major &&
5659b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5669b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5679b60e70bSIlya Dryomov 		return 0;
5689b60e70bSIlya Dryomov 
56992c76dc0SIlya Dryomov 	return attr->mode;
57092c76dc0SIlya Dryomov }
57192c76dc0SIlya Dryomov 
57292c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
57392c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
57492c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
57592c76dc0SIlya Dryomov };
57692c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
577f0f8cef5SAlex Elder 
578f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
579f0f8cef5SAlex Elder 	.name		= "rbd",
580b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
581f0f8cef5SAlex Elder };
582f0f8cef5SAlex Elder 
583f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
584f0f8cef5SAlex Elder {
585f0f8cef5SAlex Elder }
586f0f8cef5SAlex Elder 
587f0f8cef5SAlex Elder static struct device rbd_root_dev = {
588f0f8cef5SAlex Elder 	.init_name =    "rbd",
589f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
590f0f8cef5SAlex Elder };
591f0f8cef5SAlex Elder 
59206ecc6cbSAlex Elder static __printf(2, 3)
59306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
59406ecc6cbSAlex Elder {
59506ecc6cbSAlex Elder 	struct va_format vaf;
59606ecc6cbSAlex Elder 	va_list args;
59706ecc6cbSAlex Elder 
59806ecc6cbSAlex Elder 	va_start(args, fmt);
59906ecc6cbSAlex Elder 	vaf.fmt = fmt;
60006ecc6cbSAlex Elder 	vaf.va = &args;
60106ecc6cbSAlex Elder 
60206ecc6cbSAlex Elder 	if (!rbd_dev)
60306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
60406ecc6cbSAlex Elder 	else if (rbd_dev->disk)
60506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
60606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
60706ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
60806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
60906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
61006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
61106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
61206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
61306ecc6cbSAlex Elder 	else	/* punt */
61406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
61506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
61606ecc6cbSAlex Elder 	va_end(args);
61706ecc6cbSAlex Elder }
61806ecc6cbSAlex Elder 
/*
 * rbd_assert() - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false @expr logs the enclosing function,
 * the line number and the expression text at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and @expr is
 * not evaluated.
 *
 * The checking variant is wrapped in do { } while (0) so that it is a
 * single statement: a bare "if (...) { }" would pair up with an
 * "else" that follows the macro and cannot be used as the unbraced
 * body of an if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
631dfc5606dSYehuda Sadeh 
/*
 * Forward declarations for functions that are used before their
 * definitions later in this file.
 */
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

/* Header refresh and format 2 metadata helpers */
static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

/* Object and image request handlers */
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
6460192ce2eSIlya Dryomov 
6470192ce2eSIlya Dryomov /*
6480192ce2eSIlya Dryomov  * Return true if nothing else is pending.
6490192ce2eSIlya Dryomov  */
6500192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result)
6510192ce2eSIlya Dryomov {
6520192ce2eSIlya Dryomov 	rbd_assert(pending->num_pending > 0);
6530192ce2eSIlya Dryomov 
6540192ce2eSIlya Dryomov 	if (*result && !pending->result)
6550192ce2eSIlya Dryomov 		pending->result = *result;
6560192ce2eSIlya Dryomov 	if (--pending->num_pending)
6570192ce2eSIlya Dryomov 		return false;
6580192ce2eSIlya Dryomov 
6590192ce2eSIlya Dryomov 	*result = pending->result;
6600192ce2eSIlya Dryomov 	return true;
6610192ce2eSIlya Dryomov }
662602adf40SYehuda Sadeh 
663602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
664602adf40SYehuda Sadeh {
665f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
666b82d167bSAlex Elder 	bool removing = false;
667602adf40SYehuda Sadeh 
668a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
669b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
670b82d167bSAlex Elder 		removing = true;
671b82d167bSAlex Elder 	else
672b82d167bSAlex Elder 		rbd_dev->open_count++;
673a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
674b82d167bSAlex Elder 	if (removing)
675b82d167bSAlex Elder 		return -ENOENT;
676b82d167bSAlex Elder 
677c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
678340c7a2bSAlex Elder 
679602adf40SYehuda Sadeh 	return 0;
680602adf40SYehuda Sadeh }
681602adf40SYehuda Sadeh 
682db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
683dfc5606dSYehuda Sadeh {
684dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
685b82d167bSAlex Elder 	unsigned long open_count_before;
686b82d167bSAlex Elder 
687a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
688b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
689a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
690b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
691dfc5606dSYehuda Sadeh 
692c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
693dfc5606dSYehuda Sadeh }
694dfc5606dSYehuda Sadeh 
/*
 * Block device operations for rbd: only open and release are
 * provided, implemented by rbd_open() and rbd_release().
 */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
700602adf40SYehuda Sadeh 
701602adf40SYehuda Sadeh /*
7027262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
703cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
704602adf40SYehuda Sadeh  */
705f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
706602adf40SYehuda Sadeh {
707602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
708602adf40SYehuda Sadeh 	int ret = -ENOMEM;
709602adf40SYehuda Sadeh 
71037206ee5SAlex Elder 	dout("%s:\n", __func__);
711602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
712602adf40SYehuda Sadeh 	if (!rbdc)
713602adf40SYehuda Sadeh 		goto out_opt;
714602adf40SYehuda Sadeh 
715602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
716602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
717602adf40SYehuda Sadeh 
71874da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
719602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
72008f75463SAlex Elder 		goto out_rbdc;
72143ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
722602adf40SYehuda Sadeh 
723602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
724602adf40SYehuda Sadeh 	if (ret < 0)
72508f75463SAlex Elder 		goto out_client;
726602adf40SYehuda Sadeh 
727432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
728602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
729432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
730602adf40SYehuda Sadeh 
73137206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
732bc534d86SAlex Elder 
733602adf40SYehuda Sadeh 	return rbdc;
73408f75463SAlex Elder out_client:
735602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
73608f75463SAlex Elder out_rbdc:
737602adf40SYehuda Sadeh 	kfree(rbdc);
738602adf40SYehuda Sadeh out_opt:
73943ae4701SAlex Elder 	if (ceph_opts)
74043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
74137206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
74237206ee5SAlex Elder 
74328f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
744602adf40SYehuda Sadeh }
745602adf40SYehuda Sadeh 
7462f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7472f82ee54SAlex Elder {
7482f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7492f82ee54SAlex Elder 
7502f82ee54SAlex Elder 	return rbdc;
7512f82ee54SAlex Elder }
7522f82ee54SAlex Elder 
753602adf40SYehuda Sadeh /*
7541f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7551f7ba331SAlex Elder  * found, bump its reference count.
756602adf40SYehuda Sadeh  */
7571f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
758602adf40SYehuda Sadeh {
759*3302ffd4SJakob Koschel 	struct rbd_client *rbdc = NULL, *iter;
760602adf40SYehuda Sadeh 
76143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
762602adf40SYehuda Sadeh 		return NULL;
763602adf40SYehuda Sadeh 
7641f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
765*3302ffd4SJakob Koschel 	list_for_each_entry(iter, &rbd_client_list, node) {
766*3302ffd4SJakob Koschel 		if (!ceph_compare_options(ceph_opts, iter->client)) {
767*3302ffd4SJakob Koschel 			__rbd_get_client(iter);
7682f82ee54SAlex Elder 
769*3302ffd4SJakob Koschel 			rbdc = iter;
7701f7ba331SAlex Elder 			break;
7711f7ba331SAlex Elder 		}
7721f7ba331SAlex Elder 	}
7731f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7741f7ba331SAlex Elder 
775*3302ffd4SJakob Koschel 	return rbdc;
776602adf40SYehuda Sadeh }
777602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Tokens referenced by the rbd_parameters[] table.  They are grouped
 * by argument kind; the marker comments below close each group, and
 * the tokens after the last marker are used with flag parameters.
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};
79559c2be1eSYehuda Sadeh 
/* Values of the "compression_hint" option, see rbd_param_compression_hint[]. */
enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};
801dc1dad8eSIlya Dryomov 
/*
 * Maps the accepted "compression_hint" strings to their
 * Opt_compression_hint_* values.  The empty entry ends the table.
 */
static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};
808dc1dad8eSIlya Dryomov 
/*
 * Parameter specification for the per-device map options: option name,
 * argument kind and the Opt_* token it maps to.  "ro"/"rw" map to the
 * same tokens as "read_only"/"read_write".  The empty entry ends the
 * table.
 */
static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",			Opt_alloc_size),
	fsparam_enum	("compression_hint",		Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",			Opt_exclusive),
	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
	fsparam_flag	("notrim",			Opt_notrim),
	fsparam_string	("_pool_ns",			Opt_pool_ns),
	fsparam_u32	("queue_depth",			Opt_queue_depth),
	fsparam_flag	("read_only",			Opt_read_only),
	fsparam_flag	("read_write",			Opt_read_write),
	fsparam_flag	("ro",				Opt_read_only),
	fsparam_flag	("rw",				Opt_read_write),
	{}
};
82582995cc6SDavid Howells 
/*
 * Per-device map option values.  The field names mirror the option
 * names in rbd_parameters[]; the RBD_*_DEFAULT macros below look like
 * the matching defaults.
 * NOTE(review): the code that fills this in is not in view - confirm.
 */
struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;	/* units not visible here - TODO confirm */
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;	/* presumably cleared by the "notrim" option - verify */

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};
83798571b5aSAlex Elder 
/* Default values, one per field of struct rbd_options with an RBD_*_DEFAULT */
#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_DEFAULT_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true
84598571b5aSAlex Elder 
/*
 * Bundles the three objects touched while parsing map options.
 * NOTE(review): the parsing code itself is not in view - confirm how
 * each member is filled in.
 */
struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};
851c300156bSIlya Dryomov 
8526d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8536d2940c8SGuangliang Zhao {
8546d2940c8SGuangliang Zhao 	switch (op_type) {
8556d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8566d2940c8SGuangliang Zhao 		return "read";
8576d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8586d2940c8SGuangliang Zhao 		return "write";
85990e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
86090e98c52SGuangliang Zhao 		return "discard";
8616484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
8626484cbe9SIlya Dryomov 		return "zeroout";
8636d2940c8SGuangliang Zhao 	default:
8646d2940c8SGuangliang Zhao 		return "???";
8656d2940c8SGuangliang Zhao 	}
8666d2940c8SGuangliang Zhao }
8676d2940c8SGuangliang Zhao 
86859c2be1eSYehuda Sadeh /*
869602adf40SYehuda Sadeh  * Destroy ceph client
870d23a4b3fSAlex Elder  *
871432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
872602adf40SYehuda Sadeh  */
873602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
874602adf40SYehuda Sadeh {
875602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
876602adf40SYehuda Sadeh 
87737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
878cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
879602adf40SYehuda Sadeh 	list_del(&rbdc->node);
880cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
881602adf40SYehuda Sadeh 
882602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
883602adf40SYehuda Sadeh 	kfree(rbdc);
884602adf40SYehuda Sadeh }
885602adf40SYehuda Sadeh 
886602adf40SYehuda Sadeh /*
887602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
888602adf40SYehuda Sadeh  * it.
889602adf40SYehuda Sadeh  */
8909d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
891602adf40SYehuda Sadeh {
892c53d5893SAlex Elder 	if (rbdc)
8939d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
894602adf40SYehuda Sadeh }
895602adf40SYehuda Sadeh 
8965feb0d8dSIlya Dryomov /*
8975feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
8985feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
8995feb0d8dSIlya Dryomov  * function.
9005feb0d8dSIlya Dryomov  */
9015feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9025feb0d8dSIlya Dryomov {
9035feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
904dd435855SIlya Dryomov 	int ret;
9055feb0d8dSIlya Dryomov 
906a32e4143SIlya Dryomov 	mutex_lock(&client_mutex);
9075feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
908dd435855SIlya Dryomov 	if (rbdc) {
9095feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
910dd435855SIlya Dryomov 
911dd435855SIlya Dryomov 		/*
912dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
913dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
914dd435855SIlya Dryomov 		 */
9159d4a227fSIlya Dryomov 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
9169d4a227fSIlya Dryomov 					rbdc->client->options->mount_timeout);
917dd435855SIlya Dryomov 		if (ret) {
918dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
919dd435855SIlya Dryomov 			rbd_put_client(rbdc);
920dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
921dd435855SIlya Dryomov 		}
922dd435855SIlya Dryomov 	} else {
9235feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
924dd435855SIlya Dryomov 	}
9255feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
9265feb0d8dSIlya Dryomov 
9275feb0d8dSIlya Dryomov 	return rbdc;
9285feb0d8dSIlya Dryomov }
9295feb0d8dSIlya Dryomov 
930a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
931a30b71b9SAlex Elder {
932a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
933a30b71b9SAlex Elder }
934a30b71b9SAlex Elder 
9358e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9368e94af8eSAlex Elder {
937103a150fSAlex Elder 	size_t size;
938103a150fSAlex Elder 	u32 snap_count;
939103a150fSAlex Elder 
940103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
941103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
942103a150fSAlex Elder 		return false;
943103a150fSAlex Elder 
944db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
945db2388b6SAlex Elder 
946db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
947db2388b6SAlex Elder 		return false;
948db2388b6SAlex Elder 
949db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
950db2388b6SAlex Elder 
951db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
952db2388b6SAlex Elder 		return false;
953db2388b6SAlex Elder 
954103a150fSAlex Elder 	/*
955103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
956103a150fSAlex Elder 	 * that limits the number of snapshots.
957103a150fSAlex Elder 	 */
958103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
959103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
960103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
961103a150fSAlex Elder 		return false;
962103a150fSAlex Elder 
963103a150fSAlex Elder 	/*
964103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
965103a150fSAlex Elder 	 * header must also be representable in a size_t.
966103a150fSAlex Elder 	 */
967103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
968103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
969103a150fSAlex Elder 		return false;
970103a150fSAlex Elder 
971103a150fSAlex Elder 	return true;
9728e94af8eSAlex Elder }
9738e94af8eSAlex Elder 
974602adf40SYehuda Sadeh /*
9755bc3fb17SIlya Dryomov  * returns the size of an object in the image
9765bc3fb17SIlya Dryomov  */
9775bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9785bc3fb17SIlya Dryomov {
9795bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
9805bc3fb17SIlya Dryomov }
9815bc3fb17SIlya Dryomov 
982263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
983263423f8SIlya Dryomov {
984263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
985263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
986263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
987263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
988263423f8SIlya Dryomov 	}
989263423f8SIlya Dryomov 
990263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
991263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
992263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
9937e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
9947e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
995263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
996263423f8SIlya Dryomov }
997263423f8SIlya Dryomov 
9985bc3fb17SIlya Dryomov /*
999bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1000bb23e37aSAlex Elder  * on-disk header.
1001602adf40SYehuda Sadeh  */
1002662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10034156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1004602adf40SYehuda Sadeh {
1005662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1006bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1007bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1008bb23e37aSAlex Elder 	char *object_prefix = NULL;
1009bb23e37aSAlex Elder 	char *snap_names = NULL;
1010bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1011ccece235SAlex Elder 	u32 snap_count;
1012bb23e37aSAlex Elder 	int ret = -ENOMEM;
1013621901d6SAlex Elder 	u32 i;
1014602adf40SYehuda Sadeh 
1015bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1016103a150fSAlex Elder 
1017bb23e37aSAlex Elder 	if (first_time) {
1018848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1019848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1020848d796cSIlya Dryomov 					 GFP_KERNEL);
1021bb23e37aSAlex Elder 		if (!object_prefix)
1022602adf40SYehuda Sadeh 			return -ENOMEM;
1023bb23e37aSAlex Elder 	}
102400f1f36fSAlex Elder 
1025bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1026d2bb24e5SAlex Elder 
1027602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1028bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1029bb23e37aSAlex Elder 	if (!snapc)
1030bb23e37aSAlex Elder 		goto out_err;
1031bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1032602adf40SYehuda Sadeh 	if (snap_count) {
1033bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1034f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1035f785cc1dSAlex Elder 
1036bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1037621901d6SAlex Elder 
1038f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1039bb23e37aSAlex Elder 			goto out_2big;
1040bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1041bb23e37aSAlex Elder 		if (!snap_names)
1042602adf40SYehuda Sadeh 			goto out_err;
1043bb23e37aSAlex Elder 
1044bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
104588a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
104688a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
104788a25a5fSMarkus Elfring 					   GFP_KERNEL);
1048bb23e37aSAlex Elder 		if (!snap_sizes)
1049bb23e37aSAlex Elder 			goto out_err;
1050bb23e37aSAlex Elder 
1051f785cc1dSAlex Elder 		/*
1052bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1053bb23e37aSAlex Elder 		 * and size.
1054bb23e37aSAlex Elder 		 *
105599a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1056bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1057f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1058f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1059f785cc1dSAlex Elder 		 */
1060bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1061bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1062bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1063bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1064bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1065bb23e37aSAlex Elder 		}
1066602adf40SYehuda Sadeh 	}
1067849b4260SAlex Elder 
1068bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1069bb23e37aSAlex Elder 
1070bb23e37aSAlex Elder 	if (first_time) {
1071bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1072602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1073263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1074662518b1SAlex Elder 	} else {
1075662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1076662518b1SAlex Elder 		kfree(header->snap_names);
1077662518b1SAlex Elder 		kfree(header->snap_sizes);
1078bb23e37aSAlex Elder 	}
10796a52325fSAlex Elder 
1080bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1081621901d6SAlex Elder 
1082f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1083bb23e37aSAlex Elder 	header->snapc = snapc;
1084bb23e37aSAlex Elder 	header->snap_names = snap_names;
1085bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1086468521c1SAlex Elder 
1087602adf40SYehuda Sadeh 	return 0;
1088bb23e37aSAlex Elder out_2big:
1089bb23e37aSAlex Elder 	ret = -EIO;
10906a52325fSAlex Elder out_err:
1091bb23e37aSAlex Elder 	kfree(snap_sizes);
1092bb23e37aSAlex Elder 	kfree(snap_names);
1093bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1094bb23e37aSAlex Elder 	kfree(object_prefix);
1095ccece235SAlex Elder 
1096bb23e37aSAlex Elder 	return ret;
1097602adf40SYehuda Sadeh }
1098602adf40SYehuda Sadeh 
10999682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11009682fc6dSAlex Elder {
11019682fc6dSAlex Elder 	const char *snap_name;
11029682fc6dSAlex Elder 
11039682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11049682fc6dSAlex Elder 
11059682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11069682fc6dSAlex Elder 
11079682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11089682fc6dSAlex Elder 	while (which--)
11099682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11109682fc6dSAlex Elder 
11119682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11129682fc6dSAlex Elder }
11139682fc6dSAlex Elder 
111430d1cff8SAlex Elder /*
111530d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
111630d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
111730d1cff8SAlex Elder  */
111830d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
111930d1cff8SAlex Elder {
112030d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
112130d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
112230d1cff8SAlex Elder 
112330d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
112430d1cff8SAlex Elder 		return 1;
112530d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
112630d1cff8SAlex Elder }
112730d1cff8SAlex Elder 
112830d1cff8SAlex Elder /*
112930d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
113030d1cff8SAlex Elder  * present.
113130d1cff8SAlex Elder  *
113230d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
113330d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
113430d1cff8SAlex Elder  *
113530d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
113630d1cff8SAlex Elder  * reverse order, highest snapshot id first.
113730d1cff8SAlex Elder  */
11389682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11399682fc6dSAlex Elder {
11409682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
114130d1cff8SAlex Elder 	u64 *found;
11429682fc6dSAlex Elder 
114330d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
114430d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11459682fc6dSAlex Elder 
114630d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11479682fc6dSAlex Elder }
11489682fc6dSAlex Elder 
11492ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11502ad3d716SAlex Elder 					u64 snap_id)
115154cac61fSAlex Elder {
115254cac61fSAlex Elder 	u32 which;
1153da6a6b63SJosh Durgin 	const char *snap_name;
115454cac61fSAlex Elder 
115554cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
115654cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1157da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
115854cac61fSAlex Elder 
1159da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1160da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
116154cac61fSAlex Elder }
116254cac61fSAlex Elder 
11639e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11649e15b77dSAlex Elder {
11659e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11669e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11679e15b77dSAlex Elder 
116854cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
116954cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
117054cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11719e15b77dSAlex Elder 
117254cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11739e15b77dSAlex Elder }
11749e15b77dSAlex Elder 
11752ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11762ad3d716SAlex Elder 				u64 *snap_size)
1177602adf40SYehuda Sadeh {
11782ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11792ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11802ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11812ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11822ad3d716SAlex Elder 		u32 which;
118300f1f36fSAlex Elder 
11842ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11852ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11862ad3d716SAlex Elder 			return -ENOENT;
118700f1f36fSAlex Elder 
11882ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11892ad3d716SAlex Elder 	} else {
11902ad3d716SAlex Elder 		u64 size = 0;
11912ad3d716SAlex Elder 		int ret;
11922ad3d716SAlex Elder 
11932ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11942ad3d716SAlex Elder 		if (ret)
11952ad3d716SAlex Elder 			return ret;
11962ad3d716SAlex Elder 
11972ad3d716SAlex Elder 		*snap_size = size;
11982ad3d716SAlex Elder 	}
11992ad3d716SAlex Elder 	return 0;
12002ad3d716SAlex Elder }
12012ad3d716SAlex Elder 
1202d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1203602adf40SYehuda Sadeh {
12048f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12052ad3d716SAlex Elder 	u64 size = 0;
12062ad3d716SAlex Elder 	int ret;
12078b0241f8SAlex Elder 
12082ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12092ad3d716SAlex Elder 	if (ret)
12102ad3d716SAlex Elder 		return ret;
12112ad3d716SAlex Elder 
12122ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12138b0241f8SAlex Elder 	return 0;
1214602adf40SYehuda Sadeh }
1215602adf40SYehuda Sadeh 
1216d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1217d1cf5788SAlex Elder {
1218d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1219200a6a8bSAlex Elder }
1220200a6a8bSAlex Elder 
12215359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1222b9434c5bSAlex Elder {
12235359a17dSIlya Dryomov 	struct ceph_bio_iter it = *bio_pos;
1224b9434c5bSAlex Elder 
12255359a17dSIlya Dryomov 	ceph_bio_iter_advance(&it, off);
12265359a17dSIlya Dryomov 	ceph_bio_iter_advance_step(&it, bytes, ({
1227732022b8SChristoph Hellwig 		memzero_bvec(&bv);
12285359a17dSIlya Dryomov 	}));
1229b9434c5bSAlex Elder }
1230b9434c5bSAlex Elder 
12317e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1232602adf40SYehuda Sadeh {
12337e07efb1SIlya Dryomov 	struct ceph_bvec_iter it = *bvec_pos;
1234602adf40SYehuda Sadeh 
12357e07efb1SIlya Dryomov 	ceph_bvec_iter_advance(&it, off);
12367e07efb1SIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
1237732022b8SChristoph Hellwig 		memzero_bvec(&bv);
12387e07efb1SIlya Dryomov 	}));
1239602adf40SYehuda Sadeh }
1240602adf40SYehuda Sadeh 
1241f7760dadSAlex Elder /*
12423da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1243afb97888SIlya Dryomov  * (private) bio_vec array.
1244f7760dadSAlex Elder  *
12453da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1246f7760dadSAlex Elder  */
12473da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
12483da691bfSIlya Dryomov 			       u32 bytes)
1249f7760dadSAlex Elder {
125054ab3b24SIlya Dryomov 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
125154ab3b24SIlya Dryomov 
1252ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
12533da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
12543da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
12553da691bfSIlya Dryomov 		break;
12563da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1257afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
12583da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
12593da691bfSIlya Dryomov 		break;
12603da691bfSIlya Dryomov 	default:
126116809372SArnd Bergmann 		BUG();
1262f5400b7aSAlex Elder 	}
1263bf0d5f50SAlex Elder }
1264bf0d5f50SAlex Elder 
1265bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1266bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1267bf0d5f50SAlex Elder {
1268bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
126937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
12702c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1271bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1272bf0d5f50SAlex Elder }
1273bf0d5f50SAlex Elder 
1274bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1275bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1276bf0d5f50SAlex Elder {
127725dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
127825dcf954SAlex Elder 
1279b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1280bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
128115961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1282bf0d5f50SAlex Elder }
1283bf0d5f50SAlex Elder 
1284bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1285bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1286bf0d5f50SAlex Elder {
128715961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
128843df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1289bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1290bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1291bf0d5f50SAlex Elder }
1292bf0d5f50SAlex Elder 
1293a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1294bf0d5f50SAlex Elder {
1295a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1296980917fcSIlya Dryomov 
1297a086a1b8SIlya Dryomov 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1298a086a1b8SIlya Dryomov 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1299a086a1b8SIlya Dryomov 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1300980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1301bf0d5f50SAlex Elder }
1302bf0d5f50SAlex Elder 
13030c425248SAlex Elder /*
13040c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
13050c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
13060c425248SAlex Elder  * and currently never change thereafter.
13070c425248SAlex Elder  */
1308d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1309d0b2e944SAlex Elder {
1310d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1311d0b2e944SAlex Elder }
1312d0b2e944SAlex Elder 
1313d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1314d0b2e944SAlex Elder {
1315d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1316d0b2e944SAlex Elder }
1317d0b2e944SAlex Elder 
13183da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
13193b434a2aSJosh Durgin {
13203da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
13213da691bfSIlya Dryomov 
132243df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
132343df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
13243b434a2aSJosh Durgin }
13253b434a2aSJosh Durgin 
13263da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
13276e2a4505SAlex Elder {
13283da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1329b9434c5bSAlex Elder 
133043df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
13313da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
13326e2a4505SAlex Elder }
13336e2a4505SAlex Elder 
133413488d53SIlya Dryomov /*
133513488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
133613488d53SIlya Dryomov  */
133713488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
133813488d53SIlya Dryomov {
133913488d53SIlya Dryomov 	if (!obj_req->num_img_extents ||
13409b17eb2cSIlya Dryomov 	    (rbd_obj_is_entire(obj_req) &&
13419b17eb2cSIlya Dryomov 	     !obj_req->img_request->snapc->num_snaps))
134213488d53SIlya Dryomov 		return false;
134313488d53SIlya Dryomov 
134413488d53SIlya Dryomov 	return true;
134513488d53SIlya Dryomov }
134613488d53SIlya Dryomov 
134786bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1348bf0d5f50SAlex Elder {
134986bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
135086bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1351bf0d5f50SAlex Elder }
1352bf0d5f50SAlex Elder 
13533da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
13540dcc685eSIlya Dryomov {
13559bb0248dSIlya Dryomov 	switch (img_req->op_type) {
13563da691bfSIlya Dryomov 	case OBJ_OP_READ:
13573da691bfSIlya Dryomov 		return false;
13583da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
13593da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
13606484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
13613da691bfSIlya Dryomov 		return true;
13623da691bfSIlya Dryomov 	default:
1363c6244b3bSArnd Bergmann 		BUG();
13640dcc685eSIlya Dryomov 	}
13650dcc685eSIlya Dryomov }
13660dcc685eSIlya Dryomov 
136785e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1368bf0d5f50SAlex Elder {
13693da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
137054ab3b24SIlya Dryomov 	int result;
1371bf0d5f50SAlex Elder 
13723da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
13733da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
1374bf0d5f50SAlex Elder 
1375c47f9371SAlex Elder 	/*
13763da691bfSIlya Dryomov 	 * Writes aren't allowed to return a data payload.  In some
13773da691bfSIlya Dryomov 	 * guarded write cases (e.g. stat + zero on an empty object)
13783da691bfSIlya Dryomov 	 * a stat response makes it through, but we don't care.
1379c47f9371SAlex Elder 	 */
138054ab3b24SIlya Dryomov 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
138154ab3b24SIlya Dryomov 		result = 0;
138254ab3b24SIlya Dryomov 	else
138354ab3b24SIlya Dryomov 		result = osd_req->r_result;
13840ccd5926SIlya Dryomov 
138554ab3b24SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
1386bf0d5f50SAlex Elder }
1387bf0d5f50SAlex Elder 
1388bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1389430c28c3SAlex Elder {
1390bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
139122d2cfdfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
139222d2cfdfSIlya Dryomov 	struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1393430c28c3SAlex Elder 
139422d2cfdfSIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
13957c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
13969d4df01fSAlex Elder }
13979d4df01fSAlex Elder 
1398bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
13999d4df01fSAlex Elder {
1400bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
14019d4df01fSAlex Elder 
1402a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1403fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
140443df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1405430c28c3SAlex Elder }
1406430c28c3SAlex Elder 
1407bc81207eSIlya Dryomov static struct ceph_osd_request *
1408bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1409bcbab1dbSIlya Dryomov 			  struct ceph_snap_context *snapc, int num_ops)
1410bc81207eSIlya Dryomov {
1411e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1412bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1413bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1414a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1415a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1416bcbab1dbSIlya Dryomov 	int ret;
1417bc81207eSIlya Dryomov 
1418e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1419bc81207eSIlya Dryomov 	if (!req)
1420bcbab1dbSIlya Dryomov 		return ERR_PTR(-ENOMEM);
1421bc81207eSIlya Dryomov 
1422bcbab1dbSIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1423bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1424a162b308SIlya Dryomov 	req->r_priv = obj_req;
1425bc81207eSIlya Dryomov 
1426b26c047bSIlya Dryomov 	/*
1427b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1428b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1429b26c047bSIlya Dryomov 	 */
1430b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1431bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1432b26c047bSIlya Dryomov 
1433bcbab1dbSIlya Dryomov 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1434bcbab1dbSIlya Dryomov 			       rbd_dev->header.object_prefix,
1435bcbab1dbSIlya Dryomov 			       obj_req->ex.oe_objno);
1436bcbab1dbSIlya Dryomov 	if (ret)
1437bcbab1dbSIlya Dryomov 		return ERR_PTR(ret);
1438bc81207eSIlya Dryomov 
1439bc81207eSIlya Dryomov 	return req;
1440bc81207eSIlya Dryomov }
1441bc81207eSIlya Dryomov 
1442e28eded5SIlya Dryomov static struct ceph_osd_request *
1443bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1444e28eded5SIlya Dryomov {
1445bcbab1dbSIlya Dryomov 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1446e28eded5SIlya Dryomov 					 num_ops);
1447e28eded5SIlya Dryomov }
1448e28eded5SIlya Dryomov 
1449ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1450bf0d5f50SAlex Elder {
1451bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1452bf0d5f50SAlex Elder 
14535a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
14546c696d85SIlya Dryomov 	if (!obj_request)
1455f907ad55SAlex Elder 		return NULL;
1456f907ad55SAlex Elder 
145743df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1458bcbab1dbSIlya Dryomov 	INIT_LIST_HEAD(&obj_request->osd_reqs);
145985b5e6d1SIlya Dryomov 	mutex_init(&obj_request->state_mutex);
1460bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1461bf0d5f50SAlex Elder 
146267e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1463bf0d5f50SAlex Elder 	return obj_request;
1464bf0d5f50SAlex Elder }
1465bf0d5f50SAlex Elder 
1466bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1467bf0d5f50SAlex Elder {
1468bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1469bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
14707e07efb1SIlya Dryomov 	u32 i;
1471bf0d5f50SAlex Elder 
1472bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1473bf0d5f50SAlex Elder 
147437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
147537206ee5SAlex Elder 
1476bcbab1dbSIlya Dryomov 	while (!list_empty(&obj_request->osd_reqs)) {
1477bcbab1dbSIlya Dryomov 		osd_req = list_first_entry(&obj_request->osd_reqs,
1478bcbab1dbSIlya Dryomov 				    struct ceph_osd_request, r_private_item);
1479bcbab1dbSIlya Dryomov 		list_del_init(&osd_req->r_private_item);
1480bcbab1dbSIlya Dryomov 		ceph_osdc_put_request(osd_req);
1481bcbab1dbSIlya Dryomov 	}
1482bf0d5f50SAlex Elder 
1483ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
14849969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1485bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
14867e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
14875359a17dSIlya Dryomov 		break;		/* Nothing to do */
1488afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1489afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1490bf0d5f50SAlex Elder 		break;
14917e07efb1SIlya Dryomov 	default:
149216809372SArnd Bergmann 		BUG();
1493bf0d5f50SAlex Elder 	}
1494bf0d5f50SAlex Elder 
149586bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
14967e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
14977e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
14987e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
14997e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
15007e07efb1SIlya Dryomov 		}
15017e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1502bf0d5f50SAlex Elder 	}
1503bf0d5f50SAlex Elder 
1504868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1505bf0d5f50SAlex Elder }
1506bf0d5f50SAlex Elder 
1507fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1508fb65d228SAlex Elder 
1509fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1510fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1511fb65d228SAlex Elder {
1512fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1513fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1514fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1515fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1516fb65d228SAlex Elder }
1517fb65d228SAlex Elder 
1518bf0d5f50SAlex Elder /*
1519a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1520a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1521a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1522a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1523a2acd00eSAlex Elder  */
1524a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1525a2acd00eSAlex Elder {
1526a2acd00eSAlex Elder 	int counter;
1527a2acd00eSAlex Elder 
1528a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1529a2acd00eSAlex Elder 		return;
1530a2acd00eSAlex Elder 
1531a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1532a2acd00eSAlex Elder 	if (counter > 0)
1533a2acd00eSAlex Elder 		return;
1534a2acd00eSAlex Elder 
1535a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1536a2acd00eSAlex Elder 
1537a2acd00eSAlex Elder 	if (!counter)
1538a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1539a2acd00eSAlex Elder 	else
15409584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1541a2acd00eSAlex Elder }
1542a2acd00eSAlex Elder 
1543a2acd00eSAlex Elder /*
1544a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1545a2acd00eSAlex Elder  * parent.
1546a2acd00eSAlex Elder  *
1547a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1548a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1549a2acd00eSAlex Elder  * false otherwise.
1550a2acd00eSAlex Elder  */
1551a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1552a2acd00eSAlex Elder {
1553ae43e9d0SIlya Dryomov 	int counter = 0;
1554a2acd00eSAlex Elder 
1555a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1556a2acd00eSAlex Elder 		return false;
1557a2acd00eSAlex Elder 
1558ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1559a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1560a2acd00eSAlex Elder 
1561a2acd00eSAlex Elder 	if (counter < 0)
15629584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1563a2acd00eSAlex Elder 
1564ae43e9d0SIlya Dryomov 	return counter > 0;
1565a2acd00eSAlex Elder }
1566a2acd00eSAlex Elder 
156759e542c8SIlya Dryomov static void rbd_img_request_init(struct rbd_img_request *img_request,
1568cc344fa1SAlex Elder 				 struct rbd_device *rbd_dev,
1569a52cc685SIlya Dryomov 				 enum obj_operation_type op_type)
1570bf0d5f50SAlex Elder {
157159e542c8SIlya Dryomov 	memset(img_request, 0, sizeof(*img_request));
1572bf0d5f50SAlex Elder 
1573bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
15749bb0248dSIlya Dryomov 	img_request->op_type = op_type;
1575a0c5895bSIlya Dryomov 
1576e1fddc8fSIlya Dryomov 	INIT_LIST_HEAD(&img_request->lock_item);
157743df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
15780192ce2eSIlya Dryomov 	mutex_init(&img_request->state_mutex);
1579bf0d5f50SAlex Elder }
1580bf0d5f50SAlex Elder 
1581a52cc685SIlya Dryomov static void rbd_img_capture_header(struct rbd_img_request *img_req)
1582a52cc685SIlya Dryomov {
1583a52cc685SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
1584a52cc685SIlya Dryomov 
1585a52cc685SIlya Dryomov 	lockdep_assert_held(&rbd_dev->header_rwsem);
1586a52cc685SIlya Dryomov 
1587a52cc685SIlya Dryomov 	if (rbd_img_is_write(img_req))
1588a52cc685SIlya Dryomov 		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1589a52cc685SIlya Dryomov 	else
1590a52cc685SIlya Dryomov 		img_req->snap_id = rbd_dev->spec->snap_id;
1591a52cc685SIlya Dryomov 
1592a52cc685SIlya Dryomov 	if (rbd_dev_parent_get(rbd_dev))
1593a52cc685SIlya Dryomov 		img_request_layered_set(img_req);
1594a52cc685SIlya Dryomov }
1595a52cc685SIlya Dryomov 
1596679a97d2SHannes Reinecke static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1597bf0d5f50SAlex Elder {
1598bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1599bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1600bf0d5f50SAlex Elder 
160137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
160237206ee5SAlex Elder 
1603e1fddc8fSIlya Dryomov 	WARN_ON(!list_empty(&img_request->lock_item));
1604bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1605bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1606bf0d5f50SAlex Elder 
160778b42a87SIlya Dryomov 	if (img_request_layered_test(img_request))
1608a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1609a2acd00eSAlex Elder 
16109bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1611812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1612bf0d5f50SAlex Elder 
161359e542c8SIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
16141c2a9dfeSAlex Elder 		kmem_cache_free(rbd_img_request_cache, img_request);
1615bf0d5f50SAlex Elder }
1616bf0d5f50SAlex Elder 
161722e8bd51SIlya Dryomov #define BITS_PER_OBJ	2
161822e8bd51SIlya Dryomov #define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
161922e8bd51SIlya Dryomov #define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
162022e8bd51SIlya Dryomov 
162122e8bd51SIlya Dryomov static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
162222e8bd51SIlya Dryomov 				   u64 *index, u8 *shift)
162322e8bd51SIlya Dryomov {
162422e8bd51SIlya Dryomov 	u32 off;
162522e8bd51SIlya Dryomov 
162622e8bd51SIlya Dryomov 	rbd_assert(objno < rbd_dev->object_map_size);
162722e8bd51SIlya Dryomov 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
162822e8bd51SIlya Dryomov 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
162922e8bd51SIlya Dryomov }
163022e8bd51SIlya Dryomov 
163122e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
163222e8bd51SIlya Dryomov {
163322e8bd51SIlya Dryomov 	u64 index;
163422e8bd51SIlya Dryomov 	u8 shift;
163522e8bd51SIlya Dryomov 
163622e8bd51SIlya Dryomov 	lockdep_assert_held(&rbd_dev->object_map_lock);
163722e8bd51SIlya Dryomov 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
163822e8bd51SIlya Dryomov 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
163922e8bd51SIlya Dryomov }
164022e8bd51SIlya Dryomov 
164122e8bd51SIlya Dryomov static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
164222e8bd51SIlya Dryomov {
164322e8bd51SIlya Dryomov 	u64 index;
164422e8bd51SIlya Dryomov 	u8 shift;
164522e8bd51SIlya Dryomov 	u8 *p;
164622e8bd51SIlya Dryomov 
164722e8bd51SIlya Dryomov 	lockdep_assert_held(&rbd_dev->object_map_lock);
164822e8bd51SIlya Dryomov 	rbd_assert(!(val & ~OBJ_MASK));
164922e8bd51SIlya Dryomov 
165022e8bd51SIlya Dryomov 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
165122e8bd51SIlya Dryomov 	p = &rbd_dev->object_map[index];
165222e8bd51SIlya Dryomov 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
165322e8bd51SIlya Dryomov }
165422e8bd51SIlya Dryomov 
165522e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
165622e8bd51SIlya Dryomov {
165722e8bd51SIlya Dryomov 	u8 state;
165822e8bd51SIlya Dryomov 
165922e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
166022e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
166122e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
166222e8bd51SIlya Dryomov 	return state;
166322e8bd51SIlya Dryomov }
166422e8bd51SIlya Dryomov 
166522e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev)
166622e8bd51SIlya Dryomov {
16673fe69921SIlya Dryomov 	/*
16683fe69921SIlya Dryomov 	 * An image mapped read-only can't use the object map -- it isn't
16693fe69921SIlya Dryomov 	 * loaded because the header lock isn't acquired.  Someone else can
16703fe69921SIlya Dryomov 	 * write to the image and update the object map behind our back.
16713fe69921SIlya Dryomov 	 *
16723fe69921SIlya Dryomov 	 * A snapshot can't be written to, so using the object map is always
16733fe69921SIlya Dryomov 	 * safe.
16743fe69921SIlya Dryomov 	 */
16753fe69921SIlya Dryomov 	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
16763fe69921SIlya Dryomov 		return false;
16773fe69921SIlya Dryomov 
167822e8bd51SIlya Dryomov 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
167922e8bd51SIlya Dryomov 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
168022e8bd51SIlya Dryomov }
168122e8bd51SIlya Dryomov 
168222e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
168322e8bd51SIlya Dryomov {
168422e8bd51SIlya Dryomov 	u8 state;
168522e8bd51SIlya Dryomov 
168622e8bd51SIlya Dryomov 	/* fall back to default logic if object map is disabled or invalid */
168722e8bd51SIlya Dryomov 	if (!use_object_map(rbd_dev))
168822e8bd51SIlya Dryomov 		return true;
168922e8bd51SIlya Dryomov 
169022e8bd51SIlya Dryomov 	state = rbd_object_map_get(rbd_dev, objno);
169122e8bd51SIlya Dryomov 	return state != OBJECT_NONEXISTENT;
169222e8bd51SIlya Dryomov }
169322e8bd51SIlya Dryomov 
169422e8bd51SIlya Dryomov static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
169522e8bd51SIlya Dryomov 				struct ceph_object_id *oid)
169622e8bd51SIlya Dryomov {
169722e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP)
169822e8bd51SIlya Dryomov 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
169922e8bd51SIlya Dryomov 				rbd_dev->spec->image_id);
170022e8bd51SIlya Dryomov 	else
170122e8bd51SIlya Dryomov 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
170222e8bd51SIlya Dryomov 				rbd_dev->spec->image_id, snap_id);
170322e8bd51SIlya Dryomov }
170422e8bd51SIlya Dryomov 
170522e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev)
170622e8bd51SIlya Dryomov {
170722e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
170822e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
170922e8bd51SIlya Dryomov 	u8 lock_type;
171022e8bd51SIlya Dryomov 	char *lock_tag;
171122e8bd51SIlya Dryomov 	struct ceph_locker *lockers;
171222e8bd51SIlya Dryomov 	u32 num_lockers;
171322e8bd51SIlya Dryomov 	bool broke_lock = false;
171422e8bd51SIlya Dryomov 	int ret;
171522e8bd51SIlya Dryomov 
171622e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
171722e8bd51SIlya Dryomov 
171822e8bd51SIlya Dryomov again:
171922e8bd51SIlya Dryomov 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
172022e8bd51SIlya Dryomov 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
172122e8bd51SIlya Dryomov 	if (ret != -EBUSY || broke_lock) {
172222e8bd51SIlya Dryomov 		if (ret == -EEXIST)
172322e8bd51SIlya Dryomov 			ret = 0; /* already locked by myself */
172422e8bd51SIlya Dryomov 		if (ret)
172522e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
172622e8bd51SIlya Dryomov 		return ret;
172722e8bd51SIlya Dryomov 	}
172822e8bd51SIlya Dryomov 
172922e8bd51SIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
173022e8bd51SIlya Dryomov 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
173122e8bd51SIlya Dryomov 				 &lockers, &num_lockers);
173222e8bd51SIlya Dryomov 	if (ret) {
173322e8bd51SIlya Dryomov 		if (ret == -ENOENT)
173422e8bd51SIlya Dryomov 			goto again;
173522e8bd51SIlya Dryomov 
173622e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
173722e8bd51SIlya Dryomov 		return ret;
173822e8bd51SIlya Dryomov 	}
173922e8bd51SIlya Dryomov 
174022e8bd51SIlya Dryomov 	kfree(lock_tag);
174122e8bd51SIlya Dryomov 	if (num_lockers == 0)
174222e8bd51SIlya Dryomov 		goto again;
174322e8bd51SIlya Dryomov 
174422e8bd51SIlya Dryomov 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
174522e8bd51SIlya Dryomov 		 ENTITY_NAME(lockers[0].id.name));
174622e8bd51SIlya Dryomov 
174722e8bd51SIlya Dryomov 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
174822e8bd51SIlya Dryomov 				  RBD_LOCK_NAME, lockers[0].id.cookie,
174922e8bd51SIlya Dryomov 				  &lockers[0].id.name);
175022e8bd51SIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
175122e8bd51SIlya Dryomov 	if (ret) {
175222e8bd51SIlya Dryomov 		if (ret == -ENOENT)
175322e8bd51SIlya Dryomov 			goto again;
175422e8bd51SIlya Dryomov 
175522e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
175622e8bd51SIlya Dryomov 		return ret;
175722e8bd51SIlya Dryomov 	}
175822e8bd51SIlya Dryomov 
175922e8bd51SIlya Dryomov 	broke_lock = true;
176022e8bd51SIlya Dryomov 	goto again;
176122e8bd51SIlya Dryomov }
176222e8bd51SIlya Dryomov 
176322e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
176422e8bd51SIlya Dryomov {
176522e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
176622e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
176722e8bd51SIlya Dryomov 	int ret;
176822e8bd51SIlya Dryomov 
176922e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
177022e8bd51SIlya Dryomov 
177122e8bd51SIlya Dryomov 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
177222e8bd51SIlya Dryomov 			      "");
177322e8bd51SIlya Dryomov 	if (ret && ret != -ENOENT)
177422e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
177522e8bd51SIlya Dryomov }
177622e8bd51SIlya Dryomov 
177722e8bd51SIlya Dryomov static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
177822e8bd51SIlya Dryomov {
177922e8bd51SIlya Dryomov 	u8 struct_v;
178022e8bd51SIlya Dryomov 	u32 struct_len;
178122e8bd51SIlya Dryomov 	u32 header_len;
178222e8bd51SIlya Dryomov 	void *header_end;
178322e8bd51SIlya Dryomov 	int ret;
178422e8bd51SIlya Dryomov 
178522e8bd51SIlya Dryomov 	ceph_decode_32_safe(p, end, header_len, e_inval);
178622e8bd51SIlya Dryomov 	header_end = *p + header_len;
178722e8bd51SIlya Dryomov 
178822e8bd51SIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
178922e8bd51SIlya Dryomov 				  &struct_len);
179022e8bd51SIlya Dryomov 	if (ret)
179122e8bd51SIlya Dryomov 		return ret;
179222e8bd51SIlya Dryomov 
179322e8bd51SIlya Dryomov 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
179422e8bd51SIlya Dryomov 
179522e8bd51SIlya Dryomov 	*p = header_end;
179622e8bd51SIlya Dryomov 	return 0;
179722e8bd51SIlya Dryomov 
179822e8bd51SIlya Dryomov e_inval:
179922e8bd51SIlya Dryomov 	return -EINVAL;
180022e8bd51SIlya Dryomov }
180122e8bd51SIlya Dryomov 
180222e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev)
180322e8bd51SIlya Dryomov {
180422e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
180522e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
180622e8bd51SIlya Dryomov 	struct page **pages;
180722e8bd51SIlya Dryomov 	void *p, *end;
180822e8bd51SIlya Dryomov 	size_t reply_len;
180922e8bd51SIlya Dryomov 	u64 num_objects;
181022e8bd51SIlya Dryomov 	u64 object_map_bytes;
181122e8bd51SIlya Dryomov 	u64 object_map_size;
181222e8bd51SIlya Dryomov 	int num_pages;
181322e8bd51SIlya Dryomov 	int ret;
181422e8bd51SIlya Dryomov 
181522e8bd51SIlya Dryomov 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
181622e8bd51SIlya Dryomov 
181722e8bd51SIlya Dryomov 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
181822e8bd51SIlya Dryomov 					   rbd_dev->mapping.size);
181922e8bd51SIlya Dryomov 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
182022e8bd51SIlya Dryomov 					    BITS_PER_BYTE);
182122e8bd51SIlya Dryomov 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
182222e8bd51SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
182322e8bd51SIlya Dryomov 	if (IS_ERR(pages))
182422e8bd51SIlya Dryomov 		return PTR_ERR(pages);
182522e8bd51SIlya Dryomov 
182622e8bd51SIlya Dryomov 	reply_len = num_pages * PAGE_SIZE;
182722e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
182822e8bd51SIlya Dryomov 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
182922e8bd51SIlya Dryomov 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
183022e8bd51SIlya Dryomov 			     NULL, 0, pages, &reply_len);
183122e8bd51SIlya Dryomov 	if (ret)
183222e8bd51SIlya Dryomov 		goto out;
183322e8bd51SIlya Dryomov 
183422e8bd51SIlya Dryomov 	p = page_address(pages[0]);
183522e8bd51SIlya Dryomov 	end = p + min(reply_len, (size_t)PAGE_SIZE);
183622e8bd51SIlya Dryomov 	ret = decode_object_map_header(&p, end, &object_map_size);
183722e8bd51SIlya Dryomov 	if (ret)
183822e8bd51SIlya Dryomov 		goto out;
183922e8bd51SIlya Dryomov 
184022e8bd51SIlya Dryomov 	if (object_map_size != num_objects) {
184122e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
184222e8bd51SIlya Dryomov 			 object_map_size, num_objects);
184322e8bd51SIlya Dryomov 		ret = -EINVAL;
184422e8bd51SIlya Dryomov 		goto out;
184522e8bd51SIlya Dryomov 	}
184622e8bd51SIlya Dryomov 
184722e8bd51SIlya Dryomov 	if (offset_in_page(p) + object_map_bytes > reply_len) {
184822e8bd51SIlya Dryomov 		ret = -EINVAL;
184922e8bd51SIlya Dryomov 		goto out;
185022e8bd51SIlya Dryomov 	}
185122e8bd51SIlya Dryomov 
185222e8bd51SIlya Dryomov 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
185322e8bd51SIlya Dryomov 	if (!rbd_dev->object_map) {
185422e8bd51SIlya Dryomov 		ret = -ENOMEM;
185522e8bd51SIlya Dryomov 		goto out;
185622e8bd51SIlya Dryomov 	}
185722e8bd51SIlya Dryomov 
185822e8bd51SIlya Dryomov 	rbd_dev->object_map_size = object_map_size;
185922e8bd51SIlya Dryomov 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
186022e8bd51SIlya Dryomov 				   offset_in_page(p), object_map_bytes);
186122e8bd51SIlya Dryomov 
186222e8bd51SIlya Dryomov out:
186322e8bd51SIlya Dryomov 	ceph_release_page_vector(pages, num_pages);
186422e8bd51SIlya Dryomov 	return ret;
186522e8bd51SIlya Dryomov }
186622e8bd51SIlya Dryomov 
186722e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev)
186822e8bd51SIlya Dryomov {
186922e8bd51SIlya Dryomov 	kvfree(rbd_dev->object_map);
187022e8bd51SIlya Dryomov 	rbd_dev->object_map = NULL;
187122e8bd51SIlya Dryomov 	rbd_dev->object_map_size = 0;
187222e8bd51SIlya Dryomov }
187322e8bd51SIlya Dryomov 
187422e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev)
187522e8bd51SIlya Dryomov {
187622e8bd51SIlya Dryomov 	int ret;
187722e8bd51SIlya Dryomov 
187822e8bd51SIlya Dryomov 	ret = __rbd_object_map_load(rbd_dev);
187922e8bd51SIlya Dryomov 	if (ret)
188022e8bd51SIlya Dryomov 		return ret;
188122e8bd51SIlya Dryomov 
188222e8bd51SIlya Dryomov 	ret = rbd_dev_v2_get_flags(rbd_dev);
188322e8bd51SIlya Dryomov 	if (ret) {
188422e8bd51SIlya Dryomov 		rbd_object_map_free(rbd_dev);
188522e8bd51SIlya Dryomov 		return ret;
188622e8bd51SIlya Dryomov 	}
188722e8bd51SIlya Dryomov 
188822e8bd51SIlya Dryomov 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
188922e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map is invalid");
189022e8bd51SIlya Dryomov 
189122e8bd51SIlya Dryomov 	return 0;
189222e8bd51SIlya Dryomov }
189322e8bd51SIlya Dryomov 
/*
 * Take the object map lock and load the map.  The lock is dropped again
 * if loading fails.
 *
 * Returns 0 on success or a negative error code.
 */
static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int err = rbd_object_map_lock(rbd_dev);

	if (err)
		return err;

	err = rbd_object_map_load(rbd_dev);
	if (err)
		rbd_object_map_unlock(rbd_dev);

	return err;
}
191022e8bd51SIlya Dryomov 
/*
 * Counterpart of rbd_object_map_open(): free the in-memory map first,
 * then release the object map lock.
 */
static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);

	rbd_object_map_unlock(rbd_dev);
}
191622e8bd51SIlya Dryomov 
191722e8bd51SIlya Dryomov /*
191822e8bd51SIlya Dryomov  * This function needs snap_id (or more precisely just something to
191922e8bd51SIlya Dryomov  * distinguish between HEAD and snapshot object maps), new_state and
192022e8bd51SIlya Dryomov  * current_state that were passed to rbd_object_map_update().
192122e8bd51SIlya Dryomov  *
192222e8bd51SIlya Dryomov  * To avoid allocating and stashing a context we piggyback on the OSD
192322e8bd51SIlya Dryomov  * request.  A HEAD update has two ops (assert_locked).  For new_state
192422e8bd51SIlya Dryomov  * and current_state we decode our own object_map_update op, encoded in
192522e8bd51SIlya Dryomov  * rbd_cls_object_map_update().
192622e8bd51SIlya Dryomov  */
192722e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
192822e8bd51SIlya Dryomov 					struct ceph_osd_request *osd_req)
192922e8bd51SIlya Dryomov {
193022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
193122e8bd51SIlya Dryomov 	struct ceph_osd_data *osd_data;
193222e8bd51SIlya Dryomov 	u64 objno;
19333f649ab7SKees Cook 	u8 state, new_state, current_state;
193422e8bd51SIlya Dryomov 	bool has_current_state;
193522e8bd51SIlya Dryomov 	void *p;
193622e8bd51SIlya Dryomov 
193722e8bd51SIlya Dryomov 	if (osd_req->r_result)
193822e8bd51SIlya Dryomov 		return osd_req->r_result;
193922e8bd51SIlya Dryomov 
194022e8bd51SIlya Dryomov 	/*
194122e8bd51SIlya Dryomov 	 * Nothing to do for a snapshot object map.
194222e8bd51SIlya Dryomov 	 */
194322e8bd51SIlya Dryomov 	if (osd_req->r_num_ops == 1)
194422e8bd51SIlya Dryomov 		return 0;
194522e8bd51SIlya Dryomov 
194622e8bd51SIlya Dryomov 	/*
194722e8bd51SIlya Dryomov 	 * Update in-memory HEAD object map.
194822e8bd51SIlya Dryomov 	 */
194922e8bd51SIlya Dryomov 	rbd_assert(osd_req->r_num_ops == 2);
195022e8bd51SIlya Dryomov 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
195122e8bd51SIlya Dryomov 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
195222e8bd51SIlya Dryomov 
195322e8bd51SIlya Dryomov 	p = page_address(osd_data->pages[0]);
195422e8bd51SIlya Dryomov 	objno = ceph_decode_64(&p);
195522e8bd51SIlya Dryomov 	rbd_assert(objno == obj_req->ex.oe_objno);
195622e8bd51SIlya Dryomov 	rbd_assert(ceph_decode_64(&p) == objno + 1);
195722e8bd51SIlya Dryomov 	new_state = ceph_decode_8(&p);
195822e8bd51SIlya Dryomov 	has_current_state = ceph_decode_8(&p);
195922e8bd51SIlya Dryomov 	if (has_current_state)
196022e8bd51SIlya Dryomov 		current_state = ceph_decode_8(&p);
196122e8bd51SIlya Dryomov 
196222e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
196322e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
196422e8bd51SIlya Dryomov 	if (!has_current_state || current_state == state ||
196522e8bd51SIlya Dryomov 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
196622e8bd51SIlya Dryomov 		__rbd_object_map_set(rbd_dev, objno, new_state);
196722e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
196822e8bd51SIlya Dryomov 
196922e8bd51SIlya Dryomov 	return 0;
197022e8bd51SIlya Dryomov }
197122e8bd51SIlya Dryomov 
197222e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
197322e8bd51SIlya Dryomov {
197422e8bd51SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
197522e8bd51SIlya Dryomov 	int result;
197622e8bd51SIlya Dryomov 
197722e8bd51SIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
197822e8bd51SIlya Dryomov 	     osd_req->r_result, obj_req);
197922e8bd51SIlya Dryomov 
198022e8bd51SIlya Dryomov 	result = rbd_object_map_update_finish(obj_req, osd_req);
198122e8bd51SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
198222e8bd51SIlya Dryomov }
198322e8bd51SIlya Dryomov 
198422e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
198522e8bd51SIlya Dryomov {
198622e8bd51SIlya Dryomov 	u8 state = rbd_object_map_get(rbd_dev, objno);
198722e8bd51SIlya Dryomov 
198822e8bd51SIlya Dryomov 	if (state == new_state ||
198922e8bd51SIlya Dryomov 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
199022e8bd51SIlya Dryomov 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
199122e8bd51SIlya Dryomov 		return false;
199222e8bd51SIlya Dryomov 
199322e8bd51SIlya Dryomov 	return true;
199422e8bd51SIlya Dryomov }
199522e8bd51SIlya Dryomov 
199622e8bd51SIlya Dryomov static int rbd_cls_object_map_update(struct ceph_osd_request *req,
199722e8bd51SIlya Dryomov 				     int which, u64 objno, u8 new_state,
199822e8bd51SIlya Dryomov 				     const u8 *current_state)
199922e8bd51SIlya Dryomov {
200022e8bd51SIlya Dryomov 	struct page **pages;
200122e8bd51SIlya Dryomov 	void *p, *start;
200222e8bd51SIlya Dryomov 	int ret;
200322e8bd51SIlya Dryomov 
200422e8bd51SIlya Dryomov 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
200522e8bd51SIlya Dryomov 	if (ret)
200622e8bd51SIlya Dryomov 		return ret;
200722e8bd51SIlya Dryomov 
200822e8bd51SIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
200922e8bd51SIlya Dryomov 	if (IS_ERR(pages))
201022e8bd51SIlya Dryomov 		return PTR_ERR(pages);
201122e8bd51SIlya Dryomov 
201222e8bd51SIlya Dryomov 	p = start = page_address(pages[0]);
201322e8bd51SIlya Dryomov 	ceph_encode_64(&p, objno);
201422e8bd51SIlya Dryomov 	ceph_encode_64(&p, objno + 1);
201522e8bd51SIlya Dryomov 	ceph_encode_8(&p, new_state);
201622e8bd51SIlya Dryomov 	if (current_state) {
201722e8bd51SIlya Dryomov 		ceph_encode_8(&p, 1);
201822e8bd51SIlya Dryomov 		ceph_encode_8(&p, *current_state);
201922e8bd51SIlya Dryomov 	} else {
202022e8bd51SIlya Dryomov 		ceph_encode_8(&p, 0);
202122e8bd51SIlya Dryomov 	}
202222e8bd51SIlya Dryomov 
202322e8bd51SIlya Dryomov 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
202422e8bd51SIlya Dryomov 					  false, true);
202522e8bd51SIlya Dryomov 	return 0;
202622e8bd51SIlya Dryomov }
202722e8bd51SIlya Dryomov 
202822e8bd51SIlya Dryomov /*
202922e8bd51SIlya Dryomov  * Return:
203022e8bd51SIlya Dryomov  *   0 - object map update sent
203122e8bd51SIlya Dryomov  *   1 - object map update isn't needed
203222e8bd51SIlya Dryomov  *  <0 - error
203322e8bd51SIlya Dryomov  */
203422e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
203522e8bd51SIlya Dryomov 				 u8 new_state, const u8 *current_state)
203622e8bd51SIlya Dryomov {
203722e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
203822e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
203922e8bd51SIlya Dryomov 	struct ceph_osd_request *req;
204022e8bd51SIlya Dryomov 	int num_ops = 1;
204122e8bd51SIlya Dryomov 	int which = 0;
204222e8bd51SIlya Dryomov 	int ret;
204322e8bd51SIlya Dryomov 
204422e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
204522e8bd51SIlya Dryomov 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
204622e8bd51SIlya Dryomov 			return 1;
204722e8bd51SIlya Dryomov 
204822e8bd51SIlya Dryomov 		num_ops++; /* assert_locked */
204922e8bd51SIlya Dryomov 	}
205022e8bd51SIlya Dryomov 
205122e8bd51SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
205222e8bd51SIlya Dryomov 	if (!req)
205322e8bd51SIlya Dryomov 		return -ENOMEM;
205422e8bd51SIlya Dryomov 
205522e8bd51SIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
205622e8bd51SIlya Dryomov 	req->r_callback = rbd_object_map_callback;
205722e8bd51SIlya Dryomov 	req->r_priv = obj_req;
205822e8bd51SIlya Dryomov 
205922e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
206022e8bd51SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
206122e8bd51SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_WRITE;
206222e8bd51SIlya Dryomov 	ktime_get_real_ts64(&req->r_mtime);
206322e8bd51SIlya Dryomov 
206422e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
206522e8bd51SIlya Dryomov 		/*
206622e8bd51SIlya Dryomov 		 * Protect against possible race conditions during lock
206722e8bd51SIlya Dryomov 		 * ownership transitions.
206822e8bd51SIlya Dryomov 		 */
206922e8bd51SIlya Dryomov 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
207022e8bd51SIlya Dryomov 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
207122e8bd51SIlya Dryomov 		if (ret)
207222e8bd51SIlya Dryomov 			return ret;
207322e8bd51SIlya Dryomov 	}
207422e8bd51SIlya Dryomov 
207522e8bd51SIlya Dryomov 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
207622e8bd51SIlya Dryomov 					new_state, current_state);
207722e8bd51SIlya Dryomov 	if (ret)
207822e8bd51SIlya Dryomov 		return ret;
207922e8bd51SIlya Dryomov 
208022e8bd51SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
208122e8bd51SIlya Dryomov 	if (ret)
208222e8bd51SIlya Dryomov 		return ret;
208322e8bd51SIlya Dryomov 
208422e8bd51SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
208522e8bd51SIlya Dryomov 	return 0;
208622e8bd51SIlya Dryomov }
208722e8bd51SIlya Dryomov 
208886bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents,
208986bd7998SIlya Dryomov 			  u32 *num_img_extents, u64 overlap)
2090e93f3152SAlex Elder {
209186bd7998SIlya Dryomov 	u32 cnt = *num_img_extents;
2092e93f3152SAlex Elder 
209386bd7998SIlya Dryomov 	/* drop extents completely beyond the overlap */
209486bd7998SIlya Dryomov 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
209586bd7998SIlya Dryomov 		cnt--;
2096e93f3152SAlex Elder 
209786bd7998SIlya Dryomov 	if (cnt) {
209886bd7998SIlya Dryomov 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2099e93f3152SAlex Elder 
210086bd7998SIlya Dryomov 		/* trim final overlapping extent */
210186bd7998SIlya Dryomov 		if (ex->fe_off + ex->fe_len > overlap)
210286bd7998SIlya Dryomov 			ex->fe_len = overlap - ex->fe_off;
2103e93f3152SAlex Elder 	}
2104e93f3152SAlex Elder 
210586bd7998SIlya Dryomov 	*num_img_extents = cnt;
210686bd7998SIlya Dryomov }
210786bd7998SIlya Dryomov 
210886bd7998SIlya Dryomov /*
210986bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
211086bd7998SIlya Dryomov  * or the entire object in the parent image.
211186bd7998SIlya Dryomov  */
211286bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
211386bd7998SIlya Dryomov 				    bool entire)
2114e93f3152SAlex Elder {
211586bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2116c5b5ef6cSAlex Elder 	int ret;
2117c5b5ef6cSAlex Elder 
211886bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
211986bd7998SIlya Dryomov 		return 0;
212086bd7998SIlya Dryomov 
212186bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
212286bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
212386bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
212486bd7998SIlya Dryomov 							obj_req->ex.oe_len,
212586bd7998SIlya Dryomov 				  &obj_req->img_extents,
212686bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
212786bd7998SIlya Dryomov 	if (ret)
212886bd7998SIlya Dryomov 		return ret;
212986bd7998SIlya Dryomov 
213086bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
213186bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
213286bd7998SIlya Dryomov 	return 0;
213386bd7998SIlya Dryomov }
213486bd7998SIlya Dryomov 
2135bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
21363da691bfSIlya Dryomov {
2137bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2138bcbab1dbSIlya Dryomov 
2139ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
21403da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
2141bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, which,
21423da691bfSIlya Dryomov 					       &obj_req->bio_pos,
214343df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
21443da691bfSIlya Dryomov 		break;
21453da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
2146afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
21473da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
214843df3d35SIlya Dryomov 							obj_req->ex.oe_len);
2149afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2150bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
21513da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
21523da691bfSIlya Dryomov 		break;
21533da691bfSIlya Dryomov 	default:
215416809372SArnd Bergmann 		BUG();
21553da691bfSIlya Dryomov 	}
21563da691bfSIlya Dryomov }
21573da691bfSIlya Dryomov 
2158bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
21593da691bfSIlya Dryomov {
21603da691bfSIlya Dryomov 	struct page **pages;
21613da691bfSIlya Dryomov 
2162c5b5ef6cSAlex Elder 	/*
2163c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2164c5b5ef6cSAlex Elder 	 *     le64 length;
2165c5b5ef6cSAlex Elder 	 *     struct {
2166c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2167c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2168c5b5ef6cSAlex Elder 	 *     } mtime;
2169c5b5ef6cSAlex Elder 	 */
21703da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
21713da691bfSIlya Dryomov 	if (IS_ERR(pages))
21723da691bfSIlya Dryomov 		return PTR_ERR(pages);
21733da691bfSIlya Dryomov 
2174bcbab1dbSIlya Dryomov 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2175bcbab1dbSIlya Dryomov 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
21763da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
21773da691bfSIlya Dryomov 				     0, false, true);
21783da691bfSIlya Dryomov 	return 0;
2179710214e3SIlya Dryomov }
2180c5b5ef6cSAlex Elder 
2181b5ae8cbcSIlya Dryomov static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2182b5ae8cbcSIlya Dryomov 				u32 bytes)
218313488d53SIlya Dryomov {
2184b5ae8cbcSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2185b5ae8cbcSIlya Dryomov 	int ret;
2186b5ae8cbcSIlya Dryomov 
2187b5ae8cbcSIlya Dryomov 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2188b5ae8cbcSIlya Dryomov 	if (ret)
2189b5ae8cbcSIlya Dryomov 		return ret;
2190b5ae8cbcSIlya Dryomov 
2191b5ae8cbcSIlya Dryomov 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2192b5ae8cbcSIlya Dryomov 					  obj_req->copyup_bvec_count, bytes);
2193b5ae8cbcSIlya Dryomov 	return 0;
219413488d53SIlya Dryomov }
219513488d53SIlya Dryomov 
2196ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
21973da691bfSIlya Dryomov {
2198ea9b743cSIlya Dryomov 	obj_req->read_state = RBD_OBJ_READ_START;
2199ea9b743cSIlya Dryomov 	return 0;
2200ea9b743cSIlya Dryomov }
2201ea9b743cSIlya Dryomov 
2202bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2203bcbab1dbSIlya Dryomov 				      int which)
22043da691bfSIlya Dryomov {
2205bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
22063da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
22073da691bfSIlya Dryomov 	u16 opcode;
2208c5b5ef6cSAlex Elder 
22098b5bec5cSIlya Dryomov 	if (!use_object_map(rbd_dev) ||
22108b5bec5cSIlya Dryomov 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2211bcbab1dbSIlya Dryomov 		osd_req_op_alloc_hint_init(osd_req, which++,
22123da691bfSIlya Dryomov 					   rbd_dev->layout.object_size,
2213d3798accSIlya Dryomov 					   rbd_dev->layout.object_size,
2214dc1dad8eSIlya Dryomov 					   rbd_dev->opts->alloc_hint_flags);
22158b5bec5cSIlya Dryomov 	}
2216c5b5ef6cSAlex Elder 
22173da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
22183da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
22193da691bfSIlya Dryomov 	else
22203da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
2221c5b5ef6cSAlex Elder 
2222bcbab1dbSIlya Dryomov 	osd_req_op_extent_init(osd_req, which, opcode,
222343df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2224bcbab1dbSIlya Dryomov 	rbd_osd_setup_data(osd_req, which);
22253da691bfSIlya Dryomov }
22263da691bfSIlya Dryomov 
2227ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
22283da691bfSIlya Dryomov {
22293da691bfSIlya Dryomov 	int ret;
22303da691bfSIlya Dryomov 
223186bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
223286bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
223386bd7998SIlya Dryomov 	if (ret)
223486bd7998SIlya Dryomov 		return ret;
223586bd7998SIlya Dryomov 
22360ad5d953SIlya Dryomov 	if (rbd_obj_copyup_enabled(obj_req))
22370ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
22383da691bfSIlya Dryomov 
223985b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
22403da691bfSIlya Dryomov 	return 0;
224170d045f6SIlya Dryomov }
224270d045f6SIlya Dryomov 
22436484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
22446484cbe9SIlya Dryomov {
22456484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
22466484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
22476484cbe9SIlya Dryomov }
22486484cbe9SIlya Dryomov 
224927bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
225027bbd911SIlya Dryomov 					int which)
225127bbd911SIlya Dryomov {
225227bbd911SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
225327bbd911SIlya Dryomov 
225427bbd911SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
225527bbd911SIlya Dryomov 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
225627bbd911SIlya Dryomov 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
225727bbd911SIlya Dryomov 	} else {
225827bbd911SIlya Dryomov 		osd_req_op_extent_init(osd_req, which,
225927bbd911SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
226027bbd911SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
226127bbd911SIlya Dryomov 				       0, 0);
226227bbd911SIlya Dryomov 	}
226327bbd911SIlya Dryomov }
226427bbd911SIlya Dryomov 
2265ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
22666484cbe9SIlya Dryomov {
22670c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
226827bbd911SIlya Dryomov 	u64 off, next_off;
22696484cbe9SIlya Dryomov 	int ret;
22706484cbe9SIlya Dryomov 
22710c93e1b7SIlya Dryomov 	/*
22720c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
22730c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
22740c93e1b7SIlya Dryomov 	 *
22750c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
22760c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
22770c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
22780c93e1b7SIlya Dryomov 	 */
22790c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
22800c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
228127bbd911SIlya Dryomov 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
228227bbd911SIlya Dryomov 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
228327bbd911SIlya Dryomov 				      rbd_dev->opts->alloc_size);
22840c93e1b7SIlya Dryomov 		if (off >= next_off)
22850c93e1b7SIlya Dryomov 			return 1;
228627bbd911SIlya Dryomov 
228727bbd911SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
228827bbd911SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
228927bbd911SIlya Dryomov 		     off, next_off - off);
229027bbd911SIlya Dryomov 		obj_req->ex.oe_off = off;
229127bbd911SIlya Dryomov 		obj_req->ex.oe_len = next_off - off;
22920c93e1b7SIlya Dryomov 	}
22930c93e1b7SIlya Dryomov 
22946484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
22956484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
22966484cbe9SIlya Dryomov 	if (ret)
22976484cbe9SIlya Dryomov 		return ret;
22986484cbe9SIlya Dryomov 
229922e8bd51SIlya Dryomov 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23000ad5d953SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
23010ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23026484cbe9SIlya Dryomov 
230385b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
23046484cbe9SIlya Dryomov 	return 0;
23056484cbe9SIlya Dryomov }
23066484cbe9SIlya Dryomov 
2307bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2308bcbab1dbSIlya Dryomov 					int which)
230913488d53SIlya Dryomov {
2310bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
23113da691bfSIlya Dryomov 	u16 opcode;
2312058aa991SIlya Dryomov 
23133da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
231486bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
23150ad5d953SIlya Dryomov 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2316bcbab1dbSIlya Dryomov 				osd_req_op_init(osd_req, which++,
23172bb1e56eSIlya Dryomov 						CEPH_OSD_OP_CREATE, 0);
23183da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
23193da691bfSIlya Dryomov 		} else {
23200ad5d953SIlya Dryomov 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2321bcbab1dbSIlya Dryomov 			osd_req_op_init(osd_req, which++,
23223da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
23233da691bfSIlya Dryomov 			opcode = 0;
23243da691bfSIlya Dryomov 		}
23253da691bfSIlya Dryomov 	} else {
23266484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
23273da691bfSIlya Dryomov 	}
23283da691bfSIlya Dryomov 
23293da691bfSIlya Dryomov 	if (opcode)
2330bcbab1dbSIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode,
233143df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
23323da691bfSIlya Dryomov 				       0, 0);
23333da691bfSIlya Dryomov }
23343da691bfSIlya Dryomov 
2335ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
23363da691bfSIlya Dryomov {
23373da691bfSIlya Dryomov 	int ret;
23383da691bfSIlya Dryomov 
233986bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
234086bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
234186bd7998SIlya Dryomov 	if (ret)
234286bd7998SIlya Dryomov 		return ret;
234386bd7998SIlya Dryomov 
23440ad5d953SIlya Dryomov 	if (rbd_obj_copyup_enabled(obj_req))
23450ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
23460ad5d953SIlya Dryomov 	if (!obj_req->num_img_extents) {
234722e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23480ad5d953SIlya Dryomov 		if (rbd_obj_is_entire(obj_req))
23490ad5d953SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23503da691bfSIlya Dryomov 	}
23513da691bfSIlya Dryomov 
235285b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
2353980917fcSIlya Dryomov 	return 0;
2354b454e36dSAlex Elder }
2355b454e36dSAlex Elder 
2356a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req)
2357a086a1b8SIlya Dryomov {
23588b5bec5cSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
23598b5bec5cSIlya Dryomov 
23608b5bec5cSIlya Dryomov 	switch (img_req->op_type) {
2361a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
23628b5bec5cSIlya Dryomov 		if (!use_object_map(img_req->rbd_dev) ||
23638b5bec5cSIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2364a086a1b8SIlya Dryomov 			return 2; /* setallochint + write/writefull */
23658b5bec5cSIlya Dryomov 
23668b5bec5cSIlya Dryomov 		return 1; /* write/writefull */
2367a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2368a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2369a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2370a086a1b8SIlya Dryomov 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2371a086a1b8SIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2372a086a1b8SIlya Dryomov 			return 2; /* create + truncate */
2373a086a1b8SIlya Dryomov 
2374a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2375a086a1b8SIlya Dryomov 	default:
2376a086a1b8SIlya Dryomov 		BUG();
2377a086a1b8SIlya Dryomov 	}
2378a086a1b8SIlya Dryomov }
2379a086a1b8SIlya Dryomov 
2380a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2381a086a1b8SIlya Dryomov 				    int which)
2382a086a1b8SIlya Dryomov {
2383a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2384a086a1b8SIlya Dryomov 
2385a086a1b8SIlya Dryomov 	switch (obj_req->img_request->op_type) {
2386a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
2387a086a1b8SIlya Dryomov 		__rbd_osd_setup_write_ops(osd_req, which);
2388a086a1b8SIlya Dryomov 		break;
2389a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2390a086a1b8SIlya Dryomov 		__rbd_osd_setup_discard_ops(osd_req, which);
2391a086a1b8SIlya Dryomov 		break;
2392a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2393a086a1b8SIlya Dryomov 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2394a086a1b8SIlya Dryomov 		break;
2395a086a1b8SIlya Dryomov 	default:
2396a086a1b8SIlya Dryomov 		BUG();
2397a086a1b8SIlya Dryomov 	}
2398a086a1b8SIlya Dryomov }
2399a086a1b8SIlya Dryomov 
2400b454e36dSAlex Elder /*
2401a086a1b8SIlya Dryomov  * Prune the list of object requests (adjust offset and/or length, drop
2402a086a1b8SIlya Dryomov  * redundant requests).  Prepare object request state machines and image
2403a086a1b8SIlya Dryomov  * request state machine for execution.
2404b454e36dSAlex Elder  */
24053da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
24063da691bfSIlya Dryomov {
24070c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
24083da691bfSIlya Dryomov 	int ret;
24093d7efd18SAlex Elder 
24100c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
24119bb0248dSIlya Dryomov 		switch (img_req->op_type) {
24123da691bfSIlya Dryomov 		case OBJ_OP_READ:
2413ea9b743cSIlya Dryomov 			ret = rbd_obj_init_read(obj_req);
24143da691bfSIlya Dryomov 			break;
24153da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
2416ea9b743cSIlya Dryomov 			ret = rbd_obj_init_write(obj_req);
24173da691bfSIlya Dryomov 			break;
24183da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
2419ea9b743cSIlya Dryomov 			ret = rbd_obj_init_discard(obj_req);
24203da691bfSIlya Dryomov 			break;
24216484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
2422ea9b743cSIlya Dryomov 			ret = rbd_obj_init_zeroout(obj_req);
24236484cbe9SIlya Dryomov 			break;
24243da691bfSIlya Dryomov 		default:
242516809372SArnd Bergmann 			BUG();
24263da691bfSIlya Dryomov 		}
24270c93e1b7SIlya Dryomov 		if (ret < 0)
24283da691bfSIlya Dryomov 			return ret;
24290c93e1b7SIlya Dryomov 		if (ret > 0) {
24300c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
24310c93e1b7SIlya Dryomov 			continue;
24320c93e1b7SIlya Dryomov 		}
2433b454e36dSAlex Elder 	}
2434b454e36dSAlex Elder 
24350192ce2eSIlya Dryomov 	img_req->state = RBD_IMG_START;
24363da691bfSIlya Dryomov 	return 0;
24373da691bfSIlya Dryomov }
24383da691bfSIlya Dryomov 
/*
 * Position within the caller's data buffer while an image request is
 * being filled.  Which member applies presumably follows the fill
 * context's ->pos_type (bio vs. bio_vec array) -- TODO confirm against
 * the set_pos/count/copy callbacks.
 */
union rbd_img_fill_iter {
	struct ceph_bio_iter	bio_iter;
	struct ceph_bvec_iter	bvec_iter;
};
24435a237819SIlya Dryomov 
/*
 * Parameters for filling an image request with object requests.
 */
struct rbd_img_fill_ctx {
	/* becomes img_req->data_type on the nocopy path */
	enum obj_request_type	pos_type;
	/* caller's starting position; copied into ->iter before mapping */
	union rbd_img_fill_iter	*pos;
	/* working cursor passed as the argument to the callbacks */
	union rbd_img_fill_iter	iter;
	/* nocopy path: per-extent action given to ceph_file_to_extents() */
	ceph_object_extent_fn_t	set_pos_fn;
	/*
	 * NOTE(review): count_fn and copy_fn are presumably used by the
	 * OBJ_REQUEST_OWN_BVECS path to count and then copy bio_vecs per
	 * object request -- confirm in rbd_img_fill_request().
	 */
	ceph_object_extent_fn_t	count_fn;
	ceph_object_extent_fn_t	copy_fn;
};
24525a237819SIlya Dryomov 
24535a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
24545a237819SIlya Dryomov {
24555a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
24565a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
24575a237819SIlya Dryomov 
24585a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
24595a237819SIlya Dryomov 	if (!obj_req)
24605a237819SIlya Dryomov 		return NULL;
24615a237819SIlya Dryomov 
24625a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
24635a237819SIlya Dryomov 	return &obj_req->ex;
24645a237819SIlya Dryomov }
24655a237819SIlya Dryomov 
24665a237819SIlya Dryomov /*
2467afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2468afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2469afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2470afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2471afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
24725a237819SIlya Dryomov  */
2473afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2474afb97888SIlya Dryomov {
2475afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2476afb97888SIlya Dryomov }
2477afb97888SIlya Dryomov 
2478afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
24795a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
24805a237819SIlya Dryomov 				       u32 num_img_extents,
24815a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
24825a237819SIlya Dryomov {
24835a237819SIlya Dryomov 	u32 i;
24845a237819SIlya Dryomov 	int ret;
24855a237819SIlya Dryomov 
24865a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
24875a237819SIlya Dryomov 
24885a237819SIlya Dryomov 	/*
24895a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
24905a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
24915a237819SIlya Dryomov 	 */
24925a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
24935a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
24945a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
24955a237819SIlya Dryomov 					   img_extents[i].fe_off,
24965a237819SIlya Dryomov 					   img_extents[i].fe_len,
24975a237819SIlya Dryomov 					   &img_req->object_extents,
24985a237819SIlya Dryomov 					   alloc_object_extent, img_req,
24995a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
25005a237819SIlya Dryomov 		if (ret)
25015a237819SIlya Dryomov 			return ret;
25025a237819SIlya Dryomov 	}
25035a237819SIlya Dryomov 
25045a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
25055a237819SIlya Dryomov }
25065a237819SIlya Dryomov 
2507afb97888SIlya Dryomov /*
2508afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2509afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2510afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2511afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2512afb97888SIlya Dryomov  * @fctx->pos data buffer.
2513afb97888SIlya Dryomov  *
2514afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2515afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2516afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2517afb97888SIlya Dryomov  *
2518afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
2519afb97888SIlya Dryomov  */
2520afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2521afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2522afb97888SIlya Dryomov 				u32 num_img_extents,
2523afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2524afb97888SIlya Dryomov {
2525afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2526afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2527afb97888SIlya Dryomov 	u32 i;
2528afb97888SIlya Dryomov 	int ret;
2529afb97888SIlya Dryomov 
2530afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2531afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2532afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2533afb97888SIlya Dryomov 						   num_img_extents, fctx);
2534afb97888SIlya Dryomov 
2535afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2536afb97888SIlya Dryomov 
2537afb97888SIlya Dryomov 	/*
2538afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2539afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2540afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2541afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2542afb97888SIlya Dryomov 	 * stripe unit boundaries.
2543afb97888SIlya Dryomov 	 */
2544afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2545afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2546afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2547afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2548afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2549afb97888SIlya Dryomov 					   &img_req->object_extents,
2550afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2551afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2552afb97888SIlya Dryomov 		if (ret)
2553afb97888SIlya Dryomov 			return ret;
2554afb97888SIlya Dryomov 	}
2555afb97888SIlya Dryomov 
2556afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2557afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2558afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2559afb97888SIlya Dryomov 					      GFP_NOIO);
2560afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2561afb97888SIlya Dryomov 			return -ENOMEM;
2562afb97888SIlya Dryomov 	}
2563afb97888SIlya Dryomov 
2564afb97888SIlya Dryomov 	/*
2565afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2566afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2567afb97888SIlya Dryomov 	 */
2568afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2569afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2570afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2571afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2572afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2573afb97888SIlya Dryomov 					   &img_req->object_extents,
2574afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2575afb97888SIlya Dryomov 		if (ret)
2576afb97888SIlya Dryomov 			return ret;
2577afb97888SIlya Dryomov 	}
2578afb97888SIlya Dryomov 
2579afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2580afb97888SIlya Dryomov }
2581afb97888SIlya Dryomov 
25825a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
25835a237819SIlya Dryomov 			       u64 off, u64 len)
25845a237819SIlya Dryomov {
25855a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
2586a55e601bSArnd Bergmann 	union rbd_img_fill_iter dummy = {};
25875a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
25885a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
25895a237819SIlya Dryomov 		.pos = &dummy,
25905a237819SIlya Dryomov 	};
25915a237819SIlya Dryomov 
25925a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
25935a237819SIlya Dryomov }
25945a237819SIlya Dryomov 
25955a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
25965a237819SIlya Dryomov {
25975a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
25985a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
25995a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
26005a237819SIlya Dryomov 
26015a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
26025a237819SIlya Dryomov 	obj_req->bio_pos = *it;
26035a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
26045a237819SIlya Dryomov }
26055a237819SIlya Dryomov 
2606afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2607afb97888SIlya Dryomov {
2608afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2609afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2610afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2611afb97888SIlya Dryomov 
2612afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2613afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2614afb97888SIlya Dryomov 		obj_req->bvec_count++;
2615afb97888SIlya Dryomov 	}));
2616afb97888SIlya Dryomov 
2617afb97888SIlya Dryomov }
2618afb97888SIlya Dryomov 
2619afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2620afb97888SIlya Dryomov {
2621afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2622afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2623afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2624afb97888SIlya Dryomov 
2625afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2626afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2627afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2628afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2629afb97888SIlya Dryomov 	}));
2630afb97888SIlya Dryomov }
2631afb97888SIlya Dryomov 
26325a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
26335a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
26345a237819SIlya Dryomov 				   u32 num_img_extents,
26355a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
26365a237819SIlya Dryomov {
26375a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
26385a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
26395a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
26405a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2641afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2642afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
26435a237819SIlya Dryomov 	};
26445a237819SIlya Dryomov 
26455a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
26465a237819SIlya Dryomov 				    &fctx);
26475a237819SIlya Dryomov }
26485a237819SIlya Dryomov 
26495a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
26505a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
26515a237819SIlya Dryomov {
26525a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
26535a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
26545a237819SIlya Dryomov 
26555a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
26565a237819SIlya Dryomov }
26575a237819SIlya Dryomov 
26585a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
26595a237819SIlya Dryomov {
26605a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
26615a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
26625a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
26635a237819SIlya Dryomov 
26645a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
26655a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
26665a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
26675a237819SIlya Dryomov }
26685a237819SIlya Dryomov 
2669afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2670afb97888SIlya Dryomov {
2671afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2672afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2673afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2674afb97888SIlya Dryomov 
2675afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2676afb97888SIlya Dryomov 		obj_req->bvec_count++;
2677afb97888SIlya Dryomov 	}));
2678afb97888SIlya Dryomov }
2679afb97888SIlya Dryomov 
2680afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2681afb97888SIlya Dryomov {
2682afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2683afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2684afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2685afb97888SIlya Dryomov 
2686afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2687afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2688afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2689afb97888SIlya Dryomov 	}));
2690afb97888SIlya Dryomov }
2691afb97888SIlya Dryomov 
26925a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
26935a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
26945a237819SIlya Dryomov 				     u32 num_img_extents,
26955a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
26965a237819SIlya Dryomov {
26975a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
26985a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
26995a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
27005a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2701afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2702afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
27035a237819SIlya Dryomov 	};
27045a237819SIlya Dryomov 
27055a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
27065a237819SIlya Dryomov 				    &fctx);
27075a237819SIlya Dryomov }
27085a237819SIlya Dryomov 
27095a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
27105a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
27115a237819SIlya Dryomov 				   u32 num_img_extents,
27125a237819SIlya Dryomov 				   struct bio_vec *bvecs)
27135a237819SIlya Dryomov {
27145a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
27155a237819SIlya Dryomov 		.bvecs = bvecs,
27165a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
27175a237819SIlya Dryomov 							     num_img_extents) },
27185a237819SIlya Dryomov 	};
27195a237819SIlya Dryomov 
27205a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
27215a237819SIlya Dryomov 					 &it);
27225a237819SIlya Dryomov }
27235a237819SIlya Dryomov 
27240192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work)
2725bf0d5f50SAlex Elder {
27260192ce2eSIlya Dryomov 	struct rbd_img_request *img_req =
27270192ce2eSIlya Dryomov 	    container_of(work, struct rbd_img_request, work);
2728bf0d5f50SAlex Elder 
27290192ce2eSIlya Dryomov 	rbd_img_handle_request(img_req, img_req->work_result);
27300192ce2eSIlya Dryomov }
2731bf0d5f50SAlex Elder 
27320192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
27330192ce2eSIlya Dryomov {
27340192ce2eSIlya Dryomov 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
27350192ce2eSIlya Dryomov 	img_req->work_result = result;
27360192ce2eSIlya Dryomov 	queue_work(rbd_wq, &img_req->work);
2737bf0d5f50SAlex Elder }
2738bf0d5f50SAlex Elder 
273922e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
274022e8bd51SIlya Dryomov {
274122e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
274222e8bd51SIlya Dryomov 
274322e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
274422e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
274522e8bd51SIlya Dryomov 		return true;
274622e8bd51SIlya Dryomov 	}
274722e8bd51SIlya Dryomov 
274822e8bd51SIlya Dryomov 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
274922e8bd51SIlya Dryomov 	     obj_req->ex.oe_objno);
275022e8bd51SIlya Dryomov 	return false;
275122e8bd51SIlya Dryomov }
275222e8bd51SIlya Dryomov 
275385b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
275485b5e6d1SIlya Dryomov {
2755a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
2756a086a1b8SIlya Dryomov 	int ret;
2757a086a1b8SIlya Dryomov 
2758a086a1b8SIlya Dryomov 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2759a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
2760a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
2761a086a1b8SIlya Dryomov 
2762a086a1b8SIlya Dryomov 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2763a086a1b8SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2764a086a1b8SIlya Dryomov 	rbd_osd_setup_data(osd_req, 0);
2765a086a1b8SIlya Dryomov 	rbd_osd_format_read(osd_req);
2766a086a1b8SIlya Dryomov 
2767a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2768a086a1b8SIlya Dryomov 	if (ret)
2769a086a1b8SIlya Dryomov 		return ret;
2770a086a1b8SIlya Dryomov 
2771a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
277285b5e6d1SIlya Dryomov 	return 0;
2773bf0d5f50SAlex Elder }
2774bf0d5f50SAlex Elder 
277586bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
27763da691bfSIlya Dryomov {
27773da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
2778a52cc685SIlya Dryomov 	struct rbd_device *parent = img_req->rbd_dev->parent;
27793da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
27803da691bfSIlya Dryomov 	int ret;
27813da691bfSIlya Dryomov 
278259e542c8SIlya Dryomov 	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
27833da691bfSIlya Dryomov 	if (!child_img_req)
27843da691bfSIlya Dryomov 		return -ENOMEM;
27853da691bfSIlya Dryomov 
278659e542c8SIlya Dryomov 	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2787e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2788e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2789e93aca0aSIlya Dryomov 
2790a52cc685SIlya Dryomov 	down_read(&parent->header_rwsem);
2791a52cc685SIlya Dryomov 	rbd_img_capture_header(child_img_req);
2792a52cc685SIlya Dryomov 	up_read(&parent->header_rwsem);
2793a52cc685SIlya Dryomov 
279421ed05a8SIlya Dryomov 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
279521ed05a8SIlya Dryomov 	     obj_req);
279621ed05a8SIlya Dryomov 
27973da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2798ecc633caSIlya Dryomov 		switch (img_req->data_type) {
27993da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
28005a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
28015a237819SIlya Dryomov 						      obj_req->img_extents,
28025a237819SIlya Dryomov 						      obj_req->num_img_extents,
28033da691bfSIlya Dryomov 						      &obj_req->bio_pos);
28043da691bfSIlya Dryomov 			break;
28053da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2806afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
28075a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
28085a237819SIlya Dryomov 						      obj_req->img_extents,
28095a237819SIlya Dryomov 						      obj_req->num_img_extents,
28103da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
28113da691bfSIlya Dryomov 			break;
28123da691bfSIlya Dryomov 		default:
2813d342a15bSArnd Bergmann 			BUG();
28143da691bfSIlya Dryomov 		}
28153da691bfSIlya Dryomov 	} else {
28165a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
28175a237819SIlya Dryomov 					      obj_req->img_extents,
28185a237819SIlya Dryomov 					      obj_req->num_img_extents,
28195a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
28203da691bfSIlya Dryomov 	}
28213da691bfSIlya Dryomov 	if (ret) {
2822679a97d2SHannes Reinecke 		rbd_img_request_destroy(child_img_req);
2823663ae2ccSIlya Dryomov 		return ret;
2824bf0d5f50SAlex Elder 	}
2825bf0d5f50SAlex Elder 
28260192ce2eSIlya Dryomov 	/* avoid parent chain recursion */
28270192ce2eSIlya Dryomov 	rbd_img_schedule(child_img_req, 0);
28283da691bfSIlya Dryomov 	return 0;
28293da691bfSIlya Dryomov }
28303da691bfSIlya Dryomov 
/*
 * Advance the read state machine of @obj_req (obj_req->read_state).
 *
 * @result is both input and output: on entry it carries the outcome of
 * the step that just finished, on return it carries the outcome of this
 * step.
 *
 * Returns false after a read (of the object or of the parent) has been
 * issued and the state moved on; returns true in every other case.
 * NOTE(review): presumably "true" tells the caller that the object
 * request is finished -- confirm against the caller.
 */
283185b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
28328b3e1a56SAlex Elder {
28333da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
28343da691bfSIlya Dryomov 	int ret;
28358b3e1a56SAlex Elder 
283622e8bd51SIlya Dryomov again:
2837a9b67e69SIlya Dryomov 	switch (obj_req->read_state) {
283885b5e6d1SIlya Dryomov 	case RBD_OBJ_READ_START:
283985b5e6d1SIlya Dryomov 		rbd_assert(!*result);
284085b5e6d1SIlya Dryomov 
		/*
		 * The object is assumed not to exist: don't issue the read,
		 * behave as if it had come back with -ENOENT and re-enter
		 * the switch in the RBD_OBJ_READ_OBJECT state.
		 */
284122e8bd51SIlya Dryomov 		if (!rbd_obj_may_exist(obj_req)) {
284222e8bd51SIlya Dryomov 			*result = -ENOENT;
284322e8bd51SIlya Dryomov 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
284422e8bd51SIlya Dryomov 			goto again;
284522e8bd51SIlya Dryomov 		}
284622e8bd51SIlya Dryomov 
284785b5e6d1SIlya Dryomov 		ret = rbd_obj_read_object(obj_req);
284885b5e6d1SIlya Dryomov 		if (ret) {
284985b5e6d1SIlya Dryomov 			*result = ret;
285085b5e6d1SIlya Dryomov 			return true;
285185b5e6d1SIlya Dryomov 		}
285285b5e6d1SIlya Dryomov 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
285385b5e6d1SIlya Dryomov 		return false;
2854a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_OBJECT:
		/*
		 * Object missing and a non-zero parent overlap: try the
		 * parent.  If no image extents result, fall through to the
		 * zero-fill handling below.
		 */
2855a9b67e69SIlya Dryomov 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
285686bd7998SIlya Dryomov 			/* reverse map this object extent onto the parent */
285786bd7998SIlya Dryomov 			ret = rbd_obj_calc_img_extents(obj_req, false);
285886bd7998SIlya Dryomov 			if (ret) {
285954ab3b24SIlya Dryomov 				*result = ret;
286086bd7998SIlya Dryomov 				return true;
286186bd7998SIlya Dryomov 			}
286286bd7998SIlya Dryomov 			if (obj_req->num_img_extents) {
286386bd7998SIlya Dryomov 				ret = rbd_obj_read_from_parent(obj_req);
28643da691bfSIlya Dryomov 				if (ret) {
286554ab3b24SIlya Dryomov 					*result = ret;
28663da691bfSIlya Dryomov 					return true;
28673da691bfSIlya Dryomov 				}
2868a9b67e69SIlya Dryomov 				obj_req->read_state = RBD_OBJ_READ_PARENT;
28693da691bfSIlya Dryomov 				return false;
28703da691bfSIlya Dryomov 			}
287186bd7998SIlya Dryomov 		}
287202c74fbaSAlex Elder 
287302c74fbaSAlex Elder 		/*
28743da691bfSIlya Dryomov 		 * -ENOENT means a hole in the image -- zero-fill the entire
28753da691bfSIlya Dryomov 		 * length of the request.  A short read also implies zero-fill
287654ab3b24SIlya Dryomov 		 * to the end of the request.
287702c74fbaSAlex Elder 		 */
		/*
		 * A non-negative *result is treated as the number of bytes
		 * read.  Negative values other than -ENOENT are left
		 * untouched.
		 */
287854ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
287954ab3b24SIlya Dryomov 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
288054ab3b24SIlya Dryomov 			*result = 0;
288154ab3b24SIlya Dryomov 		} else if (*result >= 0) {
288254ab3b24SIlya Dryomov 			if (*result < obj_req->ex.oe_len)
288354ab3b24SIlya Dryomov 				rbd_obj_zero_range(obj_req, *result,
288454ab3b24SIlya Dryomov 						obj_req->ex.oe_len - *result);
288554ab3b24SIlya Dryomov 			else
288654ab3b24SIlya Dryomov 				rbd_assert(*result == obj_req->ex.oe_len);
288754ab3b24SIlya Dryomov 			*result = 0;
28883da691bfSIlya Dryomov 		}
28893da691bfSIlya Dryomov 		return true;
2890a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_PARENT:
2891d435c9a7SIlya Dryomov 		/*
2892d435c9a7SIlya Dryomov 		 * The parent image is read only up to the overlap -- zero-fill
2893d435c9a7SIlya Dryomov 		 * from the overlap to the end of the request.
2894d435c9a7SIlya Dryomov 		 */
		/* a non-zero *result from the parent read is returned as is */
2895d435c9a7SIlya Dryomov 		if (!*result) {
2896d435c9a7SIlya Dryomov 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2897d435c9a7SIlya Dryomov 
2898d435c9a7SIlya Dryomov 			if (obj_overlap < obj_req->ex.oe_len)
2899d435c9a7SIlya Dryomov 				rbd_obj_zero_range(obj_req, obj_overlap,
2900d435c9a7SIlya Dryomov 					    obj_req->ex.oe_len - obj_overlap);
2901d435c9a7SIlya Dryomov 		}
2902a9b67e69SIlya Dryomov 		return true;
2903a9b67e69SIlya Dryomov 	default:
2904a9b67e69SIlya Dryomov 		BUG();
2905a9b67e69SIlya Dryomov 	}
29063da691bfSIlya Dryomov }
29073da691bfSIlya Dryomov 
290822e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
290922e8bd51SIlya Dryomov {
291022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
291122e8bd51SIlya Dryomov 
291222e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
291322e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
291422e8bd51SIlya Dryomov 
291522e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
291622e8bd51SIlya Dryomov 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
291722e8bd51SIlya Dryomov 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
29183da691bfSIlya Dryomov 		return true;
29193da691bfSIlya Dryomov 	}
29203da691bfSIlya Dryomov 
292122e8bd51SIlya Dryomov 	return false;
292222e8bd51SIlya Dryomov }
292322e8bd51SIlya Dryomov 
292422e8bd51SIlya Dryomov /*
292522e8bd51SIlya Dryomov  * Return:
292622e8bd51SIlya Dryomov  *   0 - object map update sent
292722e8bd51SIlya Dryomov  *   1 - object map update isn't needed
292822e8bd51SIlya Dryomov  *  <0 - error
292922e8bd51SIlya Dryomov  */
293022e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
293122e8bd51SIlya Dryomov {
293222e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
293322e8bd51SIlya Dryomov 	u8 new_state;
293422e8bd51SIlya Dryomov 
293522e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
293622e8bd51SIlya Dryomov 		return 1;
293722e8bd51SIlya Dryomov 
293822e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
293922e8bd51SIlya Dryomov 		new_state = OBJECT_PENDING;
294022e8bd51SIlya Dryomov 	else
294122e8bd51SIlya Dryomov 		new_state = OBJECT_EXISTS;
294222e8bd51SIlya Dryomov 
294322e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
294422e8bd51SIlya Dryomov }
294522e8bd51SIlya Dryomov 
294685b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
294785b5e6d1SIlya Dryomov {
2948a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
2949a086a1b8SIlya Dryomov 	int num_ops = count_write_ops(obj_req);
2950a086a1b8SIlya Dryomov 	int which = 0;
2951a086a1b8SIlya Dryomov 	int ret;
2952a086a1b8SIlya Dryomov 
2953a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2954a086a1b8SIlya Dryomov 		num_ops++; /* stat */
2955a086a1b8SIlya Dryomov 
2956a086a1b8SIlya Dryomov 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2957a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
2958a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
2959a086a1b8SIlya Dryomov 
2960a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2961a086a1b8SIlya Dryomov 		ret = rbd_osd_setup_stat(osd_req, which++);
2962a086a1b8SIlya Dryomov 		if (ret)
2963a086a1b8SIlya Dryomov 			return ret;
2964a086a1b8SIlya Dryomov 	}
2965a086a1b8SIlya Dryomov 
2966a086a1b8SIlya Dryomov 	rbd_osd_setup_write_ops(osd_req, which);
2967a086a1b8SIlya Dryomov 	rbd_osd_format_write(osd_req);
2968a086a1b8SIlya Dryomov 
2969a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2970a086a1b8SIlya Dryomov 	if (ret)
2971a086a1b8SIlya Dryomov 		return ret;
2972a086a1b8SIlya Dryomov 
2973a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
297485b5e6d1SIlya Dryomov 	return 0;
297585b5e6d1SIlya Dryomov }
297685b5e6d1SIlya Dryomov 
29773da691bfSIlya Dryomov /*
29783da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
29793da691bfSIlya Dryomov  */
29803da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
29813da691bfSIlya Dryomov {
29823da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
29833da691bfSIlya Dryomov 		.bvecs = bvecs,
29843da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
29853da691bfSIlya Dryomov 	};
29863da691bfSIlya Dryomov 
29873da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
2988cf58b537SChristoph Hellwig 		if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
29893da691bfSIlya Dryomov 			return false;
29903da691bfSIlya Dryomov 	}));
29913da691bfSIlya Dryomov 	return true;
29923da691bfSIlya Dryomov }
29933da691bfSIlya Dryomov 
/*
 * Special "bytes" value for rbd_obj_copyup_current_snapc(): no copyup op
 * is added to the OSD request, only the write ops are sent.
 */
29943a482501SIlya Dryomov #define MODS_ONLY	U32_MAX
29953a482501SIlya Dryomov 
2996793333a3SIlya Dryomov static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
299789a59c1cSIlya Dryomov 				      u32 bytes)
29983da691bfSIlya Dryomov {
2999bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
3000fe943d50SChengguang Xu 	int ret;
30013da691bfSIlya Dryomov 
30023da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
300389a59c1cSIlya Dryomov 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
30043da691bfSIlya Dryomov 
3005bcbab1dbSIlya Dryomov 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3006bcbab1dbSIlya Dryomov 	if (IS_ERR(osd_req))
3007bcbab1dbSIlya Dryomov 		return PTR_ERR(osd_req);
30083da691bfSIlya Dryomov 
3009b5ae8cbcSIlya Dryomov 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3010fe943d50SChengguang Xu 	if (ret)
3011fe943d50SChengguang Xu 		return ret;
3012fe943d50SChengguang Xu 
3013bcbab1dbSIlya Dryomov 	rbd_osd_format_write(osd_req);
30143da691bfSIlya Dryomov 
3015bcbab1dbSIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
301689a59c1cSIlya Dryomov 	if (ret)
301789a59c1cSIlya Dryomov 		return ret;
301889a59c1cSIlya Dryomov 
3019a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
302089a59c1cSIlya Dryomov 	return 0;
302189a59c1cSIlya Dryomov }
302289a59c1cSIlya Dryomov 
3023793333a3SIlya Dryomov static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3024793333a3SIlya Dryomov 					u32 bytes)
30253da691bfSIlya Dryomov {
3026bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
3027a086a1b8SIlya Dryomov 	int num_ops = count_write_ops(obj_req);
3028a086a1b8SIlya Dryomov 	int which = 0;
30293da691bfSIlya Dryomov 	int ret;
30303da691bfSIlya Dryomov 
30313da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
30323da691bfSIlya Dryomov 
3033a086a1b8SIlya Dryomov 	if (bytes != MODS_ONLY)
3034a086a1b8SIlya Dryomov 		num_ops++; /* copyup */
303513488d53SIlya Dryomov 
3036a086a1b8SIlya Dryomov 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3037bcbab1dbSIlya Dryomov 	if (IS_ERR(osd_req))
3038bcbab1dbSIlya Dryomov 		return PTR_ERR(osd_req);
30393da691bfSIlya Dryomov 
30403a482501SIlya Dryomov 	if (bytes != MODS_ONLY) {
3041b5ae8cbcSIlya Dryomov 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
30423da691bfSIlya Dryomov 		if (ret)
30433da691bfSIlya Dryomov 			return ret;
30443a482501SIlya Dryomov 	}
30453da691bfSIlya Dryomov 
3046a086a1b8SIlya Dryomov 	rbd_osd_setup_write_ops(osd_req, which);
3047a086a1b8SIlya Dryomov 	rbd_osd_format_write(osd_req);
30483da691bfSIlya Dryomov 
3049bcbab1dbSIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
30503da691bfSIlya Dryomov 	if (ret)
30513da691bfSIlya Dryomov 		return ret;
30523da691bfSIlya Dryomov 
3053a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
30543da691bfSIlya Dryomov 	return 0;
30553da691bfSIlya Dryomov }
30563da691bfSIlya Dryomov 
30577e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
30587e07efb1SIlya Dryomov {
30597e07efb1SIlya Dryomov 	u32 i;
30607e07efb1SIlya Dryomov 
30617e07efb1SIlya Dryomov 	rbd_assert(!obj_req->copyup_bvecs);
30627e07efb1SIlya Dryomov 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
30637e07efb1SIlya Dryomov 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
30647e07efb1SIlya Dryomov 					sizeof(*obj_req->copyup_bvecs),
30657e07efb1SIlya Dryomov 					GFP_NOIO);
30667e07efb1SIlya Dryomov 	if (!obj_req->copyup_bvecs)
30677e07efb1SIlya Dryomov 		return -ENOMEM;
30687e07efb1SIlya Dryomov 
30697e07efb1SIlya Dryomov 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
30707e07efb1SIlya Dryomov 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
30717e07efb1SIlya Dryomov 
30727e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
30737e07efb1SIlya Dryomov 		if (!obj_req->copyup_bvecs[i].bv_page)
30747e07efb1SIlya Dryomov 			return -ENOMEM;
30757e07efb1SIlya Dryomov 
30767e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_offset = 0;
30777e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_len = len;
30787e07efb1SIlya Dryomov 		obj_overlap -= len;
30797e07efb1SIlya Dryomov 	}
30807e07efb1SIlya Dryomov 
30817e07efb1SIlya Dryomov 	rbd_assert(!obj_overlap);
30827e07efb1SIlya Dryomov 	return 0;
30837e07efb1SIlya Dryomov }
30847e07efb1SIlya Dryomov 
30850ad5d953SIlya Dryomov /*
30860ad5d953SIlya Dryomov  * The target object doesn't exist.  Read the data for the entire
30870ad5d953SIlya Dryomov  * target object up to the overlap point (if any) from the parent,
30880ad5d953SIlya Dryomov  * so we can use it for a copyup.
30890ad5d953SIlya Dryomov  */
3090793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
30913da691bfSIlya Dryomov {
30923da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
30933da691bfSIlya Dryomov 	int ret;
30943da691bfSIlya Dryomov 
309586bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
309686bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
309786bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
309886bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
30993da691bfSIlya Dryomov 		/*
31003da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
31013a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
31023a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
31033a482501SIlya Dryomov 		 * anymore.
31043da691bfSIlya Dryomov 		 */
3105793333a3SIlya Dryomov 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
31063da691bfSIlya Dryomov 	}
31073da691bfSIlya Dryomov 
310886bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
31093da691bfSIlya Dryomov 	if (ret)
31103da691bfSIlya Dryomov 		return ret;
31113da691bfSIlya Dryomov 
311286bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
31133da691bfSIlya Dryomov }
31143da691bfSIlya Dryomov 
311522e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
31163da691bfSIlya Dryomov {
311722e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
311822e8bd51SIlya Dryomov 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
311922e8bd51SIlya Dryomov 	u8 new_state;
312022e8bd51SIlya Dryomov 	u32 i;
31213da691bfSIlya Dryomov 	int ret;
31223da691bfSIlya Dryomov 
312322e8bd51SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31243da691bfSIlya Dryomov 
312522e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
312622e8bd51SIlya Dryomov 		return;
312789a59c1cSIlya Dryomov 
312822e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
312922e8bd51SIlya Dryomov 		return;
31303da691bfSIlya Dryomov 
313122e8bd51SIlya Dryomov 	for (i = 0; i < snapc->num_snaps; i++) {
313222e8bd51SIlya Dryomov 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
313322e8bd51SIlya Dryomov 		    i + 1 < snapc->num_snaps)
313422e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS_CLEAN;
313522e8bd51SIlya Dryomov 		else
313622e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS;
31373da691bfSIlya Dryomov 
313822e8bd51SIlya Dryomov 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
313922e8bd51SIlya Dryomov 					    new_state, NULL);
314022e8bd51SIlya Dryomov 		if (ret < 0) {
314122e8bd51SIlya Dryomov 			obj_req->pending.result = ret;
314202c74fbaSAlex Elder 			return;
314302c74fbaSAlex Elder 		}
314402c74fbaSAlex Elder 
314522e8bd51SIlya Dryomov 		rbd_assert(!ret);
314622e8bd51SIlya Dryomov 		obj_req->pending.num_pending++;
3147a9e8ba2cSAlex Elder 	}
31488b3e1a56SAlex Elder }
31498b3e1a56SAlex Elder 
3150793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
31518b3e1a56SAlex Elder {
3152793333a3SIlya Dryomov 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3153793333a3SIlya Dryomov 	int ret;
31548b3e1a56SAlex Elder 
3155793333a3SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31568b3e1a56SAlex Elder 
3157793333a3SIlya Dryomov 	/*
3158793333a3SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
3159793333a3SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
3160793333a3SIlya Dryomov 	 * existing.
3161793333a3SIlya Dryomov 	 */
3162793333a3SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3163793333a3SIlya Dryomov 		bytes = 0;
3164793333a3SIlya Dryomov 
3165793333a3SIlya Dryomov 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3166793333a3SIlya Dryomov 		/*
3167793333a3SIlya Dryomov 		 * Send a copyup request with an empty snapshot context to
3168793333a3SIlya Dryomov 		 * deep-copyup the object through all existing snapshots.
3169793333a3SIlya Dryomov 		 * A second request with the current snapshot context will be
3170793333a3SIlya Dryomov 		 * sent for the actual modification.
3171793333a3SIlya Dryomov 		 */
3172793333a3SIlya Dryomov 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3173793333a3SIlya Dryomov 		if (ret) {
3174793333a3SIlya Dryomov 			obj_req->pending.result = ret;
3175793333a3SIlya Dryomov 			return;
31767114edacSIlya Dryomov 		}
31778b3e1a56SAlex Elder 
3178793333a3SIlya Dryomov 		obj_req->pending.num_pending++;
3179793333a3SIlya Dryomov 		bytes = MODS_ONLY;
31803da691bfSIlya Dryomov 	}
31818b3e1a56SAlex Elder 
3182793333a3SIlya Dryomov 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3183793333a3SIlya Dryomov 	if (ret) {
3184793333a3SIlya Dryomov 		obj_req->pending.result = ret;
3185793333a3SIlya Dryomov 		return;
3186793333a3SIlya Dryomov 	}
3187793333a3SIlya Dryomov 
3188793333a3SIlya Dryomov 	obj_req->pending.num_pending++;
3189793333a3SIlya Dryomov }
3190793333a3SIlya Dryomov 
3191793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
31923da691bfSIlya Dryomov {
319322e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3194793333a3SIlya Dryomov 	int ret;
31957114edacSIlya Dryomov 
31967114edacSIlya Dryomov again:
3197793333a3SIlya Dryomov 	switch (obj_req->copyup_state) {
3198793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_START:
3199793333a3SIlya Dryomov 		rbd_assert(!*result);
32003da691bfSIlya Dryomov 
3201793333a3SIlya Dryomov 		ret = rbd_obj_copyup_read_parent(obj_req);
3202793333a3SIlya Dryomov 		if (ret) {
3203793333a3SIlya Dryomov 			*result = ret;
3204793333a3SIlya Dryomov 			return true;
3205793333a3SIlya Dryomov 		}
3206793333a3SIlya Dryomov 		if (obj_req->num_img_extents)
3207793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3208793333a3SIlya Dryomov 		else
3209793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3210793333a3SIlya Dryomov 		return false;
3211793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_READ_PARENT:
3212793333a3SIlya Dryomov 		if (*result)
3213793333a3SIlya Dryomov 			return true;
3214793333a3SIlya Dryomov 
3215793333a3SIlya Dryomov 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3216793333a3SIlya Dryomov 				  rbd_obj_img_extents_bytes(obj_req))) {
3217793333a3SIlya Dryomov 			dout("%s %p detected zeros\n", __func__, obj_req);
3218793333a3SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
32197114edacSIlya Dryomov 		}
32207114edacSIlya Dryomov 
322122e8bd51SIlya Dryomov 		rbd_obj_copyup_object_maps(obj_req);
322222e8bd51SIlya Dryomov 		if (!obj_req->pending.num_pending) {
322322e8bd51SIlya Dryomov 			*result = obj_req->pending.result;
322422e8bd51SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
32257114edacSIlya Dryomov 			goto again;
32267114edacSIlya Dryomov 		}
322722e8bd51SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
322822e8bd51SIlya Dryomov 		return false;
322922e8bd51SIlya Dryomov 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
323022e8bd51SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
323122e8bd51SIlya Dryomov 			return false;
3232df561f66SGustavo A. R. Silva 		fallthrough;
323322e8bd51SIlya Dryomov 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
323422e8bd51SIlya Dryomov 		if (*result) {
323522e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "snap object map update failed: %d",
323622e8bd51SIlya Dryomov 				 *result);
323722e8bd51SIlya Dryomov 			return true;
323822e8bd51SIlya Dryomov 		}
323922e8bd51SIlya Dryomov 
3240793333a3SIlya Dryomov 		rbd_obj_copyup_write_object(obj_req);
3241793333a3SIlya Dryomov 		if (!obj_req->pending.num_pending) {
3242793333a3SIlya Dryomov 			*result = obj_req->pending.result;
3243793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3244793333a3SIlya Dryomov 			goto again;
3245793333a3SIlya Dryomov 		}
3246793333a3SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3247793333a3SIlya Dryomov 		return false;
3248793333a3SIlya Dryomov 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3249793333a3SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
3250793333a3SIlya Dryomov 			return false;
3251df561f66SGustavo A. R. Silva 		fallthrough;
3252793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3253793333a3SIlya Dryomov 		return true;
3254793333a3SIlya Dryomov 	default:
3255793333a3SIlya Dryomov 		BUG();
3256793333a3SIlya Dryomov 	}
3257793333a3SIlya Dryomov }
3258793333a3SIlya Dryomov 
325922e8bd51SIlya Dryomov /*
326022e8bd51SIlya Dryomov  * Return:
326122e8bd51SIlya Dryomov  *   0 - object map update sent
326222e8bd51SIlya Dryomov  *   1 - object map update isn't needed
326322e8bd51SIlya Dryomov  *  <0 - error
326422e8bd51SIlya Dryomov  */
326522e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
326622e8bd51SIlya Dryomov {
326722e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
326822e8bd51SIlya Dryomov 	u8 current_state = OBJECT_PENDING;
326922e8bd51SIlya Dryomov 
327022e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
327122e8bd51SIlya Dryomov 		return 1;
327222e8bd51SIlya Dryomov 
327322e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
327422e8bd51SIlya Dryomov 		return 1;
327522e8bd51SIlya Dryomov 
327622e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
327722e8bd51SIlya Dryomov 				     &current_state);
327822e8bd51SIlya Dryomov }
327922e8bd51SIlya Dryomov 
328085b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3281b8d70035SAlex Elder {
3282793333a3SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3283b8d70035SAlex Elder 	int ret;
3284b8d70035SAlex Elder 
3285793333a3SIlya Dryomov again:
3286cf81b60eSAlex Elder 	switch (obj_req->write_state) {
328785b5e6d1SIlya Dryomov 	case RBD_OBJ_WRITE_START:
328885b5e6d1SIlya Dryomov 		rbd_assert(!*result);
328985b5e6d1SIlya Dryomov 
329022e8bd51SIlya Dryomov 		if (rbd_obj_write_is_noop(obj_req))
329122e8bd51SIlya Dryomov 			return true;
329222e8bd51SIlya Dryomov 
329322e8bd51SIlya Dryomov 		ret = rbd_obj_write_pre_object_map(obj_req);
329422e8bd51SIlya Dryomov 		if (ret < 0) {
329522e8bd51SIlya Dryomov 			*result = ret;
329622e8bd51SIlya Dryomov 			return true;
329722e8bd51SIlya Dryomov 		}
329822e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
329922e8bd51SIlya Dryomov 		if (ret > 0)
330022e8bd51SIlya Dryomov 			goto again;
330122e8bd51SIlya Dryomov 		return false;
330222e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
330322e8bd51SIlya Dryomov 		if (*result) {
330422e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "pre object map update failed: %d",
330522e8bd51SIlya Dryomov 				 *result);
330622e8bd51SIlya Dryomov 			return true;
330722e8bd51SIlya Dryomov 		}
330885b5e6d1SIlya Dryomov 		ret = rbd_obj_write_object(obj_req);
330985b5e6d1SIlya Dryomov 		if (ret) {
331085b5e6d1SIlya Dryomov 			*result = ret;
331185b5e6d1SIlya Dryomov 			return true;
331285b5e6d1SIlya Dryomov 		}
331385b5e6d1SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
331485b5e6d1SIlya Dryomov 		return false;
33150ad5d953SIlya Dryomov 	case RBD_OBJ_WRITE_OBJECT:
331654ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
33170ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3318793333a3SIlya Dryomov 				*result = 0;
3319793333a3SIlya Dryomov 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3320793333a3SIlya Dryomov 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3321793333a3SIlya Dryomov 				goto again;
3322b8d70035SAlex Elder 			}
33230ad5d953SIlya Dryomov 			/*
33240ad5d953SIlya Dryomov 			 * On a non-existent object:
33250ad5d953SIlya Dryomov 			 *   delete - -ENOENT, truncate/zero - 0
33260ad5d953SIlya Dryomov 			 */
33270ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
33280ad5d953SIlya Dryomov 				*result = 0;
33290ad5d953SIlya Dryomov 		}
3330793333a3SIlya Dryomov 		if (*result)
3331793333a3SIlya Dryomov 			return true;
3332793333a3SIlya Dryomov 
3333793333a3SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3334793333a3SIlya Dryomov 		goto again;
3335793333a3SIlya Dryomov 	case __RBD_OBJ_WRITE_COPYUP:
3336793333a3SIlya Dryomov 		if (!rbd_obj_advance_copyup(obj_req, result))
3337793333a3SIlya Dryomov 			return false;
3338df561f66SGustavo A. R. Silva 		fallthrough;
3339793333a3SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP:
334022e8bd51SIlya Dryomov 		if (*result) {
3341793333a3SIlya Dryomov 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3342b8d70035SAlex Elder 			return true;
334322e8bd51SIlya Dryomov 		}
334422e8bd51SIlya Dryomov 		ret = rbd_obj_write_post_object_map(obj_req);
334522e8bd51SIlya Dryomov 		if (ret < 0) {
334622e8bd51SIlya Dryomov 			*result = ret;
334722e8bd51SIlya Dryomov 			return true;
334822e8bd51SIlya Dryomov 		}
334922e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
335022e8bd51SIlya Dryomov 		if (ret > 0)
335122e8bd51SIlya Dryomov 			goto again;
335222e8bd51SIlya Dryomov 		return false;
335322e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
335422e8bd51SIlya Dryomov 		if (*result)
335522e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "post object map update failed: %d",
335622e8bd51SIlya Dryomov 				 *result);
335722e8bd51SIlya Dryomov 		return true;
3358b8d70035SAlex Elder 	default:
3359b8d70035SAlex Elder 		BUG();
3360b8d70035SAlex Elder 	}
3361b8d70035SAlex Elder }
3362b8d70035SAlex Elder 
3363b8d70035SAlex Elder /*
33640ad5d953SIlya Dryomov  * Return true if @obj_req is completed.
3365b8d70035SAlex Elder  */
336654ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
336754ab3b24SIlya Dryomov 				     int *result)
3368b8d70035SAlex Elder {
33690ad5d953SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
33700192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
33710ad5d953SIlya Dryomov 	bool done;
33720ad5d953SIlya Dryomov 
337385b5e6d1SIlya Dryomov 	mutex_lock(&obj_req->state_mutex);
33740ad5d953SIlya Dryomov 	if (!rbd_img_is_write(img_req))
337585b5e6d1SIlya Dryomov 		done = rbd_obj_advance_read(obj_req, result);
33760ad5d953SIlya Dryomov 	else
337785b5e6d1SIlya Dryomov 		done = rbd_obj_advance_write(obj_req, result);
337885b5e6d1SIlya Dryomov 	mutex_unlock(&obj_req->state_mutex);
33790ad5d953SIlya Dryomov 
33800192ce2eSIlya Dryomov 	if (done && *result) {
33810192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
33820192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
33830192ce2eSIlya Dryomov 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
33840192ce2eSIlya Dryomov 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
33850192ce2eSIlya Dryomov 	}
33860ad5d953SIlya Dryomov 	return done;
33879969ebc5SAlex Elder }
33889969ebc5SAlex Elder 
33890192ce2eSIlya Dryomov /*
33900192ce2eSIlya Dryomov  * This is open-coded in rbd_img_handle_request() to avoid parent chain
33910192ce2eSIlya Dryomov  * recursion.
33920192ce2eSIlya Dryomov  */
339354ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
33949969ebc5SAlex Elder {
33950192ce2eSIlya Dryomov 	if (__rbd_obj_handle_request(obj_req, &result))
33960192ce2eSIlya Dryomov 		rbd_img_handle_request(obj_req->img_request, result);
33977114edacSIlya Dryomov }
33987114edacSIlya Dryomov 
3399e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req)
3400e1fddc8fSIlya Dryomov {
3401e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3402e1fddc8fSIlya Dryomov 
3403e1fddc8fSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3404e1fddc8fSIlya Dryomov 		return false;
3405e1fddc8fSIlya Dryomov 
34063fe69921SIlya Dryomov 	if (rbd_is_ro(rbd_dev))
3407e1fddc8fSIlya Dryomov 		return false;
3408e1fddc8fSIlya Dryomov 
3409e1fddc8fSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
341022e8bd51SIlya Dryomov 	if (rbd_dev->opts->lock_on_read ||
341122e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3412e1fddc8fSIlya Dryomov 		return true;
3413e1fddc8fSIlya Dryomov 
3414e1fddc8fSIlya Dryomov 	return rbd_img_is_write(img_req);
3415e1fddc8fSIlya Dryomov }
3416e1fddc8fSIlya Dryomov 
3417637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3418e1fddc8fSIlya Dryomov {
3419e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3420637cd060SIlya Dryomov 	bool locked;
3421e1fddc8fSIlya Dryomov 
3422e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3423637cd060SIlya Dryomov 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3424e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3425e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&img_req->lock_item));
3426637cd060SIlya Dryomov 	if (!locked)
3427637cd060SIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3428637cd060SIlya Dryomov 	else
3429e1fddc8fSIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3430e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3431637cd060SIlya Dryomov 	return locked;
3432e1fddc8fSIlya Dryomov }
3433e1fddc8fSIlya Dryomov 
3434e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req)
3435e1fddc8fSIlya Dryomov {
3436e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3437e1fddc8fSIlya Dryomov 	bool need_wakeup;
3438e1fddc8fSIlya Dryomov 
3439e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3440e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3441e1fddc8fSIlya Dryomov 	rbd_assert(!list_empty(&img_req->lock_item));
3442e1fddc8fSIlya Dryomov 	list_del_init(&img_req->lock_item);
3443e1fddc8fSIlya Dryomov 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3444e1fddc8fSIlya Dryomov 		       list_empty(&rbd_dev->running_list));
3445e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3446e1fddc8fSIlya Dryomov 	if (need_wakeup)
3447e1fddc8fSIlya Dryomov 		complete(&rbd_dev->releasing_wait);
3448e1fddc8fSIlya Dryomov }
3449e1fddc8fSIlya Dryomov 
3450637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3451637cd060SIlya Dryomov {
3452637cd060SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3453637cd060SIlya Dryomov 
3454637cd060SIlya Dryomov 	if (!need_exclusive_lock(img_req))
3455637cd060SIlya Dryomov 		return 1;
3456637cd060SIlya Dryomov 
3457637cd060SIlya Dryomov 	if (rbd_lock_add_request(img_req))
3458637cd060SIlya Dryomov 		return 1;
3459637cd060SIlya Dryomov 
3460637cd060SIlya Dryomov 	if (rbd_dev->opts->exclusive) {
3461637cd060SIlya Dryomov 		WARN_ON(1); /* lock got released? */
3462637cd060SIlya Dryomov 		return -EROFS;
3463637cd060SIlya Dryomov 	}
3464637cd060SIlya Dryomov 
3465637cd060SIlya Dryomov 	/*
3466637cd060SIlya Dryomov 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3467637cd060SIlya Dryomov 	 * and cancel_delayed_work() in wake_lock_waiters().
3468637cd060SIlya Dryomov 	 */
3469637cd060SIlya Dryomov 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3470637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3471637cd060SIlya Dryomov 	return 0;
3472637cd060SIlya Dryomov }
3473637cd060SIlya Dryomov 
34740192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req)
34750192ce2eSIlya Dryomov {
34760192ce2eSIlya Dryomov 	struct rbd_obj_request *obj_req;
34770192ce2eSIlya Dryomov 
34780192ce2eSIlya Dryomov 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
34790192ce2eSIlya Dryomov 
34800192ce2eSIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
34810192ce2eSIlya Dryomov 		int result = 0;
34820192ce2eSIlya Dryomov 
34830192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
34840192ce2eSIlya Dryomov 			if (result) {
34850192ce2eSIlya Dryomov 				img_req->pending.result = result;
34860192ce2eSIlya Dryomov 				return;
34870192ce2eSIlya Dryomov 			}
34880192ce2eSIlya Dryomov 		} else {
34890192ce2eSIlya Dryomov 			img_req->pending.num_pending++;
34900192ce2eSIlya Dryomov 		}
34910192ce2eSIlya Dryomov 	}
34920192ce2eSIlya Dryomov }
34930192ce2eSIlya Dryomov 
34940192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
34950192ce2eSIlya Dryomov {
3496637cd060SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3497637cd060SIlya Dryomov 	int ret;
3498637cd060SIlya Dryomov 
34990192ce2eSIlya Dryomov again:
35000192ce2eSIlya Dryomov 	switch (img_req->state) {
35010192ce2eSIlya Dryomov 	case RBD_IMG_START:
35020192ce2eSIlya Dryomov 		rbd_assert(!*result);
35030192ce2eSIlya Dryomov 
3504637cd060SIlya Dryomov 		ret = rbd_img_exclusive_lock(img_req);
3505637cd060SIlya Dryomov 		if (ret < 0) {
3506637cd060SIlya Dryomov 			*result = ret;
3507637cd060SIlya Dryomov 			return true;
3508637cd060SIlya Dryomov 		}
3509637cd060SIlya Dryomov 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3510637cd060SIlya Dryomov 		if (ret > 0)
3511637cd060SIlya Dryomov 			goto again;
3512637cd060SIlya Dryomov 		return false;
3513637cd060SIlya Dryomov 	case RBD_IMG_EXCLUSIVE_LOCK:
3514637cd060SIlya Dryomov 		if (*result)
3515637cd060SIlya Dryomov 			return true;
3516637cd060SIlya Dryomov 
3517637cd060SIlya Dryomov 		rbd_assert(!need_exclusive_lock(img_req) ||
3518637cd060SIlya Dryomov 			   __rbd_is_lock_owner(rbd_dev));
3519637cd060SIlya Dryomov 
35200192ce2eSIlya Dryomov 		rbd_img_object_requests(img_req);
35210192ce2eSIlya Dryomov 		if (!img_req->pending.num_pending) {
35220192ce2eSIlya Dryomov 			*result = img_req->pending.result;
35230192ce2eSIlya Dryomov 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
35247114edacSIlya Dryomov 			goto again;
35257114edacSIlya Dryomov 		}
35260192ce2eSIlya Dryomov 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
35270192ce2eSIlya Dryomov 		return false;
35280192ce2eSIlya Dryomov 	case __RBD_IMG_OBJECT_REQUESTS:
35290192ce2eSIlya Dryomov 		if (!pending_result_dec(&img_req->pending, result))
35300192ce2eSIlya Dryomov 			return false;
3531df561f66SGustavo A. R. Silva 		fallthrough;
35320192ce2eSIlya Dryomov 	case RBD_IMG_OBJECT_REQUESTS:
35330192ce2eSIlya Dryomov 		return true;
35340192ce2eSIlya Dryomov 	default:
35350192ce2eSIlya Dryomov 		BUG();
35360192ce2eSIlya Dryomov 	}
35370192ce2eSIlya Dryomov }
35380192ce2eSIlya Dryomov 
35390192ce2eSIlya Dryomov /*
35400192ce2eSIlya Dryomov  * Return true if @img_req is completed.
35410192ce2eSIlya Dryomov  */
35420192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
35430192ce2eSIlya Dryomov 				     int *result)
35440192ce2eSIlya Dryomov {
35450192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
35460192ce2eSIlya Dryomov 	bool done;
35470192ce2eSIlya Dryomov 
3548e1fddc8fSIlya Dryomov 	if (need_exclusive_lock(img_req)) {
3549e1fddc8fSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3550e1fddc8fSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3551e1fddc8fSIlya Dryomov 		done = rbd_img_advance(img_req, result);
3552e1fddc8fSIlya Dryomov 		if (done)
3553e1fddc8fSIlya Dryomov 			rbd_lock_del_request(img_req);
3554e1fddc8fSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3555e1fddc8fSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3556e1fddc8fSIlya Dryomov 	} else {
35570192ce2eSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
35580192ce2eSIlya Dryomov 		done = rbd_img_advance(img_req, result);
35590192ce2eSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3560e1fddc8fSIlya Dryomov 	}
35610192ce2eSIlya Dryomov 
35620192ce2eSIlya Dryomov 	if (done && *result) {
35630192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
35640192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s%s result %d",
35650192ce2eSIlya Dryomov 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
35660192ce2eSIlya Dryomov 		      obj_op_name(img_req->op_type), *result);
35670192ce2eSIlya Dryomov 	}
35680192ce2eSIlya Dryomov 	return done;
35690192ce2eSIlya Dryomov }
35700192ce2eSIlya Dryomov 
35710192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
35720192ce2eSIlya Dryomov {
35730192ce2eSIlya Dryomov again:
35740192ce2eSIlya Dryomov 	if (!__rbd_img_handle_request(img_req, &result))
35750192ce2eSIlya Dryomov 		return;
35760192ce2eSIlya Dryomov 
35770192ce2eSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
35780192ce2eSIlya Dryomov 		struct rbd_obj_request *obj_req = img_req->obj_request;
35790192ce2eSIlya Dryomov 
3580679a97d2SHannes Reinecke 		rbd_img_request_destroy(img_req);
35810192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
35820192ce2eSIlya Dryomov 			img_req = obj_req->img_request;
35830192ce2eSIlya Dryomov 			goto again;
35840192ce2eSIlya Dryomov 		}
35850192ce2eSIlya Dryomov 	} else {
358659e542c8SIlya Dryomov 		struct request *rq = blk_mq_rq_from_pdu(img_req);
35870192ce2eSIlya Dryomov 
3588679a97d2SHannes Reinecke 		rbd_img_request_destroy(img_req);
35890192ce2eSIlya Dryomov 		blk_mq_end_request(rq, errno_to_blk_status(result));
35900192ce2eSIlya Dryomov 	}
35919969ebc5SAlex Elder }
35929969ebc5SAlex Elder 
3593ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3594ed95b21aSIlya Dryomov 
3595ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3596ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3597ed95b21aSIlya Dryomov {
3598ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3599ed95b21aSIlya Dryomov }
3600ed95b21aSIlya Dryomov 
3601ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3602ed95b21aSIlya Dryomov {
3603ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3604ed95b21aSIlya Dryomov 
3605ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3606ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3607ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3608ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3609ed95b21aSIlya Dryomov 	return cid;
3610ed95b21aSIlya Dryomov }
3611ed95b21aSIlya Dryomov 
3612ed95b21aSIlya Dryomov /*
3613ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3614ed95b21aSIlya Dryomov  */
3615ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3616ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3617ed95b21aSIlya Dryomov {
3618ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3619ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3620ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3621ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3622ed95b21aSIlya Dryomov }
3623ed95b21aSIlya Dryomov 
3624ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3625ed95b21aSIlya Dryomov {
3626ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3627ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3628ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3629ed95b21aSIlya Dryomov }
3630ed95b21aSIlya Dryomov 
3631edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3632edd8ca80SFlorian Margaine {
3633edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3634edd8ca80SFlorian Margaine 
3635a2b1da09SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3636edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
3637edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
3638edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3639edd8ca80SFlorian Margaine }
3640edd8ca80SFlorian Margaine 
3641ed95b21aSIlya Dryomov /*
3642ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3643ed95b21aSIlya Dryomov  */
3644ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3645ed95b21aSIlya Dryomov {
3646ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3647ed95b21aSIlya Dryomov 	char cookie[32];
3648ed95b21aSIlya Dryomov 	int ret;
3649ed95b21aSIlya Dryomov 
3650cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3651cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
3652ed95b21aSIlya Dryomov 
3653ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3654ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3655ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3656ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3657ed95b21aSIlya Dryomov 	if (ret)
3658ed95b21aSIlya Dryomov 		return ret;
3659ed95b21aSIlya Dryomov 
3660edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
3661ed95b21aSIlya Dryomov 	return 0;
3662ed95b21aSIlya Dryomov }
3663ed95b21aSIlya Dryomov 
3664ed95b21aSIlya Dryomov /*
3665ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3666ed95b21aSIlya Dryomov  */
3667bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
3668ed95b21aSIlya Dryomov {
3669ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3670ed95b21aSIlya Dryomov 	int ret;
3671ed95b21aSIlya Dryomov 
3672cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3673cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
3674ed95b21aSIlya Dryomov 
3675ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3676cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3677bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
3678637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3679ed95b21aSIlya Dryomov 
3680bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
3681bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3682cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
3683ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3684ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3685ed95b21aSIlya Dryomov }
3686ed95b21aSIlya Dryomov 
3687ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3688ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3689ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3690ed95b21aSIlya Dryomov 				size_t *preply_len)
3691ed95b21aSIlya Dryomov {
3692ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3693ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
369408a79102SKyle Spiers 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
369508a79102SKyle Spiers 	int buf_size = sizeof(buf);
3696ed95b21aSIlya Dryomov 	void *p = buf;
3697ed95b21aSIlya Dryomov 
3698ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3699ed95b21aSIlya Dryomov 
3700ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3701ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3702ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3703ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3704ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3705ed95b21aSIlya Dryomov 
3706ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3707ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3708ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3709ed95b21aSIlya Dryomov }
3710ed95b21aSIlya Dryomov 
3711ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3712ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3713ed95b21aSIlya Dryomov {
37148ae0299aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3715ed95b21aSIlya Dryomov }
3716ed95b21aSIlya Dryomov 
3717ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3718ed95b21aSIlya Dryomov {
3719ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3720ed95b21aSIlya Dryomov 						  acquired_lock_work);
3721ed95b21aSIlya Dryomov 
3722ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3723ed95b21aSIlya Dryomov }
3724ed95b21aSIlya Dryomov 
3725ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3726ed95b21aSIlya Dryomov {
3727ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3728ed95b21aSIlya Dryomov 						  released_lock_work);
3729ed95b21aSIlya Dryomov 
3730ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3731ed95b21aSIlya Dryomov }
3732ed95b21aSIlya Dryomov 
3733ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3734ed95b21aSIlya Dryomov {
3735ed95b21aSIlya Dryomov 	struct page **reply_pages;
3736ed95b21aSIlya Dryomov 	size_t reply_len;
3737ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3738ed95b21aSIlya Dryomov 	int ret;
3739ed95b21aSIlya Dryomov 
3740ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3741ed95b21aSIlya Dryomov 
3742ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3743ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3744ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3745ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3746ed95b21aSIlya Dryomov 		goto out;
3747ed95b21aSIlya Dryomov 	}
3748ed95b21aSIlya Dryomov 
3749ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3750ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3751ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3752ed95b21aSIlya Dryomov 		u32 n;
3753ed95b21aSIlya Dryomov 
3754ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3755ed95b21aSIlya Dryomov 		while (n--) {
3756ed95b21aSIlya Dryomov 			u8 struct_v;
3757ed95b21aSIlya Dryomov 			u32 len;
3758ed95b21aSIlya Dryomov 
3759ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3760ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3761ed95b21aSIlya Dryomov 
3762ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3763ed95b21aSIlya Dryomov 			if (!len)
3764ed95b21aSIlya Dryomov 				continue;
3765ed95b21aSIlya Dryomov 
3766ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3767ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3768ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3769ed95b21aSIlya Dryomov 				ret = -EIO;
3770ed95b21aSIlya Dryomov 				goto out;
3771ed95b21aSIlya Dryomov 			}
3772ed95b21aSIlya Dryomov 
3773ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3774ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3775ed95b21aSIlya Dryomov 						  &struct_v, &len);
3776ed95b21aSIlya Dryomov 			if (ret) {
3777ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3778ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3779ed95b21aSIlya Dryomov 					 ret);
3780ed95b21aSIlya Dryomov 				goto e_inval;
3781ed95b21aSIlya Dryomov 			}
3782ed95b21aSIlya Dryomov 
3783ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3784ed95b21aSIlya Dryomov 		}
3785ed95b21aSIlya Dryomov 	}
3786ed95b21aSIlya Dryomov 
3787ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3788ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3789ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3790ed95b21aSIlya Dryomov 	}
3791ed95b21aSIlya Dryomov 
3792ed95b21aSIlya Dryomov out:
3793ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3794ed95b21aSIlya Dryomov 	return ret;
3795ed95b21aSIlya Dryomov 
3796ed95b21aSIlya Dryomov e_inval:
3797ed95b21aSIlya Dryomov 	ret = -EINVAL;
3798ed95b21aSIlya Dryomov 	goto out;
3799ed95b21aSIlya Dryomov }
3800ed95b21aSIlya Dryomov 
3801637cd060SIlya Dryomov /*
3802637cd060SIlya Dryomov  * Either image request state machine(s) or rbd_add_acquire_lock()
3803637cd060SIlya Dryomov  * (i.e. "rbd map").
3804637cd060SIlya Dryomov  */
3805637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3806ed95b21aSIlya Dryomov {
3807637cd060SIlya Dryomov 	struct rbd_img_request *img_req;
3808637cd060SIlya Dryomov 
3809637cd060SIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3810d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3811ed95b21aSIlya Dryomov 
3812ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3813637cd060SIlya Dryomov 	if (!completion_done(&rbd_dev->acquire_wait)) {
3814637cd060SIlya Dryomov 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3815637cd060SIlya Dryomov 			   list_empty(&rbd_dev->running_list));
3816637cd060SIlya Dryomov 		rbd_dev->acquire_err = result;
3817637cd060SIlya Dryomov 		complete_all(&rbd_dev->acquire_wait);
3818637cd060SIlya Dryomov 		return;
3819637cd060SIlya Dryomov 	}
3820637cd060SIlya Dryomov 
3821637cd060SIlya Dryomov 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3822637cd060SIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3823637cd060SIlya Dryomov 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3824637cd060SIlya Dryomov 		rbd_img_schedule(img_req, result);
3825637cd060SIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3826637cd060SIlya Dryomov 	}
3827637cd060SIlya Dryomov 
3828637cd060SIlya Dryomov 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3829ed95b21aSIlya Dryomov }
3830ed95b21aSIlya Dryomov 
3831ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3832ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3833ed95b21aSIlya Dryomov {
3834ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3835ed95b21aSIlya Dryomov 	u8 lock_type;
3836ed95b21aSIlya Dryomov 	char *lock_tag;
3837ed95b21aSIlya Dryomov 	int ret;
3838ed95b21aSIlya Dryomov 
3839ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3840ed95b21aSIlya Dryomov 
3841ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3842ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3843ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3844ed95b21aSIlya Dryomov 	if (ret)
3845ed95b21aSIlya Dryomov 		return ret;
3846ed95b21aSIlya Dryomov 
3847ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3848ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3849ed95b21aSIlya Dryomov 		goto out;
3850ed95b21aSIlya Dryomov 	}
3851ed95b21aSIlya Dryomov 
3852ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3853ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3854ed95b21aSIlya Dryomov 			 lock_tag);
3855ed95b21aSIlya Dryomov 		ret = -EBUSY;
3856ed95b21aSIlya Dryomov 		goto out;
3857ed95b21aSIlya Dryomov 	}
3858ed95b21aSIlya Dryomov 
3859ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3860ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3861ed95b21aSIlya Dryomov 		ret = -EBUSY;
3862ed95b21aSIlya Dryomov 		goto out;
3863ed95b21aSIlya Dryomov 	}
3864ed95b21aSIlya Dryomov 
3865ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3866ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3867ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3868ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3869ed95b21aSIlya Dryomov 		ret = -EBUSY;
3870ed95b21aSIlya Dryomov 		goto out;
3871ed95b21aSIlya Dryomov 	}
3872ed95b21aSIlya Dryomov 
3873ed95b21aSIlya Dryomov out:
3874ed95b21aSIlya Dryomov 	kfree(lock_tag);
3875ed95b21aSIlya Dryomov 	return ret;
3876ed95b21aSIlya Dryomov }
3877ed95b21aSIlya Dryomov 
3878ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3879ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3880ed95b21aSIlya Dryomov {
3881ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3882ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3883ed95b21aSIlya Dryomov 	u32 num_watchers;
3884ed95b21aSIlya Dryomov 	u64 cookie;
3885ed95b21aSIlya Dryomov 	int i;
3886ed95b21aSIlya Dryomov 	int ret;
3887ed95b21aSIlya Dryomov 
3888ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3889ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3890ed95b21aSIlya Dryomov 				      &num_watchers);
3891ed95b21aSIlya Dryomov 	if (ret)
3892ed95b21aSIlya Dryomov 		return ret;
3893ed95b21aSIlya Dryomov 
3894ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3895ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3896313771e8SIlya Dryomov 		/*
3897313771e8SIlya Dryomov 		 * Ignore addr->type while comparing.  This mimics
3898313771e8SIlya Dryomov 		 * entity_addr_t::get_legacy_str() + strcmp().
3899313771e8SIlya Dryomov 		 */
3900313771e8SIlya Dryomov 		if (ceph_addr_equal_no_type(&watchers[i].addr,
3901313771e8SIlya Dryomov 					    &locker->info.addr) &&
3902ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3903ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3904ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3905ed95b21aSIlya Dryomov 				.handle = cookie,
3906ed95b21aSIlya Dryomov 			};
3907ed95b21aSIlya Dryomov 
3908ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3909ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3910ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3911ed95b21aSIlya Dryomov 			ret = 1;
3912ed95b21aSIlya Dryomov 			goto out;
3913ed95b21aSIlya Dryomov 		}
3914ed95b21aSIlya Dryomov 	}
3915ed95b21aSIlya Dryomov 
3916ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3917ed95b21aSIlya Dryomov 	ret = 0;
3918ed95b21aSIlya Dryomov out:
3919ed95b21aSIlya Dryomov 	kfree(watchers);
3920ed95b21aSIlya Dryomov 	return ret;
3921ed95b21aSIlya Dryomov }
3922ed95b21aSIlya Dryomov 
3923ed95b21aSIlya Dryomov /*
3924ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3925ed95b21aSIlya Dryomov  */
3926ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3927ed95b21aSIlya Dryomov {
3928ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3929ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3930ed95b21aSIlya Dryomov 	u32 num_lockers;
3931ed95b21aSIlya Dryomov 	int ret;
3932ed95b21aSIlya Dryomov 
3933ed95b21aSIlya Dryomov 	for (;;) {
3934ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3935ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3936ed95b21aSIlya Dryomov 			return ret;
3937ed95b21aSIlya Dryomov 
3938ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3939ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3940ed95b21aSIlya Dryomov 		if (ret)
3941ed95b21aSIlya Dryomov 			return ret;
3942ed95b21aSIlya Dryomov 
3943ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3944ed95b21aSIlya Dryomov 			goto again;
3945ed95b21aSIlya Dryomov 
3946ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3947637cd060SIlya Dryomov 		if (ret)
3948637cd060SIlya Dryomov 			goto out; /* request lock or error */
3949ed95b21aSIlya Dryomov 
395022e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
3951ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3952ed95b21aSIlya Dryomov 
39530b98acd6SIlya Dryomov 		ret = ceph_monc_blocklist_add(&client->monc,
3954ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3955ed95b21aSIlya Dryomov 		if (ret) {
39560b98acd6SIlya Dryomov 			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
3957ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3958ed95b21aSIlya Dryomov 			goto out;
3959ed95b21aSIlya Dryomov 		}
3960ed95b21aSIlya Dryomov 
3961ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3962ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3963ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3964ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3965ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3966ed95b21aSIlya Dryomov 			goto out;
3967ed95b21aSIlya Dryomov 
3968ed95b21aSIlya Dryomov again:
3969ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3970ed95b21aSIlya Dryomov 	}
3971ed95b21aSIlya Dryomov 
3972ed95b21aSIlya Dryomov out:
3973ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3974ed95b21aSIlya Dryomov 	return ret;
3975ed95b21aSIlya Dryomov }
3976ed95b21aSIlya Dryomov 
397722e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
3978ed95b21aSIlya Dryomov {
397922e8bd51SIlya Dryomov 	int ret;
398022e8bd51SIlya Dryomov 
398122e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
398222e8bd51SIlya Dryomov 		ret = rbd_object_map_open(rbd_dev);
398322e8bd51SIlya Dryomov 		if (ret)
398422e8bd51SIlya Dryomov 			return ret;
398522e8bd51SIlya Dryomov 	}
398622e8bd51SIlya Dryomov 
398722e8bd51SIlya Dryomov 	return 0;
398822e8bd51SIlya Dryomov }
398922e8bd51SIlya Dryomov 
3990ed95b21aSIlya Dryomov /*
3991637cd060SIlya Dryomov  * Return:
3992637cd060SIlya Dryomov  *   0 - lock acquired
3993637cd060SIlya Dryomov  *   1 - caller should call rbd_request_lock()
3994637cd060SIlya Dryomov  *  <0 - error
3995ed95b21aSIlya Dryomov  */
3996637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
3997ed95b21aSIlya Dryomov {
3998637cd060SIlya Dryomov 	int ret;
3999ed95b21aSIlya Dryomov 
4000ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
4001ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4002ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4003ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4004ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4005637cd060SIlya Dryomov 		return 0;
4006ed95b21aSIlya Dryomov 	}
4007ed95b21aSIlya Dryomov 
4008ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4009ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4010ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4011ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4012637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4013637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4014637cd060SIlya Dryomov 		return 0;
4015ed95b21aSIlya Dryomov 	}
4016ed95b21aSIlya Dryomov 
4017637cd060SIlya Dryomov 	ret = rbd_try_lock(rbd_dev);
4018637cd060SIlya Dryomov 	if (ret < 0) {
4019637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
40200b98acd6SIlya Dryomov 		if (ret == -EBLOCKLISTED)
4021637cd060SIlya Dryomov 			goto out;
4022637cd060SIlya Dryomov 
4023637cd060SIlya Dryomov 		ret = 1; /* request lock anyway */
4024637cd060SIlya Dryomov 	}
4025637cd060SIlya Dryomov 	if (ret > 0) {
4026ed95b21aSIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4027637cd060SIlya Dryomov 		return ret;
4028637cd060SIlya Dryomov 	}
4029637cd060SIlya Dryomov 
4030637cd060SIlya Dryomov 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4031637cd060SIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4032637cd060SIlya Dryomov 
403322e8bd51SIlya Dryomov 	ret = rbd_post_acquire_action(rbd_dev);
403422e8bd51SIlya Dryomov 	if (ret) {
403522e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
403622e8bd51SIlya Dryomov 		/*
403722e8bd51SIlya Dryomov 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
403822e8bd51SIlya Dryomov 		 * rbd_lock_add_request() would let the request through,
403922e8bd51SIlya Dryomov 		 * assuming that e.g. object map is locked and loaded.
404022e8bd51SIlya Dryomov 		 */
404122e8bd51SIlya Dryomov 		rbd_unlock(rbd_dev);
404222e8bd51SIlya Dryomov 	}
404322e8bd51SIlya Dryomov 
4044637cd060SIlya Dryomov out:
4045637cd060SIlya Dryomov 	wake_lock_waiters(rbd_dev, ret);
4046637cd060SIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4047637cd060SIlya Dryomov 	return ret;
4048ed95b21aSIlya Dryomov }
4049ed95b21aSIlya Dryomov 
4050ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
4051ed95b21aSIlya Dryomov {
4052ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4053ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
4054637cd060SIlya Dryomov 	int ret;
4055ed95b21aSIlya Dryomov 
4056ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4057ed95b21aSIlya Dryomov again:
4058637cd060SIlya Dryomov 	ret = rbd_try_acquire_lock(rbd_dev);
4059637cd060SIlya Dryomov 	if (ret <= 0) {
4060637cd060SIlya Dryomov 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4061ed95b21aSIlya Dryomov 		return;
4062ed95b21aSIlya Dryomov 	}
4063ed95b21aSIlya Dryomov 
4064ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
4065ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
4066ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
4067e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
4068e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
4069637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4070637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4071637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4072ed95b21aSIlya Dryomov 	} else if (ret < 0) {
4073ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4074ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4075ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
4076ed95b21aSIlya Dryomov 	} else {
4077ed95b21aSIlya Dryomov 		/*
4078ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
4079ed95b21aSIlya Dryomov 		 * release the lock
4080ed95b21aSIlya Dryomov 		 */
40816b0a8774SColin Ian King 		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4082ed95b21aSIlya Dryomov 		     rbd_dev);
4083ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4084ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4085ed95b21aSIlya Dryomov 	}
4086ed95b21aSIlya Dryomov }
4087ed95b21aSIlya Dryomov 
4088a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4089ed95b21aSIlya Dryomov {
4090a2b1da09SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4091d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4092a2b1da09SIlya Dryomov 
4093ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4094ed95b21aSIlya Dryomov 		return false;
4095ed95b21aSIlya Dryomov 
4096ed95b21aSIlya Dryomov 	/*
4097ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
4098ed95b21aSIlya Dryomov 	 */
4099e1fddc8fSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4100e1fddc8fSIlya Dryomov 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4101ed9eb710SIlya Dryomov 	if (list_empty(&rbd_dev->running_list))
4102ed9eb710SIlya Dryomov 		return true;
4103ed9eb710SIlya Dryomov 
4104ed9eb710SIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4105e1fddc8fSIlya Dryomov 	wait_for_completion(&rbd_dev->releasing_wait);
4106ed95b21aSIlya Dryomov 
4107ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4108ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4109ed95b21aSIlya Dryomov 		return false;
4110ed95b21aSIlya Dryomov 
4111e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4112a2b1da09SIlya Dryomov 	return true;
4113a2b1da09SIlya Dryomov }
4114a2b1da09SIlya Dryomov 
411522e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev)
411622e8bd51SIlya Dryomov {
411722e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
411822e8bd51SIlya Dryomov 		rbd_object_map_close(rbd_dev);
411922e8bd51SIlya Dryomov }
412022e8bd51SIlya Dryomov 
4121e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev)
4122e1fddc8fSIlya Dryomov {
4123e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4124e1fddc8fSIlya Dryomov 
412522e8bd51SIlya Dryomov 	rbd_pre_release_action(rbd_dev);
4126bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
4127e1fddc8fSIlya Dryomov }
4128e1fddc8fSIlya Dryomov 
4129a2b1da09SIlya Dryomov /*
4130a2b1da09SIlya Dryomov  * lock_rwsem must be held for write
4131a2b1da09SIlya Dryomov  */
4132a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev)
4133a2b1da09SIlya Dryomov {
4134a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4135a2b1da09SIlya Dryomov 		return;
4136a2b1da09SIlya Dryomov 
4137e1fddc8fSIlya Dryomov 	__rbd_release_lock(rbd_dev);
4138a2b1da09SIlya Dryomov 
4139ed95b21aSIlya Dryomov 	/*
4140ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
4141637cd060SIlya Dryomov 	 * almost immediately if we got new IO while draining the running
4142637cd060SIlya Dryomov 	 * list otherwise.  We need to ack our own notifications, so this
4143637cd060SIlya Dryomov 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4144637cd060SIlya Dryomov 	 * way of maybe_kick_acquire().
4145ed95b21aSIlya Dryomov 	 */
4146ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
4147ed95b21aSIlya Dryomov }
4148ed95b21aSIlya Dryomov 
4149ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
4150ed95b21aSIlya Dryomov {
4151ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4152ed95b21aSIlya Dryomov 						  unlock_work);
4153ed95b21aSIlya Dryomov 
4154ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4155ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
4156ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4157ed95b21aSIlya Dryomov }
4158ed95b21aSIlya Dryomov 
4159637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4160637cd060SIlya Dryomov {
4161637cd060SIlya Dryomov 	bool have_requests;
4162637cd060SIlya Dryomov 
4163637cd060SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4164637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
4165637cd060SIlya Dryomov 		return;
4166637cd060SIlya Dryomov 
4167637cd060SIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
4168637cd060SIlya Dryomov 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4169637cd060SIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
4170637cd060SIlya Dryomov 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4171637cd060SIlya Dryomov 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4172637cd060SIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4173637cd060SIlya Dryomov 	}
4174637cd060SIlya Dryomov }
4175637cd060SIlya Dryomov 
4176ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4177ed95b21aSIlya Dryomov 				     void **p)
4178ed95b21aSIlya Dryomov {
4179ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
4180ed95b21aSIlya Dryomov 
4181ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4182ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4183ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4184ed95b21aSIlya Dryomov 	}
4185ed95b21aSIlya Dryomov 
4186ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4187ed95b21aSIlya Dryomov 	     cid.handle);
4188ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4189ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4190ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
41918798d070SIlya Dryomov 			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
41928798d070SIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle);
41938798d070SIlya Dryomov 		} else {
4194ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
41958798d070SIlya Dryomov 		}
4196ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
4197ed95b21aSIlya Dryomov 	} else {
4198ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4199ed95b21aSIlya Dryomov 	}
4200ed95b21aSIlya Dryomov 
4201637cd060SIlya Dryomov 	maybe_kick_acquire(rbd_dev);
4202ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4203ed95b21aSIlya Dryomov }
4204ed95b21aSIlya Dryomov 
4205ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4206ed95b21aSIlya Dryomov 				     void **p)
4207ed95b21aSIlya Dryomov {
4208ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
4209ed95b21aSIlya Dryomov 
4210ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4211ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4212ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4213ed95b21aSIlya Dryomov 	}
4214ed95b21aSIlya Dryomov 
4215ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4216ed95b21aSIlya Dryomov 	     cid.handle);
4217ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4218ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4219ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
42208798d070SIlya Dryomov 			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
4221ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
4222ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
42238798d070SIlya Dryomov 		} else {
4224ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
42258798d070SIlya Dryomov 		}
4226ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
4227ed95b21aSIlya Dryomov 	} else {
4228ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4229ed95b21aSIlya Dryomov 	}
4230ed95b21aSIlya Dryomov 
4231637cd060SIlya Dryomov 	maybe_kick_acquire(rbd_dev);
4232ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4233ed95b21aSIlya Dryomov }
4234ed95b21aSIlya Dryomov 
42353b77faa0SIlya Dryomov /*
42363b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
42373b77faa0SIlya Dryomov  * ResponseMessage is needed.
42383b77faa0SIlya Dryomov  */
42393b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4240ed95b21aSIlya Dryomov 				   void **p)
4241ed95b21aSIlya Dryomov {
4242ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4243ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
42443b77faa0SIlya Dryomov 	int result = 1;
4245ed95b21aSIlya Dryomov 
4246ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4247ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4248ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4249ed95b21aSIlya Dryomov 	}
4250ed95b21aSIlya Dryomov 
4251ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4252ed95b21aSIlya Dryomov 	     cid.handle);
4253ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
42543b77faa0SIlya Dryomov 		return result;
4255ed95b21aSIlya Dryomov 
4256ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
42573b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
42583b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
42593b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
42603b77faa0SIlya Dryomov 			goto out_unlock;
42613b77faa0SIlya Dryomov 
42623b77faa0SIlya Dryomov 		/*
42633b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
42643b77faa0SIlya Dryomov 		 * a missing owner
42653b77faa0SIlya Dryomov 		 */
42663b77faa0SIlya Dryomov 		result = 0;
42673b77faa0SIlya Dryomov 
4268ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4269e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
4270e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
4271e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
4272e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
4273e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
4274e010dd0aSIlya Dryomov 			} else {
4275e010dd0aSIlya Dryomov 				/* refuse to release the lock */
4276e010dd0aSIlya Dryomov 				result = -EROFS;
4277ed95b21aSIlya Dryomov 			}
4278ed95b21aSIlya Dryomov 		}
4279ed95b21aSIlya Dryomov 	}
42803b77faa0SIlya Dryomov 
42813b77faa0SIlya Dryomov out_unlock:
4282ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
42833b77faa0SIlya Dryomov 	return result;
4284ed95b21aSIlya Dryomov }
4285ed95b21aSIlya Dryomov 
4286ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4287ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
4288ed95b21aSIlya Dryomov {
4289ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
429008a79102SKyle Spiers 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
429108a79102SKyle Spiers 	int buf_size = sizeof(buf);
4292ed95b21aSIlya Dryomov 	int ret;
4293ed95b21aSIlya Dryomov 
4294ed95b21aSIlya Dryomov 	if (result) {
4295ed95b21aSIlya Dryomov 		void *p = buf;
4296ed95b21aSIlya Dryomov 
4297ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
4298ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
4299ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4300ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
4301ed95b21aSIlya Dryomov 	} else {
4302ed95b21aSIlya Dryomov 		buf_size = 0;
4303ed95b21aSIlya Dryomov 	}
4304ed95b21aSIlya Dryomov 
4305ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4306ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
4307ed95b21aSIlya Dryomov 				   buf, buf_size);
4308ed95b21aSIlya Dryomov 	if (ret)
4309ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4310ed95b21aSIlya Dryomov }
4311ed95b21aSIlya Dryomov 
4312ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4313ed95b21aSIlya Dryomov 				   u64 cookie)
4314ed95b21aSIlya Dryomov {
4315ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4316ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4317ed95b21aSIlya Dryomov }
4318ed95b21aSIlya Dryomov 
4319ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4320ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
4321ed95b21aSIlya Dryomov {
4322ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4323ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4324ed95b21aSIlya Dryomov }
4325922dab61SIlya Dryomov 
4326922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4327922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
4328bf0d5f50SAlex Elder {
4329922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
4330ed95b21aSIlya Dryomov 	void *p = data;
4331ed95b21aSIlya Dryomov 	void *const end = p + data_len;
4332d4c2269bSIlya Dryomov 	u8 struct_v = 0;
4333ed95b21aSIlya Dryomov 	u32 len;
4334ed95b21aSIlya Dryomov 	u32 notify_op;
4335bf0d5f50SAlex Elder 	int ret;
4336bf0d5f50SAlex Elder 
4337ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4338ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
4339ed95b21aSIlya Dryomov 	if (data_len) {
4340ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4341ed95b21aSIlya Dryomov 					  &struct_v, &len);
4342ed95b21aSIlya Dryomov 		if (ret) {
4343ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4344ed95b21aSIlya Dryomov 				 ret);
4345ed95b21aSIlya Dryomov 			return;
4346ed95b21aSIlya Dryomov 		}
434752bb1f9bSIlya Dryomov 
4348ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
4349ed95b21aSIlya Dryomov 	} else {
4350ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
4351ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4352ed95b21aSIlya Dryomov 		len = 0;
4353ed95b21aSIlya Dryomov 	}
4354ed95b21aSIlya Dryomov 
4355ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4356ed95b21aSIlya Dryomov 	switch (notify_op) {
4357ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4358ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4359ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4360ed95b21aSIlya Dryomov 		break;
4361ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4362ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4363ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4364ed95b21aSIlya Dryomov 		break;
4365ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
43663b77faa0SIlya Dryomov 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
43673b77faa0SIlya Dryomov 		if (ret <= 0)
4368ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
43693b77faa0SIlya Dryomov 						      cookie, ret);
4370ed95b21aSIlya Dryomov 		else
4371ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4372ed95b21aSIlya Dryomov 		break;
4373ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4374e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
4375e627db08SAlex Elder 		if (ret)
43769584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4377bf0d5f50SAlex Elder 
4378ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4379ed95b21aSIlya Dryomov 		break;
4380ed95b21aSIlya Dryomov 	default:
4381ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
4382ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4383ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
4384ed95b21aSIlya Dryomov 		else
4385ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4386ed95b21aSIlya Dryomov 		break;
43879969ebc5SAlex Elder 	}
43889969ebc5SAlex Elder }
43899969ebc5SAlex Elder 
439099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
43919969ebc5SAlex Elder 
4392922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4393bb040aa0SIlya Dryomov {
4394922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
4395bb040aa0SIlya Dryomov 
4396922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4397bb040aa0SIlya Dryomov 
4398ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4399ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4400ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4401bb040aa0SIlya Dryomov 
440299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
440399d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
440499d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
440599d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4406bb040aa0SIlya Dryomov 
440799d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4408bb040aa0SIlya Dryomov 	}
440999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
4410bb040aa0SIlya Dryomov }
4411bb040aa0SIlya Dryomov 
4412bb040aa0SIlya Dryomov /*
441399d16943SIlya Dryomov  * watch_mutex must be locked
44149969ebc5SAlex Elder  */
441599d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
44169969ebc5SAlex Elder {
44179969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4418922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
44199969ebc5SAlex Elder 
4420922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
442199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
44229969ebc5SAlex Elder 
4423922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4424922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
4425922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
4426922dab61SIlya Dryomov 	if (IS_ERR(handle))
4427922dab61SIlya Dryomov 		return PTR_ERR(handle);
44289969ebc5SAlex Elder 
4429922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
44308eb87565SAlex Elder 	return 0;
44319969ebc5SAlex Elder }
44329969ebc5SAlex Elder 
443399d16943SIlya Dryomov /*
443499d16943SIlya Dryomov  * watch_mutex must be locked
443599d16943SIlya Dryomov  */
443699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4437fca27065SIlya Dryomov {
4438922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4439922dab61SIlya Dryomov 	int ret;
4440b30a01f2SIlya Dryomov 
444199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
444299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4443b30a01f2SIlya Dryomov 
4444922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4445922dab61SIlya Dryomov 	if (ret)
4446922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4447b30a01f2SIlya Dryomov 
4448922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
4449c525f036SIlya Dryomov }
4450c525f036SIlya Dryomov 
445199d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
4452c525f036SIlya Dryomov {
445399d16943SIlya Dryomov 	int ret;
4454811c6688SIlya Dryomov 
445599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
445699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
445799d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
445899d16943SIlya Dryomov 	if (ret)
445999d16943SIlya Dryomov 		goto out;
446099d16943SIlya Dryomov 
446199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
446299d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
446399d16943SIlya Dryomov 
446499d16943SIlya Dryomov out:
446599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
446699d16943SIlya Dryomov 	return ret;
446799d16943SIlya Dryomov }
446899d16943SIlya Dryomov 
446999d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
447099d16943SIlya Dryomov {
447199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
447299d16943SIlya Dryomov 
4473ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4474ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
4475ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4476ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
447799d16943SIlya Dryomov }
447899d16943SIlya Dryomov 
44790e4e1de5SIlya Dryomov /*
44800e4e1de5SIlya Dryomov  * header_rwsem must not be held to avoid a deadlock with
44810e4e1de5SIlya Dryomov  * rbd_dev_refresh() when flushing notifies.
44820e4e1de5SIlya Dryomov  */
448399d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
448499d16943SIlya Dryomov {
448599d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
448699d16943SIlya Dryomov 
448799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
448899d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
448999d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
449099d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
449199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
449299d16943SIlya Dryomov 
449323edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4494811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4495fca27065SIlya Dryomov }
4496fca27065SIlya Dryomov 
449714bb211dSIlya Dryomov /*
449814bb211dSIlya Dryomov  * lock_rwsem must be held for write
449914bb211dSIlya Dryomov  */
450014bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
450114bb211dSIlya Dryomov {
450214bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
450314bb211dSIlya Dryomov 	char cookie[32];
450414bb211dSIlya Dryomov 	int ret;
450514bb211dSIlya Dryomov 
4506a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4507a2b1da09SIlya Dryomov 		return;
450814bb211dSIlya Dryomov 
450914bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
451014bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
451114bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
451214bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
451314bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
451414bb211dSIlya Dryomov 	if (ret) {
451514bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
451614bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
451714bb211dSIlya Dryomov 				 ret);
451814bb211dSIlya Dryomov 
451914bb211dSIlya Dryomov 		/*
452014bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
452114bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
452214bb211dSIlya Dryomov 		 */
4523e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
4524a2b1da09SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
452514bb211dSIlya Dryomov 	} else {
4526edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
4527637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, 0);
452814bb211dSIlya Dryomov 	}
452914bb211dSIlya Dryomov }
453014bb211dSIlya Dryomov 
453199d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
453299d16943SIlya Dryomov {
453399d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
453499d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
453599d16943SIlya Dryomov 	int ret;
453699d16943SIlya Dryomov 
453799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
453899d16943SIlya Dryomov 
453999d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
454087c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
454187c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
454214bb211dSIlya Dryomov 		return;
454387c0fdedSIlya Dryomov 	}
454499d16943SIlya Dryomov 
454599d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
454699d16943SIlya Dryomov 	if (ret) {
454799d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
45480b98acd6SIlya Dryomov 		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
454999d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
455099d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
455199d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
455287c0fdedSIlya Dryomov 			mutex_unlock(&rbd_dev->watch_mutex);
455314bb211dSIlya Dryomov 			return;
455499d16943SIlya Dryomov 		}
455599d16943SIlya Dryomov 
4556637cd060SIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
4557637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4558637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4559637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4560637cd060SIlya Dryomov 		return;
4561637cd060SIlya Dryomov 	}
4562637cd060SIlya Dryomov 
456399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
456499d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
456599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
456699d16943SIlya Dryomov 
456714bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
456814bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
456914bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
457014bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
457114bb211dSIlya Dryomov 
457299d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
457399d16943SIlya Dryomov 	if (ret)
4574f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
457599d16943SIlya Dryomov }
457699d16943SIlya Dryomov 
457736be9a76SAlex Elder /*
4578f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
4579f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
458036be9a76SAlex Elder  */
458136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4582ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
4583ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
458436be9a76SAlex Elder 			     const char *method_name,
45854157976bSAlex Elder 			     const void *outbound,
458636be9a76SAlex Elder 			     size_t outbound_size,
45874157976bSAlex Elder 			     void *inbound,
4588e2a58ee5SAlex Elder 			     size_t inbound_size)
458936be9a76SAlex Elder {
4590ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4591ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
4592ecd4a68aSIlya Dryomov 	struct page *reply_page;
459336be9a76SAlex Elder 	int ret;
459436be9a76SAlex Elder 
459536be9a76SAlex Elder 	/*
45966010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
45976010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
45986010a451SAlex Elder 	 * also supply outbound data--parameters for the object
45996010a451SAlex Elder 	 * method.  Currently if this is present it will be a
46006010a451SAlex Elder 	 * snapshot id.
460136be9a76SAlex Elder 	 */
4602ecd4a68aSIlya Dryomov 	if (outbound) {
4603ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
4604ecd4a68aSIlya Dryomov 			return -E2BIG;
460536be9a76SAlex Elder 
4606ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
4607ecd4a68aSIlya Dryomov 		if (!req_page)
4608ecd4a68aSIlya Dryomov 			return -ENOMEM;
460936be9a76SAlex Elder 
4610ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
461104017e29SAlex Elder 	}
4612430c28c3SAlex Elder 
4613ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4614ecd4a68aSIlya Dryomov 	if (!reply_page) {
4615ecd4a68aSIlya Dryomov 		if (req_page)
4616ecd4a68aSIlya Dryomov 			__free_page(req_page);
4617ecd4a68aSIlya Dryomov 		return -ENOMEM;
4618ecd4a68aSIlya Dryomov 	}
461936be9a76SAlex Elder 
4620ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4621ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
462268ada915SIlya Dryomov 			     &reply_page, &inbound_size);
4623ecd4a68aSIlya Dryomov 	if (!ret) {
4624ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
4625ecd4a68aSIlya Dryomov 		ret = inbound_size;
4626ecd4a68aSIlya Dryomov 	}
462757385b51SAlex Elder 
4628ecd4a68aSIlya Dryomov 	if (req_page)
4629ecd4a68aSIlya Dryomov 		__free_page(req_page);
4630ecd4a68aSIlya Dryomov 	__free_page(reply_page);
463136be9a76SAlex Elder 	return ret;
463236be9a76SAlex Elder }
463336be9a76SAlex Elder 
46347ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4635bc1ecc65SIlya Dryomov {
463659e542c8SIlya Dryomov 	struct rbd_img_request *img_request =
463759e542c8SIlya Dryomov 	    container_of(work, struct rbd_img_request, work);
463859e542c8SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
463959e542c8SIlya Dryomov 	enum obj_operation_type op_type = img_request->op_type;
464059e542c8SIlya Dryomov 	struct request *rq = blk_mq_rq_from_pdu(img_request);
4641bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4642bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
46434e752f0aSJosh Durgin 	u64 mapping_size;
4644bc1ecc65SIlya Dryomov 	int result;
4645bc1ecc65SIlya Dryomov 
4646bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4647bc1ecc65SIlya Dryomov 	if (!length) {
4648bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4649bc1ecc65SIlya Dryomov 		result = 0;
465059e542c8SIlya Dryomov 		goto err_img_request;
4651bc1ecc65SIlya Dryomov 	}
4652bc1ecc65SIlya Dryomov 
46537ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
46547ad18afaSChristoph Hellwig 
46554e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
46564e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
4657a52cc685SIlya Dryomov 	rbd_img_capture_header(img_request);
46584e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
46594e752f0aSJosh Durgin 
46604e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4661bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
46624e752f0aSJosh Durgin 			 length, mapping_size);
4663bc1ecc65SIlya Dryomov 		result = -EIO;
4664a52cc685SIlya Dryomov 		goto err_img_request;
4665bc1ecc65SIlya Dryomov 	}
4666bc1ecc65SIlya Dryomov 
466721ed05a8SIlya Dryomov 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
466821ed05a8SIlya Dryomov 	     img_request, obj_op_name(op_type), offset, length);
466921ed05a8SIlya Dryomov 
46706484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
46715a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
467290e98c52SGuangliang Zhao 	else
46735a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
467490e98c52SGuangliang Zhao 					       rq->bio);
46750192ce2eSIlya Dryomov 	if (result)
4676bc1ecc65SIlya Dryomov 		goto err_img_request;
4677bc1ecc65SIlya Dryomov 
4678e1fddc8fSIlya Dryomov 	rbd_img_handle_request(img_request, 0);
4679bc1ecc65SIlya Dryomov 	return;
4680bc1ecc65SIlya Dryomov 
4681bc1ecc65SIlya Dryomov err_img_request:
4682679a97d2SHannes Reinecke 	rbd_img_request_destroy(img_request);
4683bc1ecc65SIlya Dryomov 	if (result)
4684bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
46856d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
46862a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
4687bc1ecc65SIlya Dryomov }
4688bc1ecc65SIlya Dryomov 
4689fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
46907ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4691bc1ecc65SIlya Dryomov {
469259e542c8SIlya Dryomov 	struct rbd_device *rbd_dev = hctx->queue->queuedata;
469359e542c8SIlya Dryomov 	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
469459e542c8SIlya Dryomov 	enum obj_operation_type op_type;
4695bc1ecc65SIlya Dryomov 
469659e542c8SIlya Dryomov 	switch (req_op(bd->rq)) {
469759e542c8SIlya Dryomov 	case REQ_OP_DISCARD:
469859e542c8SIlya Dryomov 		op_type = OBJ_OP_DISCARD;
469959e542c8SIlya Dryomov 		break;
470059e542c8SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
470159e542c8SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
470259e542c8SIlya Dryomov 		break;
470359e542c8SIlya Dryomov 	case REQ_OP_WRITE:
470459e542c8SIlya Dryomov 		op_type = OBJ_OP_WRITE;
470559e542c8SIlya Dryomov 		break;
470659e542c8SIlya Dryomov 	case REQ_OP_READ:
470759e542c8SIlya Dryomov 		op_type = OBJ_OP_READ;
470859e542c8SIlya Dryomov 		break;
470959e542c8SIlya Dryomov 	default:
471059e542c8SIlya Dryomov 		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
471159e542c8SIlya Dryomov 		return BLK_STS_IOERR;
471259e542c8SIlya Dryomov 	}
471359e542c8SIlya Dryomov 
471459e542c8SIlya Dryomov 	rbd_img_request_init(img_req, rbd_dev, op_type);
471559e542c8SIlya Dryomov 
471659e542c8SIlya Dryomov 	if (rbd_img_is_write(img_req)) {
471759e542c8SIlya Dryomov 		if (rbd_is_ro(rbd_dev)) {
471859e542c8SIlya Dryomov 			rbd_warn(rbd_dev, "%s on read-only mapping",
471959e542c8SIlya Dryomov 				 obj_op_name(img_req->op_type));
472059e542c8SIlya Dryomov 			return BLK_STS_IOERR;
472159e542c8SIlya Dryomov 		}
472259e542c8SIlya Dryomov 		rbd_assert(!rbd_is_snap(rbd_dev));
472359e542c8SIlya Dryomov 	}
472459e542c8SIlya Dryomov 
472559e542c8SIlya Dryomov 	INIT_WORK(&img_req->work, rbd_queue_workfn);
472659e542c8SIlya Dryomov 	queue_work(rbd_wq, &img_req->work);
4727fc17b653SChristoph Hellwig 	return BLK_STS_OK;
4728bf0d5f50SAlex Elder }
4729bf0d5f50SAlex Elder 
4730602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4731602adf40SYehuda Sadeh {
4732195b1956SChristoph Hellwig 	blk_cleanup_disk(rbd_dev->disk);
47337ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
47345769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
4735602adf40SYehuda Sadeh }
4736602adf40SYehuda Sadeh 
4737788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4738fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4739fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4740fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4741788e2df3SAlex Elder 
4742788e2df3SAlex Elder {
4743fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4744fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4745fe5478e0SIlya Dryomov 	struct page **pages;
4746fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4747788e2df3SAlex Elder 	int ret;
4748788e2df3SAlex Elder 
4749fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4750fe5478e0SIlya Dryomov 	if (!req)
4751fe5478e0SIlya Dryomov 		return -ENOMEM;
4752788e2df3SAlex Elder 
4753fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4754fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4755fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4756788e2df3SAlex Elder 
4757fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4758fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4759fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4760fe5478e0SIlya Dryomov 		goto out_req;
4761fe5478e0SIlya Dryomov 	}
47621ceae7efSAlex Elder 
4763fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4764fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4765fe5478e0SIlya Dryomov 					 true);
4766788e2df3SAlex Elder 
476726f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
476826f887e0SIlya Dryomov 	if (ret)
476926f887e0SIlya Dryomov 		goto out_req;
477026f887e0SIlya Dryomov 
4771fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4772fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4773fe5478e0SIlya Dryomov 	if (ret >= 0)
4774fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4775fe5478e0SIlya Dryomov 
4776fe5478e0SIlya Dryomov out_req:
4777fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4778788e2df3SAlex Elder 	return ret;
4779788e2df3SAlex Elder }
4780788e2df3SAlex Elder 
4781602adf40SYehuda Sadeh /*
4782662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4783662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4784662518b1SAlex Elder  * information about the image.
47854156d998SAlex Elder  */
478699a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
47874156d998SAlex Elder {
47884156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
47894156d998SAlex Elder 	u32 snap_count = 0;
47904156d998SAlex Elder 	u64 names_size = 0;
47914156d998SAlex Elder 	u32 want_count;
47924156d998SAlex Elder 	int ret;
47934156d998SAlex Elder 
47944156d998SAlex Elder 	/*
47954156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
47964156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
47974156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
47984156d998SAlex Elder 	 * the number of snapshots could change by the time we read
47994156d998SAlex Elder 	 * it in, in which case we re-read it.
48004156d998SAlex Elder 	 */
48014156d998SAlex Elder 	do {
48024156d998SAlex Elder 		size_t size;
48034156d998SAlex Elder 
48044156d998SAlex Elder 		kfree(ondisk);
48054156d998SAlex Elder 
48064156d998SAlex Elder 		size = sizeof (*ondisk);
48074156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
48084156d998SAlex Elder 		size += names_size;
48094156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
48104156d998SAlex Elder 		if (!ondisk)
4811662518b1SAlex Elder 			return -ENOMEM;
48124156d998SAlex Elder 
4813fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4814fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
48154156d998SAlex Elder 		if (ret < 0)
4816662518b1SAlex Elder 			goto out;
4817c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
48184156d998SAlex Elder 			ret = -ENXIO;
481906ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
482006ecc6cbSAlex Elder 				size, ret);
4821662518b1SAlex Elder 			goto out;
48224156d998SAlex Elder 		}
48234156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
48244156d998SAlex Elder 			ret = -ENXIO;
482506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4826662518b1SAlex Elder 			goto out;
48274156d998SAlex Elder 		}
48284156d998SAlex Elder 
48294156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
48304156d998SAlex Elder 		want_count = snap_count;
48314156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
48324156d998SAlex Elder 	} while (snap_count != want_count);
48334156d998SAlex Elder 
4834662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4835662518b1SAlex Elder out:
48364156d998SAlex Elder 	kfree(ondisk);
48374156d998SAlex Elder 
4838dfc5606dSYehuda Sadeh 	return ret;
4839602adf40SYehuda Sadeh }
4840602adf40SYehuda Sadeh 
48419875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
48429875201eSJosh Durgin {
48439875201eSJosh Durgin 	sector_t size;
48449875201eSJosh Durgin 
48459875201eSJosh Durgin 	/*
4846811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4847811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4848811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
48499875201eSJosh Durgin 	 */
4850811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4851811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
48529875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
48539875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
4854e864e49aSChristoph Hellwig 		set_capacity_and_notify(rbd_dev->disk, size);
48559875201eSJosh Durgin 	}
48569875201eSJosh Durgin }
48579875201eSJosh Durgin 
4858cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
48591fe5e993SAlex Elder {
4860e627db08SAlex Elder 	u64 mapping_size;
48611fe5e993SAlex Elder 	int ret;
48621fe5e993SAlex Elder 
4863cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
48643b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4865a720ae09SIlya Dryomov 
4866a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
486752bb1f9bSIlya Dryomov 	if (ret)
486873e39e4dSIlya Dryomov 		goto out;
486915228edeSAlex Elder 
4870e8f59b59SIlya Dryomov 	/*
4871e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4872e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4873e8f59b59SIlya Dryomov 	 */
4874e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4875e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4876e8f59b59SIlya Dryomov 		if (ret)
487773e39e4dSIlya Dryomov 			goto out;
4878e8f59b59SIlya Dryomov 	}
4879e8f59b59SIlya Dryomov 
4880686238b7SIlya Dryomov 	rbd_assert(!rbd_is_snap(rbd_dev));
48815ff1108cSIlya Dryomov 	rbd_dev->mapping.size = rbd_dev->header.image_size;
48825ff1108cSIlya Dryomov 
488373e39e4dSIlya Dryomov out:
4884cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
488573e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
48869875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
48871fe5e993SAlex Elder 
488873e39e4dSIlya Dryomov 	return ret;
48891fe5e993SAlex Elder }
48901fe5e993SAlex Elder 
4891f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
48927ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
48937ad18afaSChristoph Hellwig };
48947ad18afaSChristoph Hellwig 
4895602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4896602adf40SYehuda Sadeh {
4897602adf40SYehuda Sadeh 	struct gendisk *disk;
4898602adf40SYehuda Sadeh 	struct request_queue *q;
4899420efbdfSIlya Dryomov 	unsigned int objset_bytes =
4900420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
49017ad18afaSChristoph Hellwig 	int err;
4902602adf40SYehuda Sadeh 
49037ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
49047ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4905b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
49067ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
490756d18f62SMing Lei 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
4908f9b6b98dSHannes Reinecke 	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
490959e542c8SIlya Dryomov 	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
49107ad18afaSChristoph Hellwig 
49117ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
49127ad18afaSChristoph Hellwig 	if (err)
4913195b1956SChristoph Hellwig 		return err;
4914029bcbd8SJosh Durgin 
4915195b1956SChristoph Hellwig 	disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
4916195b1956SChristoph Hellwig 	if (IS_ERR(disk)) {
4917195b1956SChristoph Hellwig 		err = PTR_ERR(disk);
49187ad18afaSChristoph Hellwig 		goto out_tag_set;
49197ad18afaSChristoph Hellwig 	}
4920195b1956SChristoph Hellwig 	q = disk->queue;
4921195b1956SChristoph Hellwig 
4922195b1956SChristoph Hellwig 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4923195b1956SChristoph Hellwig 		 rbd_dev->dev_id);
4924195b1956SChristoph Hellwig 	disk->major = rbd_dev->major;
4925195b1956SChristoph Hellwig 	disk->first_minor = rbd_dev->minor;
49261ebe2e5fSChristoph Hellwig 	if (single_major)
4927195b1956SChristoph Hellwig 		disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
49281ebe2e5fSChristoph Hellwig 	else
4929195b1956SChristoph Hellwig 		disk->minors = RBD_MINORS_PER_MAJOR;
4930195b1956SChristoph Hellwig 	disk->fops = &rbd_bd_ops;
49310077a500SIlya Dryomov 	disk->private_data = rbd_dev;
49327ad18afaSChristoph Hellwig 
49338b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4934d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4935593a9e7bSAlex Elder 
4936420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
49370d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
493821acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
493924f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
494016d80c54SIlya Dryomov 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
494116d80c54SIlya Dryomov 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
4942029bcbd8SJosh Durgin 
4943d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
49448b904b5bSBart Van Assche 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
494516d80c54SIlya Dryomov 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
4946420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4947420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4948d9360540SIlya Dryomov 	}
494990e98c52SGuangliang Zhao 
4950bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
49511cb039f3SChristoph Hellwig 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
4952bae818eeSRonny Hegewald 
4953602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4954602adf40SYehuda Sadeh 
4955602adf40SYehuda Sadeh 	return 0;
49567ad18afaSChristoph Hellwig out_tag_set:
49577ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
49587ad18afaSChristoph Hellwig 	return err;
4959602adf40SYehuda Sadeh }
4960602adf40SYehuda Sadeh 
4961dfc5606dSYehuda Sadeh /*
4962dfc5606dSYehuda Sadeh   sysfs
4963dfc5606dSYehuda Sadeh */
4964602adf40SYehuda Sadeh 
4965593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4966593a9e7bSAlex Elder {
4967593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4968593a9e7bSAlex Elder }
4969593a9e7bSAlex Elder 
4970dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4971dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4972602adf40SYehuda Sadeh {
4973593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4974dfc5606dSYehuda Sadeh 
4975fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4976fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4977602adf40SYehuda Sadeh }
4978602adf40SYehuda Sadeh 
497934b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
498034b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
498134b13184SAlex Elder {
498234b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
498334b13184SAlex Elder 
4984fa58bcadSIlya Dryomov 	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
498534b13184SAlex Elder }
498634b13184SAlex Elder 
4987dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4988dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4989602adf40SYehuda Sadeh {
4990593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4991dfc5606dSYehuda Sadeh 
4992fc71d833SAlex Elder 	if (rbd_dev->major)
4993dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4994fc71d833SAlex Elder 
4995fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4996dd82fff1SIlya Dryomov }
4997fc71d833SAlex Elder 
4998dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4999dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
5000dd82fff1SIlya Dryomov {
5001dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5002dd82fff1SIlya Dryomov 
5003dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
5004dfc5606dSYehuda Sadeh }
5005dfc5606dSYehuda Sadeh 
5006005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
5007005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
5008005a07bfSIlya Dryomov {
5009005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5010005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
5011005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
5012005a07bfSIlya Dryomov 
5013005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5014005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
5015005a07bfSIlya Dryomov }
5016005a07bfSIlya Dryomov 
5017dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
5018dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
5019dfc5606dSYehuda Sadeh {
5020593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5021dfc5606dSYehuda Sadeh 
50221dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
5023033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
5024dfc5606dSYehuda Sadeh }
5025dfc5606dSYehuda Sadeh 
5026267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
5027267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
5028267fb90bSMike Christie {
5029267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5030267fb90bSMike Christie 
5031267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5032267fb90bSMike Christie }
5033267fb90bSMike Christie 
50340d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
50350d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
50360d6d1e9cSMike Christie {
50370d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
50380d6d1e9cSMike Christie 
5039f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
5040f44d04e6SIlya Dryomov 		return -EPERM;
5041f44d04e6SIlya Dryomov 
50420d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5043dfc5606dSYehuda Sadeh }
5044dfc5606dSYehuda Sadeh 
5045dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
5046dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5047dfc5606dSYehuda Sadeh {
5048593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5049dfc5606dSYehuda Sadeh 
50500d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5051dfc5606dSYehuda Sadeh }
5052dfc5606dSYehuda Sadeh 
50539bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
50549bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
50559bb2f334SAlex Elder {
50569bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
50579bb2f334SAlex Elder 
50580d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
50590d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
50609bb2f334SAlex Elder }
50619bb2f334SAlex Elder 
5062b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
5063b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
5064b26c047bSIlya Dryomov {
5065b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5066b26c047bSIlya Dryomov 
5067b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5068b26c047bSIlya Dryomov }
5069b26c047bSIlya Dryomov 
5070dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
5071dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5072dfc5606dSYehuda Sadeh {
5073593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5074dfc5606dSYehuda Sadeh 
5075a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
50760d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5077a92ffdf8SAlex Elder 
5078a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
5079dfc5606dSYehuda Sadeh }
5080dfc5606dSYehuda Sadeh 
5081589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
5082589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
5083589d30e0SAlex Elder {
5084589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5085589d30e0SAlex Elder 
50860d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5087589d30e0SAlex Elder }
5088589d30e0SAlex Elder 
508934b13184SAlex Elder /*
509034b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
509134b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
509234b13184SAlex Elder  */
5093dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
5094dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
5095dfc5606dSYehuda Sadeh 			     char *buf)
5096dfc5606dSYehuda Sadeh {
5097593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5098dfc5606dSYehuda Sadeh 
50990d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5100dfc5606dSYehuda Sadeh }
5101dfc5606dSYehuda Sadeh 
510292a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
510392a58671SMike Christie 				struct device_attribute *attr, char *buf)
510492a58671SMike Christie {
510592a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
510692a58671SMike Christie 
510792a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
510892a58671SMike Christie }
510992a58671SMike Christie 
511086b00e0dSAlex Elder /*
5111ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
5112ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
5113ff96128fSIlya Dryomov  * image)".
511486b00e0dSAlex Elder  */
511586b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
511686b00e0dSAlex Elder 			       struct device_attribute *attr,
511786b00e0dSAlex Elder 			       char *buf)
511886b00e0dSAlex Elder {
511986b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5120ff96128fSIlya Dryomov 	ssize_t count = 0;
512186b00e0dSAlex Elder 
5122ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
512386b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
512486b00e0dSAlex Elder 
5125ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5126ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
512786b00e0dSAlex Elder 
5128ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
5129ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
5130e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
5131ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
5132ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
5133ff96128fSIlya Dryomov 			    "overlap %llu\n",
5134ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
5135ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
5136e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
5137ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
5138ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
5139ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
5140ff96128fSIlya Dryomov 	}
514186b00e0dSAlex Elder 
514286b00e0dSAlex Elder 	return count;
514386b00e0dSAlex Elder }
514486b00e0dSAlex Elder 
5145dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
5146dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
5147dfc5606dSYehuda Sadeh 				 const char *buf,
5148dfc5606dSYehuda Sadeh 				 size_t size)
5149dfc5606dSYehuda Sadeh {
5150593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5151b813623aSAlex Elder 	int ret;
5152602adf40SYehuda Sadeh 
5153f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
5154f44d04e6SIlya Dryomov 		return -EPERM;
5155f44d04e6SIlya Dryomov 
5156cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
5157e627db08SAlex Elder 	if (ret)
515852bb1f9bSIlya Dryomov 		return ret;
5159b813623aSAlex Elder 
516052bb1f9bSIlya Dryomov 	return size;
5161dfc5606dSYehuda Sadeh }
5162602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All are world-readable (0444) and have
 * no store handler, except:
 *   config_info - readable by the owner only (0400)
 *   refresh     - write-only (0200), no show handler
 */
static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5180dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the attributes exposed for each rbd device. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
5201dfc5606dSYehuda Sadeh 
/* Unnamed attribute group wrapping rbd_attrs. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};
5205dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
5210dfc5606dSYehuda Sadeh 
/* Defined below; declared here because rbd_device_type refers to it. */
static void rbd_dev_release(struct device *dev);

/*
 * Device type assigned to every rbd_device in __rbd_dev_create(): it
 * supplies the sysfs attribute groups and the release callback.
 */
static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};
5218dfc5606dSYehuda Sadeh 
52198b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
52208b8fb99cSAlex Elder {
52218b8fb99cSAlex Elder 	kref_get(&spec->kref);
52228b8fb99cSAlex Elder 
52238b8fb99cSAlex Elder 	return spec;
52248b8fb99cSAlex Elder }
52258b8fb99cSAlex Elder 
52268b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
52278b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
52288b8fb99cSAlex Elder {
52298b8fb99cSAlex Elder 	if (spec)
52308b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
52318b8fb99cSAlex Elder }
52328b8fb99cSAlex Elder 
52338b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
52348b8fb99cSAlex Elder {
52358b8fb99cSAlex Elder 	struct rbd_spec *spec;
52368b8fb99cSAlex Elder 
52378b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
52388b8fb99cSAlex Elder 	if (!spec)
52398b8fb99cSAlex Elder 		return NULL;
524004077599SIlya Dryomov 
524104077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
524204077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
52438b8fb99cSAlex Elder 	kref_init(&spec->kref);
52448b8fb99cSAlex Elder 
52458b8fb99cSAlex Elder 	return spec;
52468b8fb99cSAlex Elder }
52478b8fb99cSAlex Elder 
52488b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
52498b8fb99cSAlex Elder {
52508b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
52518b8fb99cSAlex Elder 
52528b8fb99cSAlex Elder 	kfree(spec->pool_name);
5253b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
52548b8fb99cSAlex Elder 	kfree(spec->image_id);
52558b8fb99cSAlex Elder 	kfree(spec->image_name);
52568b8fb99cSAlex Elder 	kfree(spec->snap_name);
52578b8fb99cSAlex Elder 	kfree(spec);
52588b8fb99cSAlex Elder }
52598b8fb99cSAlex Elder 
/*
 * Release everything an rbd_device holds and free the device itself.
 *
 * Warns if the watch is not in the UNREGISTERED state or the lock is not
 * in the UNLOCKED state.  Drops the references on the client and on the
 * spec, and frees config_info, opts and @rbd_dev.
 */
static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}
52741643dfa4SIlya Dryomov 
52751643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
52761643dfa4SIlya Dryomov {
52771643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
52781643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
52791643dfa4SIlya Dryomov 
52801643dfa4SIlya Dryomov 	if (need_put) {
52811643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
52821643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
52831643dfa4SIlya Dryomov 	}
52841643dfa4SIlya Dryomov 
52851643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
5286dd5ac32dSIlya Dryomov 
5287dd5ac32dSIlya Dryomov 	/*
5288dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
5289dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
5290dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
5291dd5ac32dSIlya Dryomov 	 */
5292dd5ac32dSIlya Dryomov 	if (need_put)
5293dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
5294dd5ac32dSIlya Dryomov }
5295dd5ac32dSIlya Dryomov 
/*
 * Allocate an rbd_device and initialize its locks, lists, work items and
 * embedded struct device.
 *
 * @rbdc and @spec are stored in the new device as-is; no extra reference
 * is taken here.  opts, dev_id and task_wq are not set up by this
 * function (see rbd_dev_create()).
 *
 * Returns the new device, or NULL if the allocation fails.
 */
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	/* header object location: pool (and optional namespace) from @spec */
	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		/* a namespace that is set is expected to be non-empty */
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	/* watch state */
	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	/* lock state */
	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	/* embedded device: bus, type and parent are set before initializing */
	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}
53471643dfa4SIlya Dryomov 
/*
 * Create a mapping rbd_dev.
 *
 * On top of __rbd_dev_create() this attaches @opts, reserves a device
 * id, fills in the device name, allocates the ordered task workqueue
 * and takes a module reference.
 *
 * Returns the new device, or NULL on failure.
 *
 * NOTE(review): the failure paths do not treat @rbdc, @spec and @opts
 * the same way.  If __rbd_dev_create() fails, none of them is touched.
 * If the id or workqueue allocation fails, rbd_dev_free() drops the
 * @rbdc and @spec references and frees @opts.  The caller sees NULL in
 * both cases -- confirm that the caller's error handling matches.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}
5388c53d5893SAlex Elder 
5389c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5390c53d5893SAlex Elder {
5391dd5ac32dSIlya Dryomov 	if (rbd_dev)
5392dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
5393c53d5893SAlex Elder }
5394c53d5893SAlex Elder 
5395dfc5606dSYehuda Sadeh /*
53969d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
53979d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
53989d475de5SAlex Elder  * image.
53999d475de5SAlex Elder  */
54009d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
54019d475de5SAlex Elder 				u8 *order, u64 *snap_size)
54029d475de5SAlex Elder {
54039d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
54049d475de5SAlex Elder 	int ret;
54059d475de5SAlex Elder 	struct {
54069d475de5SAlex Elder 		u8 order;
54079d475de5SAlex Elder 		__le64 size;
54089d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
54099d475de5SAlex Elder 
5410ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5411ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
54124157976bSAlex Elder 				  &snapid, sizeof(snapid),
5413e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
541436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
54159d475de5SAlex Elder 	if (ret < 0)
54169d475de5SAlex Elder 		return ret;
541757385b51SAlex Elder 	if (ret < sizeof (size_buf))
541857385b51SAlex Elder 		return -ERANGE;
54199d475de5SAlex Elder 
5420c3545579SJosh Durgin 	if (order) {
54219d475de5SAlex Elder 		*order = size_buf.order;
5422c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
5423c3545579SJosh Durgin 	}
54249d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
54259d475de5SAlex Elder 
5426c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5427c3545579SJosh Durgin 		(unsigned long long)snap_id,
54289d475de5SAlex Elder 		(unsigned long long)*snap_size);
54299d475de5SAlex Elder 
54309d475de5SAlex Elder 	return 0;
54319d475de5SAlex Elder }
54329d475de5SAlex Elder 
54339d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
54349d475de5SAlex Elder {
54359d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
54369d475de5SAlex Elder 					&rbd_dev->header.obj_order,
54379d475de5SAlex Elder 					&rbd_dev->header.image_size);
54389d475de5SAlex Elder }
54399d475de5SAlex Elder 
54401e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
54411e130199SAlex Elder {
54425435d206SDongsheng Yang 	size_t size;
54431e130199SAlex Elder 	void *reply_buf;
54441e130199SAlex Elder 	int ret;
54451e130199SAlex Elder 	void *p;
54461e130199SAlex Elder 
54475435d206SDongsheng Yang 	/* Response will be an encoded string, which includes a length */
54485435d206SDongsheng Yang 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
54495435d206SDongsheng Yang 	reply_buf = kzalloc(size, GFP_KERNEL);
54501e130199SAlex Elder 	if (!reply_buf)
54511e130199SAlex Elder 		return -ENOMEM;
54521e130199SAlex Elder 
5453ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5454ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
54555435d206SDongsheng Yang 				  NULL, 0, reply_buf, size);
545636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
54571e130199SAlex Elder 	if (ret < 0)
54581e130199SAlex Elder 		goto out;
54591e130199SAlex Elder 
54601e130199SAlex Elder 	p = reply_buf;
54611e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
546257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
546357385b51SAlex Elder 	ret = 0;
54641e130199SAlex Elder 
54651e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
54661e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
54671e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
54681e130199SAlex Elder 	} else {
54691e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
54701e130199SAlex Elder 	}
54711e130199SAlex Elder out:
54721e130199SAlex Elder 	kfree(reply_buf);
54731e130199SAlex Elder 
54741e130199SAlex Elder 	return ret;
54751e130199SAlex Elder }
54761e130199SAlex Elder 
5477b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5478196e2d6dSIlya Dryomov 				     bool read_only, u64 *snap_features)
5479b1b5402aSAlex Elder {
5480196e2d6dSIlya Dryomov 	struct {
5481196e2d6dSIlya Dryomov 		__le64 snap_id;
5482196e2d6dSIlya Dryomov 		u8 read_only;
5483196e2d6dSIlya Dryomov 	} features_in;
5484b1b5402aSAlex Elder 	struct {
5485b1b5402aSAlex Elder 		__le64 features;
5486b1b5402aSAlex Elder 		__le64 incompat;
54874157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
5488d3767f0fSIlya Dryomov 	u64 unsup;
5489b1b5402aSAlex Elder 	int ret;
5490b1b5402aSAlex Elder 
5491196e2d6dSIlya Dryomov 	features_in.snap_id = cpu_to_le64(snap_id);
5492196e2d6dSIlya Dryomov 	features_in.read_only = read_only;
5493196e2d6dSIlya Dryomov 
5494ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5495ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
5496196e2d6dSIlya Dryomov 				  &features_in, sizeof(features_in),
5497e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
549836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5499b1b5402aSAlex Elder 	if (ret < 0)
5500b1b5402aSAlex Elder 		return ret;
550157385b51SAlex Elder 	if (ret < sizeof (features_buf))
550257385b51SAlex Elder 		return -ERANGE;
5503d889140cSAlex Elder 
5504d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5505d3767f0fSIlya Dryomov 	if (unsup) {
5506d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5507d3767f0fSIlya Dryomov 			 unsup);
5508b8f5c6edSAlex Elder 		return -ENXIO;
5509d3767f0fSIlya Dryomov 	}
5510d889140cSAlex Elder 
5511b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
5512b1b5402aSAlex Elder 
5513b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5514b1b5402aSAlex Elder 		(unsigned long long)snap_id,
5515b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
5516b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5517b1b5402aSAlex Elder 
5518b1b5402aSAlex Elder 	return 0;
5519b1b5402aSAlex Elder }
5520b1b5402aSAlex Elder 
5521b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5522b1b5402aSAlex Elder {
5523b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5524196e2d6dSIlya Dryomov 					 rbd_is_ro(rbd_dev),
5525b1b5402aSAlex Elder 					 &rbd_dev->header.features);
5526b1b5402aSAlex Elder }
5527b1b5402aSAlex Elder 
552822e8bd51SIlya Dryomov /*
552922e8bd51SIlya Dryomov  * These are generic image flags, but since they are used only for
553022e8bd51SIlya Dryomov  * object map, store them in rbd_dev->object_map_flags.
553122e8bd51SIlya Dryomov  *
553222e8bd51SIlya Dryomov  * For the same reason, this function is called only on object map
553322e8bd51SIlya Dryomov  * (re)load and not on header refresh.
553422e8bd51SIlya Dryomov  */
553522e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
553622e8bd51SIlya Dryomov {
553722e8bd51SIlya Dryomov 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
553822e8bd51SIlya Dryomov 	__le64 flags;
553922e8bd51SIlya Dryomov 	int ret;
554022e8bd51SIlya Dryomov 
554122e8bd51SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
554222e8bd51SIlya Dryomov 				  &rbd_dev->header_oloc, "get_flags",
554322e8bd51SIlya Dryomov 				  &snapid, sizeof(snapid),
554422e8bd51SIlya Dryomov 				  &flags, sizeof(flags));
554522e8bd51SIlya Dryomov 	if (ret < 0)
554622e8bd51SIlya Dryomov 		return ret;
554722e8bd51SIlya Dryomov 	if (ret < sizeof(flags))
554822e8bd51SIlya Dryomov 		return -EBADMSG;
554922e8bd51SIlya Dryomov 
555022e8bd51SIlya Dryomov 	rbd_dev->object_map_flags = le64_to_cpu(flags);
555122e8bd51SIlya Dryomov 	return 0;
555222e8bd51SIlya Dryomov }
555322e8bd51SIlya Dryomov 
/*
 * Parent image description filled in by the __get_parent_info*()
 * helpers.
 */
struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;	/* from ceph_extract_encoded_string() */
	const char	*image_id;	/* from ceph_extract_encoded_string() */
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;	/* decoded only when has_overlap is set */
};
5563eb3b2d6bSIlya Dryomov 
5564eb3b2d6bSIlya Dryomov /*
5565eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
5566eb3b2d6bSIlya Dryomov  */
5567e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
5568e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
5569e92c0eafSIlya Dryomov {
5570e92c0eafSIlya Dryomov 	u8 struct_v;
5571e92c0eafSIlya Dryomov 	u32 struct_len;
5572e92c0eafSIlya Dryomov 	int ret;
5573e92c0eafSIlya Dryomov 
5574e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5575e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
5576e92c0eafSIlya Dryomov 	if (ret)
5577e92c0eafSIlya Dryomov 		return ret;
5578e92c0eafSIlya Dryomov 
5579e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5580e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5581e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
5582e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
5583e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
5584e92c0eafSIlya Dryomov 		return ret;
5585e92c0eafSIlya Dryomov 	}
5586e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5587e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5588e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5589e92c0eafSIlya Dryomov 		pii->image_id = NULL;
5590e92c0eafSIlya Dryomov 		return ret;
5591e92c0eafSIlya Dryomov 	}
5592e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5593e92c0eafSIlya Dryomov 	return 0;
5594e92c0eafSIlya Dryomov 
5595e92c0eafSIlya Dryomov e_inval:
5596e92c0eafSIlya Dryomov 	return -EINVAL;
5597e92c0eafSIlya Dryomov }
5598e92c0eafSIlya Dryomov 
5599e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
5600e92c0eafSIlya Dryomov 			     struct page *req_page,
5601e92c0eafSIlya Dryomov 			     struct page *reply_page,
5602e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
5603e92c0eafSIlya Dryomov {
5604e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5605e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5606e92c0eafSIlya Dryomov 	void *p, *end;
5607e92c0eafSIlya Dryomov 	int ret;
5608e92c0eafSIlya Dryomov 
5609e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5610e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
561168ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5612e92c0eafSIlya Dryomov 	if (ret)
5613e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
5614e92c0eafSIlya Dryomov 
5615e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5616e92c0eafSIlya Dryomov 	end = p + reply_len;
5617e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
5618e92c0eafSIlya Dryomov 	if (ret)
5619e92c0eafSIlya Dryomov 		return ret;
5620e92c0eafSIlya Dryomov 
5621e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5622e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
562368ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5624e92c0eafSIlya Dryomov 	if (ret)
5625e92c0eafSIlya Dryomov 		return ret;
5626e92c0eafSIlya Dryomov 
5627e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5628e92c0eafSIlya Dryomov 	end = p + reply_len;
5629e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5630e92c0eafSIlya Dryomov 	if (pii->has_overlap)
5631e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5632e92c0eafSIlya Dryomov 
5633e92c0eafSIlya Dryomov 	return 0;
5634e92c0eafSIlya Dryomov 
5635e92c0eafSIlya Dryomov e_inval:
5636e92c0eafSIlya Dryomov 	return -EINVAL;
5637e92c0eafSIlya Dryomov }
5638e92c0eafSIlya Dryomov 
5639e92c0eafSIlya Dryomov /*
5640e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
5641e92c0eafSIlya Dryomov  */
5642eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5643eb3b2d6bSIlya Dryomov 				    struct page *req_page,
5644eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
5645eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
5646eb3b2d6bSIlya Dryomov {
5647eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5648eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5649eb3b2d6bSIlya Dryomov 	void *p, *end;
5650eb3b2d6bSIlya Dryomov 	int ret;
5651eb3b2d6bSIlya Dryomov 
5652eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5653eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
565468ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5655eb3b2d6bSIlya Dryomov 	if (ret)
5656eb3b2d6bSIlya Dryomov 		return ret;
5657eb3b2d6bSIlya Dryomov 
5658eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
5659eb3b2d6bSIlya Dryomov 	end = p + reply_len;
5660eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5661eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5662eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5663eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5664eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
5665eb3b2d6bSIlya Dryomov 		return ret;
5666eb3b2d6bSIlya Dryomov 	}
5667eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5668e92c0eafSIlya Dryomov 	pii->has_overlap = true;
5669eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5670eb3b2d6bSIlya Dryomov 
5671eb3b2d6bSIlya Dryomov 	return 0;
5672eb3b2d6bSIlya Dryomov 
5673eb3b2d6bSIlya Dryomov e_inval:
5674eb3b2d6bSIlya Dryomov 	return -EINVAL;
5675eb3b2d6bSIlya Dryomov }
5676eb3b2d6bSIlya Dryomov 
5677eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev,
5678eb3b2d6bSIlya Dryomov 			   struct parent_image_info *pii)
5679eb3b2d6bSIlya Dryomov {
5680eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
5681eb3b2d6bSIlya Dryomov 	void *p;
5682eb3b2d6bSIlya Dryomov 	int ret;
5683eb3b2d6bSIlya Dryomov 
5684eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
5685eb3b2d6bSIlya Dryomov 	if (!req_page)
5686eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5687eb3b2d6bSIlya Dryomov 
5688eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
5689eb3b2d6bSIlya Dryomov 	if (!reply_page) {
5690eb3b2d6bSIlya Dryomov 		__free_page(req_page);
5691eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5692eb3b2d6bSIlya Dryomov 	}
5693eb3b2d6bSIlya Dryomov 
5694eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
5695eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5696e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5697e92c0eafSIlya Dryomov 	if (ret > 0)
5698e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5699e92c0eafSIlya Dryomov 					       pii);
5700eb3b2d6bSIlya Dryomov 
5701eb3b2d6bSIlya Dryomov 	__free_page(req_page);
5702eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
5703eb3b2d6bSIlya Dryomov 	return ret;
5704eb3b2d6bSIlya Dryomov }
5705eb3b2d6bSIlya Dryomov 
570686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
570786b00e0dSAlex Elder {
570886b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
5709eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
571086b00e0dSAlex Elder 	int ret;
571186b00e0dSAlex Elder 
571286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
571386b00e0dSAlex Elder 	if (!parent_spec)
571486b00e0dSAlex Elder 		return -ENOMEM;
571586b00e0dSAlex Elder 
5716eb3b2d6bSIlya Dryomov 	ret = get_parent_info(rbd_dev, &pii);
5717eb3b2d6bSIlya Dryomov 	if (ret)
571886b00e0dSAlex Elder 		goto out_err;
571986b00e0dSAlex Elder 
5720e92c0eafSIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5721e92c0eafSIlya Dryomov 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5722e92c0eafSIlya Dryomov 	     pii.has_overlap, pii.overlap);
5723eb3b2d6bSIlya Dryomov 
5724e92c0eafSIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5725392a9dadSAlex Elder 		/*
5726392a9dadSAlex Elder 		 * Either the parent never existed, or we have
5727392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
5728392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
5729392a9dadSAlex Elder 		 * layered image disappears we immediately set the
5730392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
5731392a9dadSAlex Elder 		 * requests will be treated as if the image had no
5732392a9dadSAlex Elder 		 * parent.
5733e92c0eafSIlya Dryomov 		 *
5734e92c0eafSIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
5735e92c0eafSIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
5736e92c0eafSIlya Dryomov 		 * snapshot record.
5737392a9dadSAlex Elder 		 */
5738392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
5739392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
5740392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
5741392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
5742392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
5743392a9dadSAlex Elder 		}
5744392a9dadSAlex Elder 
574586b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
5746392a9dadSAlex Elder 	}
574786b00e0dSAlex Elder 
57480903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
57490903e875SAlex Elder 
57500903e875SAlex Elder 	ret = -EIO;
5751eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
57529584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5753eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
575457385b51SAlex Elder 		goto out_err;
5755c0cd10dbSAlex Elder 	}
57560903e875SAlex Elder 
57573b5cf2a2SAlex Elder 	/*
57583b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
57593b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
57603b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
57613b5cf2a2SAlex Elder 	 */
57623b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
5763eb3b2d6bSIlya Dryomov 		parent_spec->pool_id = pii.pool_id;
5764e92c0eafSIlya Dryomov 		if (pii.pool_ns && *pii.pool_ns) {
5765e92c0eafSIlya Dryomov 			parent_spec->pool_ns = pii.pool_ns;
5766e92c0eafSIlya Dryomov 			pii.pool_ns = NULL;
5767e92c0eafSIlya Dryomov 		}
5768eb3b2d6bSIlya Dryomov 		parent_spec->image_id = pii.image_id;
5769eb3b2d6bSIlya Dryomov 		pii.image_id = NULL;
5770eb3b2d6bSIlya Dryomov 		parent_spec->snap_id = pii.snap_id;
5771b26c047bSIlya Dryomov 
577286b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
577386b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
57743b5cf2a2SAlex Elder 	}
57753b5cf2a2SAlex Elder 
57763b5cf2a2SAlex Elder 	/*
5777cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5778cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
57793b5cf2a2SAlex Elder 	 */
5780eb3b2d6bSIlya Dryomov 	if (!pii.overlap) {
57813b5cf2a2SAlex Elder 		if (parent_spec) {
5782cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5783cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5784cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5785cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
578670cf49cfSAlex Elder 		} else {
5787cf32bd9cSIlya Dryomov 			/* initial probe */
5788cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
57893b5cf2a2SAlex Elder 		}
579070cf49cfSAlex Elder 	}
5791eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
5792cf32bd9cSIlya Dryomov 
579386b00e0dSAlex Elder out:
579486b00e0dSAlex Elder 	ret = 0;
579586b00e0dSAlex Elder out_err:
5796e92c0eafSIlya Dryomov 	kfree(pii.pool_ns);
5797eb3b2d6bSIlya Dryomov 	kfree(pii.image_id);
579886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
579986b00e0dSAlex Elder 	return ret;
580086b00e0dSAlex Elder }
580186b00e0dSAlex Elder 
5802cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5803cc070d59SAlex Elder {
5804cc070d59SAlex Elder 	struct {
5805cc070d59SAlex Elder 		__le64 stripe_unit;
5806cc070d59SAlex Elder 		__le64 stripe_count;
5807cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5808cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5809cc070d59SAlex Elder 	void *p;
5810cc070d59SAlex Elder 	int ret;
5811cc070d59SAlex Elder 
5812ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5813ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5814ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5815cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5816cc070d59SAlex Elder 	if (ret < 0)
5817cc070d59SAlex Elder 		return ret;
5818cc070d59SAlex Elder 	if (ret < size)
5819cc070d59SAlex Elder 		return -ERANGE;
5820cc070d59SAlex Elder 
5821cc070d59SAlex Elder 	p = &striping_info_buf;
5822b1331852SIlya Dryomov 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5823b1331852SIlya Dryomov 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
5824cc070d59SAlex Elder 	return 0;
5825cc070d59SAlex Elder }
5826cc070d59SAlex Elder 
58277e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
58287e97332eSIlya Dryomov {
58297e97332eSIlya Dryomov 	__le64 data_pool_id;
58307e97332eSIlya Dryomov 	int ret;
58317e97332eSIlya Dryomov 
58327e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
58337e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
58347e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
58357e97332eSIlya Dryomov 	if (ret < 0)
58367e97332eSIlya Dryomov 		return ret;
58377e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
58387e97332eSIlya Dryomov 		return -EBADMSG;
58397e97332eSIlya Dryomov 
58407e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
58417e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
58427e97332eSIlya Dryomov 	return 0;
58437e97332eSIlya Dryomov }
58447e97332eSIlya Dryomov 
58459e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
58469e15b77dSAlex Elder {
5847ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
58489e15b77dSAlex Elder 	size_t image_id_size;
58499e15b77dSAlex Elder 	char *image_id;
58509e15b77dSAlex Elder 	void *p;
58519e15b77dSAlex Elder 	void *end;
58529e15b77dSAlex Elder 	size_t size;
58539e15b77dSAlex Elder 	void *reply_buf = NULL;
58549e15b77dSAlex Elder 	size_t len = 0;
58559e15b77dSAlex Elder 	char *image_name = NULL;
58569e15b77dSAlex Elder 	int ret;
58579e15b77dSAlex Elder 
58589e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
58599e15b77dSAlex Elder 
586069e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
586169e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
58629e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
58639e15b77dSAlex Elder 	if (!image_id)
58649e15b77dSAlex Elder 		return NULL;
58659e15b77dSAlex Elder 
58669e15b77dSAlex Elder 	p = image_id;
58674157976bSAlex Elder 	end = image_id + image_id_size;
586869e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
58699e15b77dSAlex Elder 
58709e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
58719e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
58729e15b77dSAlex Elder 	if (!reply_buf)
58739e15b77dSAlex Elder 		goto out;
58749e15b77dSAlex Elder 
5875ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5876ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5877ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5878e2a58ee5SAlex Elder 				  reply_buf, size);
58799e15b77dSAlex Elder 	if (ret < 0)
58809e15b77dSAlex Elder 		goto out;
58819e15b77dSAlex Elder 	p = reply_buf;
5882f40eb349SAlex Elder 	end = reply_buf + ret;
5883f40eb349SAlex Elder 
58849e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
58859e15b77dSAlex Elder 	if (IS_ERR(image_name))
58869e15b77dSAlex Elder 		image_name = NULL;
58879e15b77dSAlex Elder 	else
58889e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
58899e15b77dSAlex Elder out:
58909e15b77dSAlex Elder 	kfree(reply_buf);
58919e15b77dSAlex Elder 	kfree(image_id);
58929e15b77dSAlex Elder 
58939e15b77dSAlex Elder 	return image_name;
58949e15b77dSAlex Elder }
58959e15b77dSAlex Elder 
58962ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
58972ad3d716SAlex Elder {
58982ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
58992ad3d716SAlex Elder 	const char *snap_name;
59002ad3d716SAlex Elder 	u32 which = 0;
59012ad3d716SAlex Elder 
59022ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
59032ad3d716SAlex Elder 
59042ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
59052ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
59062ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
59072ad3d716SAlex Elder 			return snapc->snaps[which];
59082ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
59092ad3d716SAlex Elder 		which++;
59102ad3d716SAlex Elder 	}
59112ad3d716SAlex Elder 	return CEPH_NOSNAP;
59122ad3d716SAlex Elder }
59132ad3d716SAlex Elder 
59142ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59152ad3d716SAlex Elder {
59162ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
59172ad3d716SAlex Elder 	u32 which;
59182ad3d716SAlex Elder 	bool found = false;
59192ad3d716SAlex Elder 	u64 snap_id;
59202ad3d716SAlex Elder 
59212ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
59222ad3d716SAlex Elder 		const char *snap_name;
59232ad3d716SAlex Elder 
59242ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
59252ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5926efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5927efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5928efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5929efadc98aSJosh Durgin 				continue;
5930efadc98aSJosh Durgin 			else
59312ad3d716SAlex Elder 				break;
5932efadc98aSJosh Durgin 		}
59332ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
59342ad3d716SAlex Elder 		kfree(snap_name);
59352ad3d716SAlex Elder 	}
59362ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
59372ad3d716SAlex Elder }
59382ad3d716SAlex Elder 
59392ad3d716SAlex Elder /*
59402ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
59412ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
59422ad3d716SAlex Elder  */
59432ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59442ad3d716SAlex Elder {
59452ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
59462ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
59472ad3d716SAlex Elder 
59482ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
59492ad3d716SAlex Elder }
59502ad3d716SAlex Elder 
59519e15b77dSAlex Elder /*
595204077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
59539e15b77dSAlex Elder  */
595404077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
595504077599SIlya Dryomov {
595604077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
595704077599SIlya Dryomov 
595804077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
595904077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
596004077599SIlya Dryomov 	rbd_assert(spec->snap_name);
596104077599SIlya Dryomov 
596204077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
596304077599SIlya Dryomov 		u64 snap_id;
596404077599SIlya Dryomov 
596504077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
596604077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
596704077599SIlya Dryomov 			return -ENOENT;
596804077599SIlya Dryomov 
596904077599SIlya Dryomov 		spec->snap_id = snap_id;
597004077599SIlya Dryomov 	} else {
597104077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
597204077599SIlya Dryomov 	}
597304077599SIlya Dryomov 
597404077599SIlya Dryomov 	return 0;
597504077599SIlya Dryomov }
597604077599SIlya Dryomov 
597704077599SIlya Dryomov /*
597804077599SIlya Dryomov  * A parent image will have all ids but none of the names.
597904077599SIlya Dryomov  *
598004077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
598104077599SIlya Dryomov  * can't figure out the name for an image id.
598204077599SIlya Dryomov  */
598304077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
59849e15b77dSAlex Elder {
59852e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
59862e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
59872e9f7f1cSAlex Elder 	const char *pool_name;
59882e9f7f1cSAlex Elder 	const char *image_name;
59892e9f7f1cSAlex Elder 	const char *snap_name;
59909e15b77dSAlex Elder 	int ret;
59919e15b77dSAlex Elder 
599204077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
599304077599SIlya Dryomov 	rbd_assert(spec->image_id);
599404077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
59959e15b77dSAlex Elder 
59962e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
59979e15b77dSAlex Elder 
59982e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
59992e9f7f1cSAlex Elder 	if (!pool_name) {
60002e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6001935dc89fSAlex Elder 		return -EIO;
6002935dc89fSAlex Elder 	}
60032e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
60042e9f7f1cSAlex Elder 	if (!pool_name)
60059e15b77dSAlex Elder 		return -ENOMEM;
60069e15b77dSAlex Elder 
60079e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
60089e15b77dSAlex Elder 
60092e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
60102e9f7f1cSAlex Elder 	if (!image_name)
601106ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
60129e15b77dSAlex Elder 
601304077599SIlya Dryomov 	/* Fetch the snapshot name */
60149e15b77dSAlex Elder 
60152e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6016da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
6017da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
60189e15b77dSAlex Elder 		goto out_err;
60192e9f7f1cSAlex Elder 	}
60202e9f7f1cSAlex Elder 
60212e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
60222e9f7f1cSAlex Elder 	spec->image_name = image_name;
60232e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
60249e15b77dSAlex Elder 
60259e15b77dSAlex Elder 	return 0;
602604077599SIlya Dryomov 
60279e15b77dSAlex Elder out_err:
60282e9f7f1cSAlex Elder 	kfree(image_name);
60292e9f7f1cSAlex Elder 	kfree(pool_name);
60309e15b77dSAlex Elder 	return ret;
60319e15b77dSAlex Elder }
60329e15b77dSAlex Elder 
6033cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
603435d489f9SAlex Elder {
603535d489f9SAlex Elder 	size_t size;
603635d489f9SAlex Elder 	int ret;
603735d489f9SAlex Elder 	void *reply_buf;
603835d489f9SAlex Elder 	void *p;
603935d489f9SAlex Elder 	void *end;
604035d489f9SAlex Elder 	u64 seq;
604135d489f9SAlex Elder 	u32 snap_count;
604235d489f9SAlex Elder 	struct ceph_snap_context *snapc;
604335d489f9SAlex Elder 	u32 i;
604435d489f9SAlex Elder 
604535d489f9SAlex Elder 	/*
604635d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
604735d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
604835d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
604935d489f9SAlex Elder 	 * prepared to receive.
605035d489f9SAlex Elder 	 */
605135d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
605235d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
605335d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
605435d489f9SAlex Elder 	if (!reply_buf)
605535d489f9SAlex Elder 		return -ENOMEM;
605635d489f9SAlex Elder 
6057ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6058ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
6059ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
606036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
606135d489f9SAlex Elder 	if (ret < 0)
606235d489f9SAlex Elder 		goto out;
606335d489f9SAlex Elder 
606435d489f9SAlex Elder 	p = reply_buf;
606557385b51SAlex Elder 	end = reply_buf + ret;
606657385b51SAlex Elder 	ret = -ERANGE;
606735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
606835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
606935d489f9SAlex Elder 
607035d489f9SAlex Elder 	/*
607135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
607235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
607335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
607435d489f9SAlex Elder 	 * allocate is representable in a size_t.
607535d489f9SAlex Elder 	 */
607635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
607735d489f9SAlex Elder 				 / sizeof (u64)) {
607835d489f9SAlex Elder 		ret = -EINVAL;
607935d489f9SAlex Elder 		goto out;
608035d489f9SAlex Elder 	}
608135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
608235d489f9SAlex Elder 		goto out;
6083468521c1SAlex Elder 	ret = 0;
608435d489f9SAlex Elder 
6085812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
608635d489f9SAlex Elder 	if (!snapc) {
608735d489f9SAlex Elder 		ret = -ENOMEM;
608835d489f9SAlex Elder 		goto out;
608935d489f9SAlex Elder 	}
609035d489f9SAlex Elder 	snapc->seq = seq;
609135d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
609235d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
609335d489f9SAlex Elder 
609449ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
609535d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
609635d489f9SAlex Elder 
609735d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
609835d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
609935d489f9SAlex Elder out:
610035d489f9SAlex Elder 	kfree(reply_buf);
610135d489f9SAlex Elder 
610257385b51SAlex Elder 	return ret;
610335d489f9SAlex Elder }
610435d489f9SAlex Elder 
610554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
610654cac61fSAlex Elder 					u64 snap_id)
6107b8b1e2dbSAlex Elder {
6108b8b1e2dbSAlex Elder 	size_t size;
6109b8b1e2dbSAlex Elder 	void *reply_buf;
611054cac61fSAlex Elder 	__le64 snapid;
6111b8b1e2dbSAlex Elder 	int ret;
6112b8b1e2dbSAlex Elder 	void *p;
6113b8b1e2dbSAlex Elder 	void *end;
6114b8b1e2dbSAlex Elder 	char *snap_name;
6115b8b1e2dbSAlex Elder 
6116b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6117b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
6118b8b1e2dbSAlex Elder 	if (!reply_buf)
6119b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
6120b8b1e2dbSAlex Elder 
612154cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
6122ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6123ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
6124ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
612536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6126f40eb349SAlex Elder 	if (ret < 0) {
6127f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
6128b8b1e2dbSAlex Elder 		goto out;
6129f40eb349SAlex Elder 	}
6130b8b1e2dbSAlex Elder 
6131b8b1e2dbSAlex Elder 	p = reply_buf;
6132f40eb349SAlex Elder 	end = reply_buf + ret;
6133e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6134f40eb349SAlex Elder 	if (IS_ERR(snap_name))
6135b8b1e2dbSAlex Elder 		goto out;
6136f40eb349SAlex Elder 
6137b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
613854cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
6139b8b1e2dbSAlex Elder out:
6140b8b1e2dbSAlex Elder 	kfree(reply_buf);
6141b8b1e2dbSAlex Elder 
6142f40eb349SAlex Elder 	return snap_name;
6143b8b1e2dbSAlex Elder }
6144b8b1e2dbSAlex Elder 
61452df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6146117973fbSAlex Elder {
61472df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
6148117973fbSAlex Elder 	int ret;
6149117973fbSAlex Elder 
61501617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
61511617e40cSJosh Durgin 	if (ret)
6152cfbf6377SAlex Elder 		return ret;
61531617e40cSJosh Durgin 
61542df3fac7SAlex Elder 	if (first_time) {
61552df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
61562df3fac7SAlex Elder 		if (ret)
6157cfbf6377SAlex Elder 			return ret;
61582df3fac7SAlex Elder 	}
61592df3fac7SAlex Elder 
6160cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
6161d194cd1dSIlya Dryomov 	if (ret && first_time) {
6162d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
6163d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
6164d194cd1dSIlya Dryomov 	}
6165117973fbSAlex Elder 
6166117973fbSAlex Elder 	return ret;
6167117973fbSAlex Elder }
6168117973fbSAlex Elder 
6169a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6170a720ae09SIlya Dryomov {
6171a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6172a720ae09SIlya Dryomov 
6173a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
6174a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
6175a720ae09SIlya Dryomov 
6176a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
6177a720ae09SIlya Dryomov }
6178a720ae09SIlya Dryomov 
/*
 * Advances *buf past any leading white space so that it points to the
 * first non-space character (if any), and returns the length of the
 * token (run of non-white space characters) that starts there.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, spaces);		/* Find start of token */
	*buf = start;

	return strcspn(start, spaces);		/* Return token length */
}
6197e28fff26SAlex Elder 
6198e28fff26SAlex Elder /*
6199ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
6200ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
6201ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6202ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
6203ea3352f4SAlex Elder  *
6204ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
6205ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
6206ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
6207ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
6208ea3352f4SAlex Elder  *
6209ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
6210ea3352f4SAlex Elder  * the end of the found token.
6211ea3352f4SAlex Elder  *
6212ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
6213ea3352f4SAlex Elder  */
6214ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
6215ea3352f4SAlex Elder {
6216ea3352f4SAlex Elder 	char *dup;
6217ea3352f4SAlex Elder 	size_t len;
6218ea3352f4SAlex Elder 
6219ea3352f4SAlex Elder 	len = next_token(buf);
62204caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6221ea3352f4SAlex Elder 	if (!dup)
6222ea3352f4SAlex Elder 		return NULL;
6223ea3352f4SAlex Elder 	*(dup + len) = '\0';
6224ea3352f4SAlex Elder 	*buf += len;
6225ea3352f4SAlex Elder 
6226ea3352f4SAlex Elder 	if (lenp)
6227ea3352f4SAlex Elder 		*lenp = len;
6228ea3352f4SAlex Elder 
6229ea3352f4SAlex Elder 	return dup;
6230ea3352f4SAlex Elder }
6231ea3352f4SAlex Elder 
623282995cc6SDavid Howells static int rbd_parse_param(struct fs_parameter *param,
623382995cc6SDavid Howells 			    struct rbd_parse_opts_ctx *pctx)
623482995cc6SDavid Howells {
623582995cc6SDavid Howells 	struct rbd_options *opt = pctx->opts;
623682995cc6SDavid Howells 	struct fs_parse_result result;
62373fbb8d55SAl Viro 	struct p_log log = {.prefix = "rbd"};
623882995cc6SDavid Howells 	int token, ret;
623982995cc6SDavid Howells 
624082995cc6SDavid Howells 	ret = ceph_parse_param(param, pctx->copts, NULL);
624182995cc6SDavid Howells 	if (ret != -ENOPARAM)
624282995cc6SDavid Howells 		return ret;
624382995cc6SDavid Howells 
6244d7167b14SAl Viro 	token = __fs_parse(&log, rbd_parameters, param, &result);
624582995cc6SDavid Howells 	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
624682995cc6SDavid Howells 	if (token < 0) {
62472c3f3dc3SAl Viro 		if (token == -ENOPARAM)
62482c3f3dc3SAl Viro 			return inval_plog(&log, "Unknown parameter '%s'",
624982995cc6SDavid Howells 					  param->key);
625082995cc6SDavid Howells 		return token;
625182995cc6SDavid Howells 	}
625282995cc6SDavid Howells 
625382995cc6SDavid Howells 	switch (token) {
625482995cc6SDavid Howells 	case Opt_queue_depth:
625582995cc6SDavid Howells 		if (result.uint_32 < 1)
625682995cc6SDavid Howells 			goto out_of_range;
625782995cc6SDavid Howells 		opt->queue_depth = result.uint_32;
625882995cc6SDavid Howells 		break;
625982995cc6SDavid Howells 	case Opt_alloc_size:
626082995cc6SDavid Howells 		if (result.uint_32 < SECTOR_SIZE)
626182995cc6SDavid Howells 			goto out_of_range;
62622c3f3dc3SAl Viro 		if (!is_power_of_2(result.uint_32))
62632c3f3dc3SAl Viro 			return inval_plog(&log, "alloc_size must be a power of 2");
626482995cc6SDavid Howells 		opt->alloc_size = result.uint_32;
626582995cc6SDavid Howells 		break;
626682995cc6SDavid Howells 	case Opt_lock_timeout:
626782995cc6SDavid Howells 		/* 0 is "wait forever" (i.e. infinite timeout) */
626882995cc6SDavid Howells 		if (result.uint_32 > INT_MAX / 1000)
626982995cc6SDavid Howells 			goto out_of_range;
627082995cc6SDavid Howells 		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
627182995cc6SDavid Howells 		break;
627282995cc6SDavid Howells 	case Opt_pool_ns:
627382995cc6SDavid Howells 		kfree(pctx->spec->pool_ns);
627482995cc6SDavid Howells 		pctx->spec->pool_ns = param->string;
627582995cc6SDavid Howells 		param->string = NULL;
627682995cc6SDavid Howells 		break;
6277dc1dad8eSIlya Dryomov 	case Opt_compression_hint:
6278dc1dad8eSIlya Dryomov 		switch (result.uint_32) {
6279dc1dad8eSIlya Dryomov 		case Opt_compression_hint_none:
6280dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6281dc1dad8eSIlya Dryomov 			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6282dc1dad8eSIlya Dryomov 			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6283dc1dad8eSIlya Dryomov 			break;
6284dc1dad8eSIlya Dryomov 		case Opt_compression_hint_compressible:
6285dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags |=
6286dc1dad8eSIlya Dryomov 			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6287dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6288dc1dad8eSIlya Dryomov 			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6289dc1dad8eSIlya Dryomov 			break;
6290dc1dad8eSIlya Dryomov 		case Opt_compression_hint_incompressible:
6291dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags |=
6292dc1dad8eSIlya Dryomov 			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6293dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6294dc1dad8eSIlya Dryomov 			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6295dc1dad8eSIlya Dryomov 			break;
6296dc1dad8eSIlya Dryomov 		default:
6297dc1dad8eSIlya Dryomov 			BUG();
6298dc1dad8eSIlya Dryomov 		}
6299dc1dad8eSIlya Dryomov 		break;
630082995cc6SDavid Howells 	case Opt_read_only:
630182995cc6SDavid Howells 		opt->read_only = true;
630282995cc6SDavid Howells 		break;
630382995cc6SDavid Howells 	case Opt_read_write:
630482995cc6SDavid Howells 		opt->read_only = false;
630582995cc6SDavid Howells 		break;
630682995cc6SDavid Howells 	case Opt_lock_on_read:
630782995cc6SDavid Howells 		opt->lock_on_read = true;
630882995cc6SDavid Howells 		break;
630982995cc6SDavid Howells 	case Opt_exclusive:
631082995cc6SDavid Howells 		opt->exclusive = true;
631182995cc6SDavid Howells 		break;
631282995cc6SDavid Howells 	case Opt_notrim:
631382995cc6SDavid Howells 		opt->trim = false;
631482995cc6SDavid Howells 		break;
631582995cc6SDavid Howells 	default:
631682995cc6SDavid Howells 		BUG();
631782995cc6SDavid Howells 	}
631882995cc6SDavid Howells 
631982995cc6SDavid Howells 	return 0;
632082995cc6SDavid Howells 
632182995cc6SDavid Howells out_of_range:
63222c3f3dc3SAl Viro 	return inval_plog(&log, "%s out of range", param->key);
632382995cc6SDavid Howells }
632482995cc6SDavid Howells 
632582995cc6SDavid Howells /*
632682995cc6SDavid Howells  * This duplicates most of generic_parse_monolithic(), untying it from
632782995cc6SDavid Howells  * fs_context and skipping standard superblock and security options.
632882995cc6SDavid Howells  */
632982995cc6SDavid Howells static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
633082995cc6SDavid Howells {
633182995cc6SDavid Howells 	char *key;
633282995cc6SDavid Howells 	int ret = 0;
633382995cc6SDavid Howells 
633482995cc6SDavid Howells 	dout("%s '%s'\n", __func__, options);
633582995cc6SDavid Howells 	while ((key = strsep(&options, ",")) != NULL) {
633682995cc6SDavid Howells 		if (*key) {
633782995cc6SDavid Howells 			struct fs_parameter param = {
633882995cc6SDavid Howells 				.key	= key,
63390f89589aSAl Viro 				.type	= fs_value_is_flag,
634082995cc6SDavid Howells 			};
634182995cc6SDavid Howells 			char *value = strchr(key, '=');
634282995cc6SDavid Howells 			size_t v_len = 0;
634382995cc6SDavid Howells 
634482995cc6SDavid Howells 			if (value) {
634582995cc6SDavid Howells 				if (value == key)
634682995cc6SDavid Howells 					continue;
634782995cc6SDavid Howells 				*value++ = 0;
634882995cc6SDavid Howells 				v_len = strlen(value);
634982995cc6SDavid Howells 				param.string = kmemdup_nul(value, v_len,
635082995cc6SDavid Howells 							   GFP_KERNEL);
635182995cc6SDavid Howells 				if (!param.string)
635282995cc6SDavid Howells 					return -ENOMEM;
63530f89589aSAl Viro 				param.type = fs_value_is_string;
635482995cc6SDavid Howells 			}
635582995cc6SDavid Howells 			param.size = v_len;
635682995cc6SDavid Howells 
635782995cc6SDavid Howells 			ret = rbd_parse_param(&param, pctx);
635882995cc6SDavid Howells 			kfree(param.string);
635982995cc6SDavid Howells 			if (ret)
636082995cc6SDavid Howells 				break;
636182995cc6SDavid Howells 		}
636282995cc6SDavid Howells 	}
636382995cc6SDavid Howells 
636482995cc6SDavid Howells 	return ret;
636582995cc6SDavid Howells }
636682995cc6SDavid Howells 
6367ea3352f4SAlex Elder /*
6368859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
6369859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6370859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
6371859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
6372d22f76e7SAlex Elder  *
6373859c31dfSAlex Elder  * The information extracted from these options is recorded in
6374859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
6375859c31dfSAlex Elder  * structures:
6376859c31dfSAlex Elder  *  ceph_opts
6377859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
6378859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
6379859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
6380859c31dfSAlex Elder  *  rbd_opts
6381859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
6382859c31dfSAlex Elder  *	this function; caller must release with kfree().
6383859c31dfSAlex Elder  *  spec
6384859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
6385859c31dfSAlex Elder  *	initialized by this function based on parsed options.
6386859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
6387859c31dfSAlex Elder  *
6388859c31dfSAlex Elder  * The options passed take this form:
6389859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6390859c31dfSAlex Elder  * where:
6391859c31dfSAlex Elder  *  <mon_addrs>
6392859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
6393859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
6394859c31dfSAlex Elder  *      by a port number (separated by a colon).
6395859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6396859c31dfSAlex Elder  *  <options>
6397859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
6398859c31dfSAlex Elder  *  <pool_name>
6399859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
6400859c31dfSAlex Elder  *  <image_name>
6401859c31dfSAlex Elder  *      The name of the image in that pool to map.
6402859c31dfSAlex Elder  *  <snap_id>
6403859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
6404859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
6405859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
6406859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
6407a725f65eSAlex Elder  */
6408859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
6409dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
6410859c31dfSAlex Elder 				struct rbd_options **opts,
6411859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
6412a725f65eSAlex Elder {
6413e28fff26SAlex Elder 	size_t len;
6414859c31dfSAlex Elder 	char *options;
64150ddebc0cSAlex Elder 	const char *mon_addrs;
6416ecb4dc22SAlex Elder 	char *snap_name;
64170ddebc0cSAlex Elder 	size_t mon_addrs_size;
641882995cc6SDavid Howells 	struct rbd_parse_opts_ctx pctx = { 0 };
6419dc79b113SAlex Elder 	int ret;
6420e28fff26SAlex Elder 
6421e28fff26SAlex Elder 	/* The first four tokens are required */
6422e28fff26SAlex Elder 
64237ef3214aSAlex Elder 	len = next_token(&buf);
64244fb5d671SAlex Elder 	if (!len) {
64254fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
64264fb5d671SAlex Elder 		return -EINVAL;
64274fb5d671SAlex Elder 	}
64280ddebc0cSAlex Elder 	mon_addrs = buf;
642982995cc6SDavid Howells 	mon_addrs_size = len;
64307ef3214aSAlex Elder 	buf += len;
6431a725f65eSAlex Elder 
6432dc79b113SAlex Elder 	ret = -EINVAL;
6433f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
6434f28e565aSAlex Elder 	if (!options)
6435dc79b113SAlex Elder 		return -ENOMEM;
64364fb5d671SAlex Elder 	if (!*options) {
64374fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
64384fb5d671SAlex Elder 		goto out_err;
64394fb5d671SAlex Elder 	}
6440a725f65eSAlex Elder 
6441c300156bSIlya Dryomov 	pctx.spec = rbd_spec_alloc();
6442c300156bSIlya Dryomov 	if (!pctx.spec)
6443f28e565aSAlex Elder 		goto out_mem;
6444859c31dfSAlex Elder 
6445c300156bSIlya Dryomov 	pctx.spec->pool_name = dup_token(&buf, NULL);
6446c300156bSIlya Dryomov 	if (!pctx.spec->pool_name)
6447859c31dfSAlex Elder 		goto out_mem;
6448c300156bSIlya Dryomov 	if (!*pctx.spec->pool_name) {
64494fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
64504fb5d671SAlex Elder 		goto out_err;
64514fb5d671SAlex Elder 	}
6452e28fff26SAlex Elder 
6453c300156bSIlya Dryomov 	pctx.spec->image_name = dup_token(&buf, NULL);
6454c300156bSIlya Dryomov 	if (!pctx.spec->image_name)
6455f28e565aSAlex Elder 		goto out_mem;
6456c300156bSIlya Dryomov 	if (!*pctx.spec->image_name) {
64574fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
64584fb5d671SAlex Elder 		goto out_err;
64594fb5d671SAlex Elder 	}
6460e28fff26SAlex Elder 
6461f28e565aSAlex Elder 	/*
6462f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
6463f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
6464f28e565aSAlex Elder 	 */
64653feeb894SAlex Elder 	len = next_token(&buf);
6466820a5f3eSAlex Elder 	if (!len) {
64673feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
64683feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6469f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6470dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
6471f28e565aSAlex Elder 		goto out_err;
6472849b4260SAlex Elder 	}
6473ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6474ecb4dc22SAlex Elder 	if (!snap_name)
6475f28e565aSAlex Elder 		goto out_mem;
6476ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
6477c300156bSIlya Dryomov 	pctx.spec->snap_name = snap_name;
6478e5c35534SAlex Elder 
647982995cc6SDavid Howells 	pctx.copts = ceph_alloc_options();
648082995cc6SDavid Howells 	if (!pctx.copts)
648182995cc6SDavid Howells 		goto out_mem;
648282995cc6SDavid Howells 
64830ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
6484e28fff26SAlex Elder 
6485c300156bSIlya Dryomov 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6486c300156bSIlya Dryomov 	if (!pctx.opts)
64874e9afebaSAlex Elder 		goto out_mem;
64884e9afebaSAlex Elder 
6489c300156bSIlya Dryomov 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6490c300156bSIlya Dryomov 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
64910c93e1b7SIlya Dryomov 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6492c300156bSIlya Dryomov 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6493c300156bSIlya Dryomov 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6494c300156bSIlya Dryomov 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6495c300156bSIlya Dryomov 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6496d22f76e7SAlex Elder 
64972d7c86a8SVenky Shankar 	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL,
64982d7c86a8SVenky Shankar 				 ',');
649982995cc6SDavid Howells 	if (ret)
6500dc79b113SAlex Elder 		goto out_err;
6501859c31dfSAlex Elder 
650282995cc6SDavid Howells 	ret = rbd_parse_options(options, &pctx);
650382995cc6SDavid Howells 	if (ret)
650482995cc6SDavid Howells 		goto out_err;
650582995cc6SDavid Howells 
650682995cc6SDavid Howells 	*ceph_opts = pctx.copts;
6507c300156bSIlya Dryomov 	*opts = pctx.opts;
6508c300156bSIlya Dryomov 	*rbd_spec = pctx.spec;
650982995cc6SDavid Howells 	kfree(options);
6510dc79b113SAlex Elder 	return 0;
651182995cc6SDavid Howells 
6512f28e565aSAlex Elder out_mem:
6513dc79b113SAlex Elder 	ret = -ENOMEM;
6514d22f76e7SAlex Elder out_err:
6515c300156bSIlya Dryomov 	kfree(pctx.opts);
651682995cc6SDavid Howells 	ceph_destroy_options(pctx.copts);
6517c300156bSIlya Dryomov 	rbd_spec_put(pctx.spec);
6518f28e565aSAlex Elder 	kfree(options);
6519dc79b113SAlex Elder 	return ret;
6520a725f65eSAlex Elder }
6521a725f65eSAlex Elder 
6522e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6523e010dd0aSIlya Dryomov {
6524e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6525e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6526e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
6527e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
6528e010dd0aSIlya Dryomov }
6529e010dd0aSIlya Dryomov 
6530637cd060SIlya Dryomov /*
6531637cd060SIlya Dryomov  * If the wait is interrupted, an error is returned even if the lock
6532637cd060SIlya Dryomov  * was successfully acquired.  rbd_dev_image_unlock() will release it
6533637cd060SIlya Dryomov  * if needed.
6534637cd060SIlya Dryomov  */
6535e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6536e010dd0aSIlya Dryomov {
6537637cd060SIlya Dryomov 	long ret;
65382f18d466SIlya Dryomov 
6539e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6540637cd060SIlya Dryomov 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6541637cd060SIlya Dryomov 			return 0;
6542637cd060SIlya Dryomov 
6543e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6544e010dd0aSIlya Dryomov 		return -EINVAL;
6545e010dd0aSIlya Dryomov 	}
6546e010dd0aSIlya Dryomov 
65473fe69921SIlya Dryomov 	if (rbd_is_ro(rbd_dev))
6548637cd060SIlya Dryomov 		return 0;
6549637cd060SIlya Dryomov 
6550637cd060SIlya Dryomov 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6551637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6552637cd060SIlya Dryomov 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6553637cd060SIlya Dryomov 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
655425e6be21SDongsheng Yang 	if (ret > 0) {
6555637cd060SIlya Dryomov 		ret = rbd_dev->acquire_err;
655625e6be21SDongsheng Yang 	} else {
655725e6be21SDongsheng Yang 		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
655825e6be21SDongsheng Yang 		if (!ret)
6559637cd060SIlya Dryomov 			ret = -ETIMEDOUT;
656025e6be21SDongsheng Yang 	}
6561637cd060SIlya Dryomov 
65622f18d466SIlya Dryomov 	if (ret) {
6563637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6564637cd060SIlya Dryomov 		return ret;
6565e010dd0aSIlya Dryomov 	}
6566e010dd0aSIlya Dryomov 
6567637cd060SIlya Dryomov 	/*
6568637cd060SIlya Dryomov 	 * The lock may have been released by now, unless automatic lock
6569637cd060SIlya Dryomov 	 * transitions are disabled.
6570637cd060SIlya Dryomov 	 */
6571637cd060SIlya Dryomov 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6572e010dd0aSIlya Dryomov 	return 0;
6573e010dd0aSIlya Dryomov }
6574e010dd0aSIlya Dryomov 
657530ba1f02SIlya Dryomov /*
6576589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
6577589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
6578589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
6579589d30e0SAlex Elder  *
6580589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
6581589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
6582589d30e0SAlex Elder  * with the supplied name.
6583589d30e0SAlex Elder  *
6584589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
6585589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
6586589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
6587589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
6588589d30e0SAlex Elder  */
6589589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6590589d30e0SAlex Elder {
6591589d30e0SAlex Elder 	int ret;
6592589d30e0SAlex Elder 	size_t size;
6593ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
6594589d30e0SAlex Elder 	void *response;
6595c0fba368SAlex Elder 	char *image_id;
65962f82ee54SAlex Elder 
6597589d30e0SAlex Elder 	/*
65982c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
65992c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
6600c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
6601c0fba368SAlex Elder 	 * do still need to set the image format though.
66022c0d0a10SAlex Elder 	 */
6603c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
6604c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6605c0fba368SAlex Elder 
66062c0d0a10SAlex Elder 		return 0;
6607c0fba368SAlex Elder 	}
66082c0d0a10SAlex Elder 
66092c0d0a10SAlex Elder 	/*
6610589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
6611589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
6612589d30e0SAlex Elder 	 */
6613ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6614ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
6615ecd4a68aSIlya Dryomov 	if (ret)
6616ecd4a68aSIlya Dryomov 		return ret;
6617ecd4a68aSIlya Dryomov 
6618ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
6619589d30e0SAlex Elder 
6620589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
6621589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6622589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
6623589d30e0SAlex Elder 	if (!response) {
6624589d30e0SAlex Elder 		ret = -ENOMEM;
6625589d30e0SAlex Elder 		goto out;
6626589d30e0SAlex Elder 	}
6627589d30e0SAlex Elder 
6628c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
6629c0fba368SAlex Elder 
6630ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6631ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
66325435d206SDongsheng Yang 				  response, size);
663336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6634c0fba368SAlex Elder 	if (ret == -ENOENT) {
6635c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
6636c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
6637c0fba368SAlex Elder 		if (!ret)
6638c0fba368SAlex Elder 			rbd_dev->image_format = 1;
66397dd440c9SIlya Dryomov 	} else if (ret >= 0) {
6640c0fba368SAlex Elder 		void *p = response;
6641589d30e0SAlex Elder 
6642c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
6643979ed480SAlex Elder 						NULL, GFP_NOIO);
6644461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
6645c0fba368SAlex Elder 		if (!ret)
6646c0fba368SAlex Elder 			rbd_dev->image_format = 2;
6647c0fba368SAlex Elder 	}
6648c0fba368SAlex Elder 
6649c0fba368SAlex Elder 	if (!ret) {
6650c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
6651c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
6652589d30e0SAlex Elder 	}
6653589d30e0SAlex Elder out:
6654589d30e0SAlex Elder 	kfree(response);
6655ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
6656589d30e0SAlex Elder 	return ret;
6657589d30e0SAlex Elder }
6658589d30e0SAlex Elder 
66593abef3b3SAlex Elder /*
66603abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
66613abef3b3SAlex Elder  * call.
66623abef3b3SAlex Elder  */
66636fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
66646fd48b3bSAlex Elder {
66656fd48b3bSAlex Elder 	struct rbd_image_header	*header;
66666fd48b3bSAlex Elder 
6667a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
666822e8bd51SIlya Dryomov 	rbd_object_map_free(rbd_dev);
6669da5ef6beSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
66706fd48b3bSAlex Elder 
66716fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
66726fd48b3bSAlex Elder 
66736fd48b3bSAlex Elder 	header = &rbd_dev->header;
6674812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
66756fd48b3bSAlex Elder 	kfree(header->snap_sizes);
66766fd48b3bSAlex Elder 	kfree(header->snap_names);
66776fd48b3bSAlex Elder 	kfree(header->object_prefix);
66786fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
66796fd48b3bSAlex Elder }
66806fd48b3bSAlex Elder 
66812df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6682a30b71b9SAlex Elder {
6683a30b71b9SAlex Elder 	int ret;
6684a30b71b9SAlex Elder 
66851e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
668657385b51SAlex Elder 	if (ret)
66871e130199SAlex Elder 		goto out_err;
6688b1b5402aSAlex Elder 
66892df3fac7SAlex Elder 	/*
66902df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
66912df3fac7SAlex Elder 	 * features are assumed to never change.
66922df3fac7SAlex Elder 	 */
6693b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
669457385b51SAlex Elder 	if (ret)
6695b1b5402aSAlex Elder 		goto out_err;
669635d489f9SAlex Elder 
6697cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
6698cc070d59SAlex Elder 
6699cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6700cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
6701cc070d59SAlex Elder 		if (ret < 0)
6702cc070d59SAlex Elder 			goto out_err;
6703cc070d59SAlex Elder 	}
6704a30b71b9SAlex Elder 
67057e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
67067e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
67077e97332eSIlya Dryomov 		if (ret)
67087e97332eSIlya Dryomov 			goto out_err;
67097e97332eSIlya Dryomov 	}
67107e97332eSIlya Dryomov 
6711263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
671235152979SAlex Elder 	return 0;
6713263423f8SIlya Dryomov 
67149d475de5SAlex Elder out_err:
6715642a2537SAlex Elder 	rbd_dev->header.features = 0;
67161e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
67171e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
67189d475de5SAlex Elder 	return ret;
6719a30b71b9SAlex Elder }
6720a30b71b9SAlex Elder 
67216d69bb53SIlya Dryomov /*
67226d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
67236d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
67246d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
67256d69bb53SIlya Dryomov  */
67266d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
672783a06263SAlex Elder {
67282f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
6729124afba2SAlex Elder 	int ret;
6730124afba2SAlex Elder 
6731124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
6732124afba2SAlex Elder 		return 0;
6733124afba2SAlex Elder 
67346d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
67356d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
67366d69bb53SIlya Dryomov 		ret = -EINVAL;
67376d69bb53SIlya Dryomov 		goto out_err;
67386d69bb53SIlya Dryomov 	}
67396d69bb53SIlya Dryomov 
67401643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
67411f2c6651SIlya Dryomov 	if (!parent) {
6742124afba2SAlex Elder 		ret = -ENOMEM;
6743124afba2SAlex Elder 		goto out_err;
67441f2c6651SIlya Dryomov 	}
67451f2c6651SIlya Dryomov 
67461f2c6651SIlya Dryomov 	/*
67471f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
67481f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
67491f2c6651SIlya Dryomov 	 */
67501f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
67511f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
6752124afba2SAlex Elder 
675339258aa2SIlya Dryomov 	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
675439258aa2SIlya Dryomov 
67556d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
6756124afba2SAlex Elder 	if (ret < 0)
6757124afba2SAlex Elder 		goto out_err;
67581f2c6651SIlya Dryomov 
6759124afba2SAlex Elder 	rbd_dev->parent = parent;
6760a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
6761124afba2SAlex Elder 	return 0;
6762124afba2SAlex Elder 
67631f2c6651SIlya Dryomov out_err:
67641f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
67651f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
6766124afba2SAlex Elder 	return ret;
6767124afba2SAlex Elder }
6768124afba2SAlex Elder 
67695769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
67705769ed0cSIlya Dryomov {
67715769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
67725769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
67735769ed0cSIlya Dryomov 	if (!single_major)
67745769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
67755769ed0cSIlya Dryomov }
67765769ed0cSIlya Dryomov 
6777811c6688SIlya Dryomov /*
6778811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6779811c6688SIlya Dryomov  * upon return.
6780811c6688SIlya Dryomov  */
6781200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6782124afba2SAlex Elder {
678383a06263SAlex Elder 	int ret;
678483a06263SAlex Elder 
67859b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
678683a06263SAlex Elder 
67879b60e70bSIlya Dryomov 	if (!single_major) {
678883a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
678983a06263SAlex Elder 		if (ret < 0)
67901643dfa4SIlya Dryomov 			goto err_out_unlock;
67919b60e70bSIlya Dryomov 
679283a06263SAlex Elder 		rbd_dev->major = ret;
6793dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
67949b60e70bSIlya Dryomov 	} else {
67959b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
67969b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
67979b60e70bSIlya Dryomov 	}
679883a06263SAlex Elder 
679983a06263SAlex Elder 	/* Set up the blkdev mapping. */
680083a06263SAlex Elder 
680183a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
680283a06263SAlex Elder 	if (ret)
680383a06263SAlex Elder 		goto err_out_blkdev;
680483a06263SAlex Elder 
6805f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
680639258aa2SIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6807f35a4deeSAlex Elder 
68085769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6809f35a4deeSAlex Elder 	if (ret)
6810da5ef6beSIlya Dryomov 		goto err_out_disk;
681183a06263SAlex Elder 
6812129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6813811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
68145769ed0cSIlya Dryomov 	return 0;
68152f82ee54SAlex Elder 
681683a06263SAlex Elder err_out_disk:
681783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
681883a06263SAlex Elder err_out_blkdev:
68199b60e70bSIlya Dryomov 	if (!single_major)
682083a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6821811c6688SIlya Dryomov err_out_unlock:
6822811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
682383a06263SAlex Elder 	return ret;
682483a06263SAlex Elder }
682583a06263SAlex Elder 
6826332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6827332bb12dSAlex Elder {
6828332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6829c41d13a3SIlya Dryomov 	int ret;
6830332bb12dSAlex Elder 
6831332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6832332bb12dSAlex Elder 
6833332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6834332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6835c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6836332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6837332bb12dSAlex Elder 	else
6838c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6839332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6840c41d13a3SIlya Dryomov 
6841c41d13a3SIlya Dryomov 	return ret;
6842332bb12dSAlex Elder }
6843332bb12dSAlex Elder 
6844b9ef2b88SIlya Dryomov static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6845b9ef2b88SIlya Dryomov {
6846b9ef2b88SIlya Dryomov 	if (!is_snap) {
6847b9ef2b88SIlya Dryomov 		pr_info("image %s/%s%s%s does not exist\n",
6848b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_name,
6849b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_ns ?: "",
6850b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_ns ? "/" : "",
6851b9ef2b88SIlya Dryomov 			rbd_dev->spec->image_name);
6852b9ef2b88SIlya Dryomov 	} else {
6853b9ef2b88SIlya Dryomov 		pr_info("snap %s/%s%s%s@%s does not exist\n",
6854b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_name,
6855b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_ns ?: "",
6856b9ef2b88SIlya Dryomov 			rbd_dev->spec->pool_ns ? "/" : "",
6857b9ef2b88SIlya Dryomov 			rbd_dev->spec->image_name,
6858b9ef2b88SIlya Dryomov 			rbd_dev->spec->snap_name);
6859b9ef2b88SIlya Dryomov 	}
6860b9ef2b88SIlya Dryomov }
6861b9ef2b88SIlya Dryomov 
6862200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6863200a6a8bSAlex Elder {
6864b8776051SIlya Dryomov 	if (!rbd_is_ro(rbd_dev))
6865fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6866952c48b0SIlya Dryomov 
6867952c48b0SIlya Dryomov 	rbd_dev_unprobe(rbd_dev);
68686fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
68696fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
68706fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
6871200a6a8bSAlex Elder }
6872200a6a8bSAlex Elder 
6873a30b71b9SAlex Elder /*
6874a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
68751f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
68761f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
68771f3ef788SAlex Elder  * object to get detailed information about the rbd image.
68780e4e1de5SIlya Dryomov  *
68790e4e1de5SIlya Dryomov  * On success, returns with header_rwsem held for write if called
68800e4e1de5SIlya Dryomov  * with @depth == 0.
6881a30b71b9SAlex Elder  */
68826d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6883a30b71b9SAlex Elder {
6884b9ef2b88SIlya Dryomov 	bool need_watch = !rbd_is_ro(rbd_dev);
6885a30b71b9SAlex Elder 	int ret;
6886a30b71b9SAlex Elder 
6887a30b71b9SAlex Elder 	/*
68883abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
68893abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
68903abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
68913abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6892a30b71b9SAlex Elder 	 */
6893a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6894a30b71b9SAlex Elder 	if (ret)
6895c0fba368SAlex Elder 		return ret;
6896c0fba368SAlex Elder 
6897332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6898332bb12dSAlex Elder 	if (ret)
6899332bb12dSAlex Elder 		goto err_out_format;
6900332bb12dSAlex Elder 
6901b9ef2b88SIlya Dryomov 	if (need_watch) {
690299d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
69031fe48023SIlya Dryomov 		if (ret) {
69041fe48023SIlya Dryomov 			if (ret == -ENOENT)
6905b9ef2b88SIlya Dryomov 				rbd_print_dne(rbd_dev, false);
6906c41d13a3SIlya Dryomov 			goto err_out_format;
69071f3ef788SAlex Elder 		}
69081fe48023SIlya Dryomov 	}
6909b644de2bSAlex Elder 
69100e4e1de5SIlya Dryomov 	if (!depth)
69110e4e1de5SIlya Dryomov 		down_write(&rbd_dev->header_rwsem);
69120e4e1de5SIlya Dryomov 
6913a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
6914b9ef2b88SIlya Dryomov 	if (ret) {
6915b9ef2b88SIlya Dryomov 		if (ret == -ENOENT && !need_watch)
6916b9ef2b88SIlya Dryomov 			rbd_print_dne(rbd_dev, false);
6917952c48b0SIlya Dryomov 		goto err_out_probe;
6918b9ef2b88SIlya Dryomov 	}
6919a30b71b9SAlex Elder 
692004077599SIlya Dryomov 	/*
692104077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
692204077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
692304077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
692404077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
692504077599SIlya Dryomov 	 */
69266d69bb53SIlya Dryomov 	if (!depth)
692704077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
692804077599SIlya Dryomov 	else
692904077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
69301fe48023SIlya Dryomov 	if (ret) {
69311fe48023SIlya Dryomov 		if (ret == -ENOENT)
6932b9ef2b88SIlya Dryomov 			rbd_print_dne(rbd_dev, true);
693333dca39fSAlex Elder 		goto err_out_probe;
69341fe48023SIlya Dryomov 	}
69359bb81c9bSAlex Elder 
6936da5ef6beSIlya Dryomov 	ret = rbd_dev_mapping_set(rbd_dev);
6937da5ef6beSIlya Dryomov 	if (ret)
6938da5ef6beSIlya Dryomov 		goto err_out_probe;
6939da5ef6beSIlya Dryomov 
6940f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev) &&
694122e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
694222e8bd51SIlya Dryomov 		ret = rbd_object_map_load(rbd_dev);
694322e8bd51SIlya Dryomov 		if (ret)
694422e8bd51SIlya Dryomov 			goto err_out_probe;
694522e8bd51SIlya Dryomov 	}
694622e8bd51SIlya Dryomov 
6947e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6948e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6949e8f59b59SIlya Dryomov 		if (ret)
6950e8f59b59SIlya Dryomov 			goto err_out_probe;
6951e8f59b59SIlya Dryomov 	}
6952e8f59b59SIlya Dryomov 
69536d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
695430d60ba2SAlex Elder 	if (ret)
695530d60ba2SAlex Elder 		goto err_out_probe;
695683a06263SAlex Elder 
695730d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6958c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
695930d60ba2SAlex Elder 	return 0;
6960e8f59b59SIlya Dryomov 
69616fd48b3bSAlex Elder err_out_probe:
69620e4e1de5SIlya Dryomov 	if (!depth)
69630e4e1de5SIlya Dryomov 		up_write(&rbd_dev->header_rwsem);
6964b9ef2b88SIlya Dryomov 	if (need_watch)
696599d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6966952c48b0SIlya Dryomov 	rbd_dev_unprobe(rbd_dev);
6967332bb12dSAlex Elder err_out_format:
6968332bb12dSAlex Elder 	rbd_dev->image_format = 0;
69695655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
69705655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
69715655c4d9SAlex Elder 	return ret;
697283a06263SAlex Elder }
697383a06263SAlex Elder 
69749b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
697559c2be1eSYehuda Sadeh 			  const char *buf,
697659c2be1eSYehuda Sadeh 			  size_t count)
6977602adf40SYehuda Sadeh {
6978cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6979dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
69804e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6981859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
69829d3997fdSAlex Elder 	struct rbd_client *rbdc;
6983b51c83c2SIlya Dryomov 	int rc;
6984602adf40SYehuda Sadeh 
6985f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
6986f44d04e6SIlya Dryomov 		return -EPERM;
6987f44d04e6SIlya Dryomov 
6988602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6989602adf40SYehuda Sadeh 		return -ENODEV;
6990602adf40SYehuda Sadeh 
6991a725f65eSAlex Elder 	/* parse add command */
6992859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6993dc79b113SAlex Elder 	if (rc < 0)
6994dd5ac32dSIlya Dryomov 		goto out;
6995a725f65eSAlex Elder 
69969d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
69979d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
69989d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
69990ddebc0cSAlex Elder 		goto err_out_args;
70009d3997fdSAlex Elder 	}
7001602adf40SYehuda Sadeh 
7002602adf40SYehuda Sadeh 	/* pick the pool */
7003dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
70041fe48023SIlya Dryomov 	if (rc < 0) {
70051fe48023SIlya Dryomov 		if (rc == -ENOENT)
70061fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
7007602adf40SYehuda Sadeh 		goto err_out_client;
70081fe48023SIlya Dryomov 	}
7009859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
7010859c31dfSAlex Elder 
7011d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7012b51c83c2SIlya Dryomov 	if (!rbd_dev) {
7013b51c83c2SIlya Dryomov 		rc = -ENOMEM;
7014bd4ba655SAlex Elder 		goto err_out_client;
7015b51c83c2SIlya Dryomov 	}
7016c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
7017c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
7018d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
7019602adf40SYehuda Sadeh 
702039258aa2SIlya Dryomov 	/* if we are mapping a snapshot it will be a read-only mapping */
702139258aa2SIlya Dryomov 	if (rbd_dev->opts->read_only ||
702239258aa2SIlya Dryomov 	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
702339258aa2SIlya Dryomov 		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
702439258aa2SIlya Dryomov 
70250d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
70260d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
70270d6d1e9cSMike Christie 		rc = -ENOMEM;
70280d6d1e9cSMike Christie 		goto err_out_rbd_dev;
70290d6d1e9cSMike Christie 	}
70300d6d1e9cSMike Christie 
70316d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
70320e4e1de5SIlya Dryomov 	if (rc < 0)
7033c53d5893SAlex Elder 		goto err_out_rbd_dev;
703405fd6f6fSAlex Elder 
70350c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
70360c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
70370c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
70380c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
70390c93e1b7SIlya Dryomov 	}
70400c93e1b7SIlya Dryomov 
7041b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
7042fd22aef8SIlya Dryomov 	if (rc)
70438b679ec5SIlya Dryomov 		goto err_out_image_probe;
70443abef3b3SAlex Elder 
7045e010dd0aSIlya Dryomov 	rc = rbd_add_acquire_lock(rbd_dev);
7046e010dd0aSIlya Dryomov 	if (rc)
7047637cd060SIlya Dryomov 		goto err_out_image_lock;
7048b536f69aSAlex Elder 
70495769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
70505769ed0cSIlya Dryomov 
70515769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
70525769ed0cSIlya Dryomov 	if (rc)
7053e010dd0aSIlya Dryomov 		goto err_out_image_lock;
70545769ed0cSIlya Dryomov 
705527c97abcSLuis Chamberlain 	rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
705627c97abcSLuis Chamberlain 	if (rc)
705727c97abcSLuis Chamberlain 		goto err_out_cleanup_disk;
70585769ed0cSIlya Dryomov 
70595769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
70605769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
70615769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
70625769ed0cSIlya Dryomov 
70635769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
70645769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
70655769ed0cSIlya Dryomov 		rbd_dev->header.features);
7066dd5ac32dSIlya Dryomov 	rc = count;
7067dd5ac32dSIlya Dryomov out:
7068dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
7069dd5ac32dSIlya Dryomov 	return rc;
7070b536f69aSAlex Elder 
707127c97abcSLuis Chamberlain err_out_cleanup_disk:
707227c97abcSLuis Chamberlain 	rbd_free_disk(rbd_dev);
7073e010dd0aSIlya Dryomov err_out_image_lock:
7074e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
70755769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
70768b679ec5SIlya Dryomov err_out_image_probe:
70778b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
7078c53d5893SAlex Elder err_out_rbd_dev:
7079c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
7080bd4ba655SAlex Elder err_out_client:
70819d3997fdSAlex Elder 	rbd_put_client(rbdc);
70820ddebc0cSAlex Elder err_out_args:
7083859c31dfSAlex Elder 	rbd_spec_put(spec);
7084d147543dSIlya Dryomov 	kfree(rbd_opts);
7085dd5ac32dSIlya Dryomov 	goto out;
7086602adf40SYehuda Sadeh }
7087602adf40SYehuda Sadeh 
70887e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
70899b60e70bSIlya Dryomov {
70909b60e70bSIlya Dryomov 	if (single_major)
70919b60e70bSIlya Dryomov 		return -EINVAL;
70929b60e70bSIlya Dryomov 
70939b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
70949b60e70bSIlya Dryomov }
70959b60e70bSIlya Dryomov 
70967e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
70979b60e70bSIlya Dryomov 				      size_t count)
70989b60e70bSIlya Dryomov {
70999b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
71009b60e70bSIlya Dryomov }
71019b60e70bSIlya Dryomov 
710205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
710305a46afdSAlex Elder {
7104ad945fc1SAlex Elder 	while (rbd_dev->parent) {
710505a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
710605a46afdSAlex Elder 		struct rbd_device *second = first->parent;
710705a46afdSAlex Elder 		struct rbd_device *third;
710805a46afdSAlex Elder 
710905a46afdSAlex Elder 		/*
711005a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
711105a46afdSAlex Elder 		 * remove it.
711205a46afdSAlex Elder 		 */
711305a46afdSAlex Elder 		while (second && (third = second->parent)) {
711405a46afdSAlex Elder 			first = second;
711505a46afdSAlex Elder 			second = third;
711605a46afdSAlex Elder 		}
7117ad945fc1SAlex Elder 		rbd_assert(second);
71188ad42cd0SAlex Elder 		rbd_dev_image_release(second);
71198b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
7120ad945fc1SAlex Elder 		first->parent = NULL;
7121ad945fc1SAlex Elder 		first->parent_overlap = 0;
7122ad945fc1SAlex Elder 
7123ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
712405a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
712505a46afdSAlex Elder 		first->parent_spec = NULL;
712605a46afdSAlex Elder 	}
712705a46afdSAlex Elder }
712805a46afdSAlex Elder 
71299b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
7130602adf40SYehuda Sadeh 			     const char *buf,
7131602adf40SYehuda Sadeh 			     size_t count)
7132602adf40SYehuda Sadeh {
7133602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
7134751cc0e3SAlex Elder 	struct list_head *tmp;
7135751cc0e3SAlex Elder 	int dev_id;
71360276dca6SMike Christie 	char opt_buf[6];
71370276dca6SMike Christie 	bool force = false;
71380d8189e1SAlex Elder 	int ret;
7139602adf40SYehuda Sadeh 
7140f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
7141f44d04e6SIlya Dryomov 		return -EPERM;
7142f44d04e6SIlya Dryomov 
71430276dca6SMike Christie 	dev_id = -1;
71440276dca6SMike Christie 	opt_buf[0] = '\0';
71450276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
71460276dca6SMike Christie 	if (dev_id < 0) {
71470276dca6SMike Christie 		pr_err("dev_id out of range\n");
7148602adf40SYehuda Sadeh 		return -EINVAL;
71490276dca6SMike Christie 	}
71500276dca6SMike Christie 	if (opt_buf[0] != '\0') {
71510276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
71520276dca6SMike Christie 			force = true;
71530276dca6SMike Christie 		} else {
71540276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
71550276dca6SMike Christie 			return -EINVAL;
71560276dca6SMike Christie 		}
71570276dca6SMike Christie 	}
7158602adf40SYehuda Sadeh 
7159602adf40SYehuda Sadeh 	ret = -ENOENT;
7160751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
7161751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
7162751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7163751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
7164751cc0e3SAlex Elder 			ret = 0;
7165751cc0e3SAlex Elder 			break;
7166602adf40SYehuda Sadeh 		}
7167751cc0e3SAlex Elder 	}
7168751cc0e3SAlex Elder 	if (!ret) {
7169a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
71700276dca6SMike Christie 		if (rbd_dev->open_count && !force)
717142382b70SAlex Elder 			ret = -EBUSY;
717285f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
717385f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
717485f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
7175a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
7176751cc0e3SAlex Elder 	}
7177751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
717885f5a4d6SIlya Dryomov 	if (ret)
71791ba0f1e7SAlex Elder 		return ret;
7180751cc0e3SAlex Elder 
71810276dca6SMike Christie 	if (force) {
71820276dca6SMike Christie 		/*
71830276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
71840276dca6SMike Christie 		 * IO to complete/fail.
71850276dca6SMike Christie 		 */
71860276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
71877a5428dcSChristoph Hellwig 		blk_mark_disk_dead(rbd_dev->disk);
71880276dca6SMike Christie 	}
71890276dca6SMike Christie 
71905769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
71915769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
71925769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
71935769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
71945769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
7195fca27065SIlya Dryomov 
7196e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
7197dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
71988ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
71998b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
72001ba0f1e7SAlex Elder 	return count;
7201602adf40SYehuda Sadeh }
7202602adf40SYehuda Sadeh 
72037e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
72049b60e70bSIlya Dryomov {
72059b60e70bSIlya Dryomov 	if (single_major)
72069b60e70bSIlya Dryomov 		return -EINVAL;
72079b60e70bSIlya Dryomov 
72089b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
72099b60e70bSIlya Dryomov }
72109b60e70bSIlya Dryomov 
72117e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
72129b60e70bSIlya Dryomov 					 size_t count)
72139b60e70bSIlya Dryomov {
72149b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
72159b60e70bSIlya Dryomov }
72169b60e70bSIlya Dryomov 
7217602adf40SYehuda Sadeh /*
7218602adf40SYehuda Sadeh  * create control files in sysfs
7219dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
7220602adf40SYehuda Sadeh  */
72217d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
7222602adf40SYehuda Sadeh {
7223dfc5606dSYehuda Sadeh 	int ret;
7224602adf40SYehuda Sadeh 
7225fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
7226dfc5606dSYehuda Sadeh 	if (ret < 0)
7227dfc5606dSYehuda Sadeh 		return ret;
7228602adf40SYehuda Sadeh 
7229fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
7230fed4c143SAlex Elder 	if (ret < 0)
7231fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
7232602adf40SYehuda Sadeh 
7233602adf40SYehuda Sadeh 	return ret;
7234602adf40SYehuda Sadeh }
7235602adf40SYehuda Sadeh 
72367d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
7237602adf40SYehuda Sadeh {
7238dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
7239fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
7240602adf40SYehuda Sadeh }
7241602adf40SYehuda Sadeh 
72427d8dc534SChengguang Xu static int __init rbd_slab_init(void)
72431c2a9dfeSAlex Elder {
72441c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
724503d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7246868311b1SAlex Elder 	if (!rbd_img_request_cache)
7247868311b1SAlex Elder 		return -ENOMEM;
7248868311b1SAlex Elder 
7249868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
725003d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
725178c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
725278c2a44aSAlex Elder 		goto out_err;
725378c2a44aSAlex Elder 
72541c2a9dfeSAlex Elder 	return 0;
72551c2a9dfeSAlex Elder 
72566c696d85SIlya Dryomov out_err:
7257868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
7258868311b1SAlex Elder 	rbd_img_request_cache = NULL;
72591c2a9dfeSAlex Elder 	return -ENOMEM;
72601c2a9dfeSAlex Elder }
72611c2a9dfeSAlex Elder 
72621c2a9dfeSAlex Elder static void rbd_slab_exit(void)
72631c2a9dfeSAlex Elder {
7264868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
7265868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
7266868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
7267868311b1SAlex Elder 
72681c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
72691c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
72701c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
72711c2a9dfeSAlex Elder }
72721c2a9dfeSAlex Elder 
7273cc344fa1SAlex Elder static int __init rbd_init(void)
7274602adf40SYehuda Sadeh {
7275602adf40SYehuda Sadeh 	int rc;
7276602adf40SYehuda Sadeh 
72771e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
72781e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
72791e32d34cSAlex Elder 		return -EINVAL;
72801e32d34cSAlex Elder 	}
7281e1b4d96dSIlya Dryomov 
72821c2a9dfeSAlex Elder 	rc = rbd_slab_init();
7283602adf40SYehuda Sadeh 	if (rc)
7284602adf40SYehuda Sadeh 		return rc;
7285e1b4d96dSIlya Dryomov 
7286f5ee37bdSIlya Dryomov 	/*
7287f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
7288f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
7289f5ee37bdSIlya Dryomov 	 */
7290f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7291f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
7292f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
7293f5ee37bdSIlya Dryomov 		goto err_out_slab;
7294f5ee37bdSIlya Dryomov 	}
7295f5ee37bdSIlya Dryomov 
72969b60e70bSIlya Dryomov 	if (single_major) {
72979b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
72989b60e70bSIlya Dryomov 		if (rbd_major < 0) {
72999b60e70bSIlya Dryomov 			rc = rbd_major;
7300f5ee37bdSIlya Dryomov 			goto err_out_wq;
73019b60e70bSIlya Dryomov 		}
73029b60e70bSIlya Dryomov 	}
73039b60e70bSIlya Dryomov 
73041c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
73051c2a9dfeSAlex Elder 	if (rc)
73069b60e70bSIlya Dryomov 		goto err_out_blkdev;
73071c2a9dfeSAlex Elder 
73089b60e70bSIlya Dryomov 	if (single_major)
73099b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
73109b60e70bSIlya Dryomov 	else
7311e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
73129b60e70bSIlya Dryomov 
7313e1b4d96dSIlya Dryomov 	return 0;
7314e1b4d96dSIlya Dryomov 
73159b60e70bSIlya Dryomov err_out_blkdev:
73169b60e70bSIlya Dryomov 	if (single_major)
73179b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7318f5ee37bdSIlya Dryomov err_out_wq:
7319f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
7320e1b4d96dSIlya Dryomov err_out_slab:
7321e1b4d96dSIlya Dryomov 	rbd_slab_exit();
73221c2a9dfeSAlex Elder 	return rc;
7323602adf40SYehuda Sadeh }
7324602adf40SYehuda Sadeh 
7325cc344fa1SAlex Elder static void __exit rbd_exit(void)
7326602adf40SYehuda Sadeh {
7327ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
7328602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
73299b60e70bSIlya Dryomov 	if (single_major)
73309b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7331f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
73321c2a9dfeSAlex Elder 	rbd_slab_exit();
7333602adf40SYehuda Sadeh }
7334602adf40SYehuda Sadeh 
/* Register the module's load and unload entry points. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* The following authorship is retained from the original osdblk.c. */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
7345602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
7346