xref: /openbmc/linux/drivers/block/rbd.c (revision f3c0e459)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
3543df3d35SIlya Dryomov #include <linux/ceph/striper.h>
36602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3759c2be1eSYehuda Sadeh #include <linux/parser.h>
3830d1cff8SAlex Elder #include <linux/bsearch.h>
39602adf40SYehuda Sadeh 
40602adf40SYehuda Sadeh #include <linux/kernel.h>
41602adf40SYehuda Sadeh #include <linux/device.h>
42602adf40SYehuda Sadeh #include <linux/module.h>
437ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
44602adf40SYehuda Sadeh #include <linux/fs.h>
45602adf40SYehuda Sadeh #include <linux/blkdev.h>
461c2a9dfeSAlex Elder #include <linux/slab.h>
47f8a22fc2SIlya Dryomov #include <linux/idr.h>
48bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
49602adf40SYehuda Sadeh 
50602adf40SYehuda Sadeh #include "rbd_types.h"
51602adf40SYehuda Sadeh 
52aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
/* Driver name; also used as the message prefix in rbd_warn() */
87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
88602adf40SYehuda Sadeh 
/*
 * RBD_SINGLE_MAJOR_PART_SHIFT is the shift applied by
 * rbd_dev_id_to_minor() and minor_to_rbd_dev_id() when converting
 * between a device id and a minor number.
 */
897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR		256
907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT	4
91602adf40SYehuda Sadeh 
/* NOTE(review): presumably bounds the depth passed to rbd_dev_image_probe() -- confirm */
926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN	16
936d69bb53SIlya Dryomov 
/*
 * RBD_MAX_SNAP_NAME_LEN is NAME_MAX less the length of
 * RBD_SNAP_DEV_NAME_PREFIX (sizeof minus its terminating NUL).
 */
94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
96d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97d4b125e9SAlex Elder 
9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
99602adf40SYehuda Sadeh 
100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
101602adf40SYehuda Sadeh 
1029682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1039682fc6dSAlex Elder 
1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1059e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1079e15b77dSAlex Elder 
1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
109589d30e0SAlex Elder 
110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
/* 1000 milliseconds, converted to jiffies at each use */
11199d16943SIlya Dryomov #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
11299d16943SIlya Dryomov 
113d889140cSAlex Elder /* Feature bits */
114d889140cSAlex Elder 
/*
 * One bit per image feature, as 64-bit masks.  Note that no macro is
 * defined here for bit 6 (the list goes from 1ULL<<5 to 1ULL<<7).
 */
1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING		(1ULL<<0)
1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
11822e8bd51SIlya Dryomov #define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
11922e8bd51SIlya Dryomov #define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
120b9f6d447SIlya Dryomov #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
1218767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
122e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
1238767b293SIlya Dryomov 
/* Union of every feature bit defined above */
124ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
125ed95b21aSIlya Dryomov 				 RBD_FEATURE_STRIPINGV2 |	\
1267e97332eSIlya Dryomov 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
12722e8bd51SIlya Dryomov 				 RBD_FEATURE_OBJECT_MAP |	\
12822e8bd51SIlya Dryomov 				 RBD_FEATURE_FAST_DIFF |	\
129b9f6d447SIlya Dryomov 				 RBD_FEATURE_DEEP_FLATTEN |	\
130e573427aSIlya Dryomov 				 RBD_FEATURE_DATA_POOL |	\
131e573427aSIlya Dryomov 				 RBD_FEATURE_OPERATIONS)
132d889140cSAlex Elder 
133d889140cSAlex Elder /* Features supported by this (client software) implementation. */
134d889140cSAlex Elder 
/* This mask is what supported_features_show() prints, in hexadecimal */
135770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136d889140cSAlex Elder 
13781a89793SAlex Elder /*
13881a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
13981a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
14081a89793SAlex Elder  */
/* Size in bytes of the rbd_device.name buffer */
141602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh  */
/* Embedded in struct rbd_device as the "header" member. */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
148849b4260SAlex Elder 	char *object_prefix;
	/* NOTE(review): presumably log2 of the object size -- confirm */
149602adf40SYehuda Sadeh 	__u8 obj_order;
150f35a4deeSAlex Elder 	u64 stripe_unit;
151f35a4deeSAlex Elder 	u64 stripe_count;
	/* Signed, unlike rbd_spec.pool_id; NOTE(review): sentinel value not shown here */
1527e97332eSIlya Dryomov 	s64 data_pool_id;
153f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
154602adf40SYehuda Sadeh 
155f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder 	u64 image_size;
157f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
159f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh 
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder  * An rbd image specification.
1640d7dbfceSAlex Elder  *
1650d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
170c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
172c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder  * is shared between the parent and child).
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder  *
184c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder  * could be a null pointer).
1860d7dbfceSAlex Elder  */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder 	u64		pool_id;
189ecb4dc22SAlex Elder 	const char	*pool_name;
190b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1910d7dbfceSAlex Elder 
192ecb4dc22SAlex Elder 	const char	*image_id;
	/* May be NULL; rbd_warn() falls back to image_id in that case */
193ecb4dc22SAlex Elder 	const char	*image_name;
1940d7dbfceSAlex Elder 
	/* rbd_is_snap() compares this against CEPH_NOSNAP */
1950d7dbfceSAlex Elder 	u64		snap_id;
196ecb4dc22SAlex Elder 	const char	*snap_name;
1970d7dbfceSAlex Elder 
	/* Reference count; the spec can be shared as described above */
1980d7dbfceSAlex Elder 	struct kref	kref;
1990d7dbfceSAlex Elder };
2000d7dbfceSAlex Elder 
201602adf40SYehuda Sadeh /*
202f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
203602adf40SYehuda Sadeh  */
204602adf40SYehuda Sadeh struct rbd_client {
	/* Underlying ceph client handle */
205602adf40SYehuda Sadeh 	struct ceph_client	*client;
	/* Reference count, since several devices may share this client */
206602adf40SYehuda Sadeh 	struct kref		kref;
	/* NOTE(review): presumably linkage on rbd_client_list -- confirm */
207602adf40SYehuda Sadeh 	struct list_head	node;
208602adf40SYehuda Sadeh };
209602adf40SYehuda Sadeh 
/*
 * Tracks a group of outstanding sub-operations and their combined
 * outcome.  pending_result_dec() latches the first nonzero result
 * into ->result and decrements ->num_pending (which it asserts is
 * positive beforehand).
 */
2100192ce2eSIlya Dryomov struct pending_result {
2110192ce2eSIlya Dryomov 	int			result;		/* first nonzero result */
2120192ce2eSIlya Dryomov 	int			num_pending;
2130192ce2eSIlya Dryomov };
2140192ce2eSIlya Dryomov 
215bf0d5f50SAlex Elder struct rbd_img_request;
216bf0d5f50SAlex Elder 
/*
 * How the data for a request is described; stored in
 * rbd_img_request.data_type.  Values start at 1, so a zeroed field
 * does not match any enumerator.
 */
2179969ebc5SAlex Elder enum obj_request_type {
218a1fbb5e7SIlya Dryomov 	OBJ_REQUEST_NODATA = 1,
2195359a17dSIlya Dryomov 	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
2207e07efb1SIlya Dryomov 	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
221afb97888SIlya Dryomov 	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
2229969ebc5SAlex Elder };
223bf0d5f50SAlex Elder 
/*
 * Kind of operation an image request performs; stored in
 * rbd_img_request.op_type.  Values start at 1, so a zeroed field
 * does not match any enumerator.
 */
2246d2940c8SGuangliang Zhao enum obj_operation_type {
225a1fbb5e7SIlya Dryomov 	OBJ_OP_READ = 1,
2266d2940c8SGuangliang Zhao 	OBJ_OP_WRITE,
22790e98c52SGuangliang Zhao 	OBJ_OP_DISCARD,
2286484cbe9SIlya Dryomov 	OBJ_OP_ZEROOUT,
2296d2940c8SGuangliang Zhao };
2306d2940c8SGuangliang Zhao 
/* Bit flags for rbd_obj_request.flags; each occupies its own bit */
2310ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_DELETION			(1U << 0)
2320ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
233793333a3SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
23422e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
23522e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
2360ad5d953SIlya Dryomov 
/*
 * State of a read object request; stored in
 * rbd_obj_request.read_state.  Values start at 1.
 */
237a9b67e69SIlya Dryomov enum rbd_obj_read_state {
23885b5e6d1SIlya Dryomov 	RBD_OBJ_READ_START = 1,
23985b5e6d1SIlya Dryomov 	RBD_OBJ_READ_OBJECT,
240a9b67e69SIlya Dryomov 	RBD_OBJ_READ_PARENT,
241a9b67e69SIlya Dryomov };
242a9b67e69SIlya Dryomov 
2433da691bfSIlya Dryomov /*
2443da691bfSIlya Dryomov  * Writes go through the following state machine to deal with
2453da691bfSIlya Dryomov  * layering:
2463da691bfSIlya Dryomov  *
24789a59c1cSIlya Dryomov  *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
24889a59c1cSIlya Dryomov  *            .                 |                                    .
24989a59c1cSIlya Dryomov  *            .                 v                                    .
25089a59c1cSIlya Dryomov  *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
25189a59c1cSIlya Dryomov  *            .                 |                    .               .
25289a59c1cSIlya Dryomov  *            .                 v                    v (deep-copyup  .
25389a59c1cSIlya Dryomov  *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
25489a59c1cSIlya Dryomov  * flattened) v                 |                    .               .
25589a59c1cSIlya Dryomov  *            .                 v                    .               .
25689a59c1cSIlya Dryomov  *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
25789a59c1cSIlya Dryomov  *                              |                        not needed) v
25889a59c1cSIlya Dryomov  *                              v                                    .
25989a59c1cSIlya Dryomov  *                            done . . . . . . . . . . . . . . . . . .
2603da691bfSIlya Dryomov  *                              ^
2613da691bfSIlya Dryomov  *                              |
2623da691bfSIlya Dryomov  *                     RBD_OBJ_WRITE_FLAT
2633da691bfSIlya Dryomov  *
2643da691bfSIlya Dryomov  * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
26589a59c1cSIlya Dryomov  * assert_exists guard is needed or not (in some cases it's not needed
26689a59c1cSIlya Dryomov  * even if there is a parent).
2673da691bfSIlya Dryomov  */
/*
 * NOTE(review): the diagram and text above name states
 * (RBD_OBJ_WRITE_GUARD, RBD_OBJ_WRITE_READ_FROM_PARENT,
 * RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, RBD_OBJ_WRITE_COPYUP_OPS,
 * RBD_OBJ_WRITE_FLAT) that are not members of the enum below, so it
 * appears to describe an earlier version of the write state machine
 * -- confirm and refresh.
 *
 * The enum is stored in rbd_obj_request.write_state; values start
 * at 1.  NOTE(review): __RBD_OBJ_WRITE_COPYUP directly precedes
 * RBD_OBJ_WRITE_COPYUP; its exact meaning is not visible here.
 */
2683da691bfSIlya Dryomov enum rbd_obj_write_state {
26985b5e6d1SIlya Dryomov 	RBD_OBJ_WRITE_START = 1,
27022e8bd51SIlya Dryomov 	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
27185b5e6d1SIlya Dryomov 	RBD_OBJ_WRITE_OBJECT,
272793333a3SIlya Dryomov 	__RBD_OBJ_WRITE_COPYUP,
273793333a3SIlya Dryomov 	RBD_OBJ_WRITE_COPYUP,
27422e8bd51SIlya Dryomov 	RBD_OBJ_WRITE_POST_OBJECT_MAP,
275793333a3SIlya Dryomov };
276793333a3SIlya Dryomov 
/*
 * State of the copyup sub-machine of an object request; stored in
 * rbd_obj_request.copyup_state.  Values start at 1.
 * NOTE(review): each __-prefixed state directly precedes the state of
 * the same name; presumably it marks the step as still being issued
 * -- confirm against the state machine code.
 */
277793333a3SIlya Dryomov enum rbd_obj_copyup_state {
278793333a3SIlya Dryomov 	RBD_OBJ_COPYUP_START = 1,
279793333a3SIlya Dryomov 	RBD_OBJ_COPYUP_READ_PARENT,
28022e8bd51SIlya Dryomov 	__RBD_OBJ_COPYUP_OBJECT_MAPS,
28122e8bd51SIlya Dryomov 	RBD_OBJ_COPYUP_OBJECT_MAPS,
282793333a3SIlya Dryomov 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
283793333a3SIlya Dryomov 	RBD_OBJ_COPYUP_WRITE_OBJECT,
284926f9b3fSAlex Elder };
285926f9b3fSAlex Elder 
/*
 * One object-level request belonging to an image request.  Linked on
 * rbd_img_request.object_extents through ex.oe_item (see
 * for_each_obj_request()).
 */
286bf0d5f50SAlex Elder struct rbd_obj_request {
28743df3d35SIlya Dryomov 	struct ceph_object_extent ex;
2880ad5d953SIlya Dryomov 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	/* Only one of the two states is meaningful, depending on the operation */
289c5b5ef6cSAlex Elder 	union {
290a9b67e69SIlya Dryomov 		enum rbd_obj_read_state	 read_state;	/* for reads */
2913da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2923da691bfSIlya Dryomov 	};
293bf0d5f50SAlex Elder 
	/* Owning image request */
294bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
	/* Array of num_img_extents entries */
29586bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
29686bd7998SIlya Dryomov 	u32			num_img_extents;
297bf0d5f50SAlex Elder 
	/*
	 * Data position, as a bio iterator or a bio_vec iterator.
	 * NOTE(review): presumably selected by the image request's
	 * data_type -- confirm.
	 */
298788e2df3SAlex Elder 	union {
2995359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
300788e2df3SAlex Elder 		struct {
3017e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
3027e07efb1SIlya Dryomov 			u32			bvec_count;
303afb97888SIlya Dryomov 			u32			bvec_idx;
304788e2df3SAlex Elder 		};
305788e2df3SAlex Elder 	};
306793333a3SIlya Dryomov 
307793333a3SIlya Dryomov 	enum rbd_obj_copyup_state copyup_state;
	/* Array of copyup_bvec_count entries */
3087e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
3097e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
310bf0d5f50SAlex Elder 
311bcbab1dbSIlya Dryomov 	struct list_head	osd_reqs;	/* w/ r_private_item */
312bf0d5f50SAlex Elder 
	/* NOTE(review): presumably serializes state transitions -- confirm */
31385b5e6d1SIlya Dryomov 	struct mutex		state_mutex;
314793333a3SIlya Dryomov 	struct pending_result	pending;
315bf0d5f50SAlex Elder 	struct kref		kref;
316bf0d5f50SAlex Elder };
317bf0d5f50SAlex Elder 
/*
 * Enumerators start at 0.  NOTE(review): presumably used as bit
 * numbers within rbd_img_request.flags (an unsigned long) -- confirm.
 */
3180c425248SAlex Elder enum img_req_flags {
3199849e986SAlex Elder 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
320d0b2e944SAlex Elder 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
3210c425248SAlex Elder };
3220c425248SAlex Elder 
/*
 * State of an image request; stored in rbd_img_request.state.
 * Values start at 1.  NOTE(review): __RBD_IMG_OBJECT_REQUESTS directly
 * precedes RBD_IMG_OBJECT_REQUESTS; its exact meaning is not visible
 * here -- confirm against the state machine code.
 */
3230192ce2eSIlya Dryomov enum rbd_img_state {
3240192ce2eSIlya Dryomov 	RBD_IMG_START = 1,
325637cd060SIlya Dryomov 	RBD_IMG_EXCLUSIVE_LOCK,
3260192ce2eSIlya Dryomov 	__RBD_IMG_OBJECT_REQUESTS,
3270192ce2eSIlya Dryomov 	RBD_IMG_OBJECT_REQUESTS,
3280192ce2eSIlya Dryomov };
3290192ce2eSIlya Dryomov 
/*
 * An image-level request.  Its object requests hang off
 * ->object_extents and can be walked with for_each_obj_request().
 */
330bf0d5f50SAlex Elder struct rbd_img_request {
331bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
3329bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
333ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
	/* NOTE(review): presumably holds enum img_req_flags bits -- confirm */
3340c425248SAlex Elder 	unsigned long		flags;
3350192ce2eSIlya Dryomov 	enum rbd_img_state	state;
336bf0d5f50SAlex Elder 	union {
337bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3389849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3399849e986SAlex Elder 	};
	/* Initiator of this request: a block request or an object request */
3409849e986SAlex Elder 	union {
3419849e986SAlex Elder 		struct request		*rq;		/* block request */
3429849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
343bf0d5f50SAlex Elder 	};
344bf0d5f50SAlex Elder 
	/*
	 * NOTE(review): presumably linkage on rbd_device.acquiring_list
	 * or rbd_device.running_list -- confirm.
	 */
345e1fddc8fSIlya Dryomov 	struct list_head	lock_item;
34643df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
347bf0d5f50SAlex Elder 
3480192ce2eSIlya Dryomov 	struct mutex		state_mutex;
3490192ce2eSIlya Dryomov 	struct pending_result	pending;
3500192ce2eSIlya Dryomov 	struct work_struct	work;
3510192ce2eSIlya Dryomov 	int			work_result;
352bf0d5f50SAlex Elder 	struct kref		kref;
353bf0d5f50SAlex Elder };
354bf0d5f50SAlex Elder 
/*
 * Iterate over every object request (oreq) linked on the image
 * request's (ireq) object_extents list via ex.oe_item.
 */
355bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \
35643df3d35SIlya Dryomov 	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
/*
 * Same traversal built on list_for_each_entry_safe(); n is the extra
 * cursor that macro requires.
 */
357bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \
35843df3d35SIlya Dryomov 	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
359bf0d5f50SAlex Elder 
/*
 * Watch state of a device; stored in rbd_device.watch_state.
 * UNREGISTERED is 0, so it is the value of a zeroed field.
 */
36099d16943SIlya Dryomov enum rbd_watch_state {
36199d16943SIlya Dryomov 	RBD_WATCH_STATE_UNREGISTERED,
36299d16943SIlya Dryomov 	RBD_WATCH_STATE_REGISTERED,
36399d16943SIlya Dryomov 	RBD_WATCH_STATE_ERROR,
36499d16943SIlya Dryomov };
36599d16943SIlya Dryomov 
/*
 * Lock state of a device; stored in rbd_device.lock_state.
 * UNLOCKED is 0, so it is the value of a zeroed field.
 * __rbd_is_lock_owner() reports ownership for both LOCKED and
 * RELEASING.
 */
366ed95b21aSIlya Dryomov enum rbd_lock_state {
367ed95b21aSIlya Dryomov 	RBD_LOCK_STATE_UNLOCKED,
368ed95b21aSIlya Dryomov 	RBD_LOCK_STATE_LOCKED,
369ed95b21aSIlya Dryomov 	RBD_LOCK_STATE_RELEASING,
370ed95b21aSIlya Dryomov };
371ed95b21aSIlya Dryomov 
372ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
/* Used as the type of rbd_device.owner_cid. */
373ed95b21aSIlya Dryomov struct rbd_client_id {
374ed95b21aSIlya Dryomov 	u64 gid;
375ed95b21aSIlya Dryomov 	u64 handle;
376ed95b21aSIlya Dryomov };
377ed95b21aSIlya Dryomov 
/*
 * Embedded in struct rbd_device as the "mapping" member.
 * NOTE(review): presumably the size (in bytes) and feature mask of
 * what is currently mapped -- confirm units against the users.
 */
378f84344f3SAlex Elder struct rbd_mapping {
37999c1f08fSAlex Elder 	u64                     size;
38034b13184SAlex Elder 	u64                     features;
381f84344f3SAlex Elder };
382f84344f3SAlex Elder 
383602adf40SYehuda Sadeh /*
384602adf40SYehuda Sadeh  * a single device
385602adf40SYehuda Sadeh  */
386602adf40SYehuda Sadeh struct rbd_device {
387de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
388602adf40SYehuda Sadeh 
389602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
390dd82fff1SIlya Dryomov 	int			minor;
391602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
392602adf40SYehuda Sadeh 
393a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
394602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
395602adf40SYehuda Sadeh 
396602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
397602adf40SYehuda Sadeh 
398b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
399602adf40SYehuda Sadeh 
400602adf40SYehuda Sadeh 	struct rbd_image_header	header;
	/* Bit numbers are given by enum rbd_dev_flags */
401b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
	/* Identity of the mapped image; rbd_is_snap() reads spec->snap_id */
4020d7dbfceSAlex Elder 	struct rbd_spec		*spec;
403d147543dSIlya Dryomov 	struct rbd_options	*opts;
4040d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
405602adf40SYehuda Sadeh 
406c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
407922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
408971f839aSAlex Elder 
4091643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
4100903e875SAlex Elder 
	/* Watch bookkeeping (see enum rbd_watch_state) */
41199d16943SIlya Dryomov 	struct mutex		watch_mutex;
41299d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
413922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
41499d16943SIlya Dryomov 	u64			watch_cookie;
41599d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
41659c2be1eSYehuda Sadeh 
	/*
	 * Lock ownership bookkeeping (see enum rbd_lock_state).
	 * __rbd_is_lock_owner() requires lock_rwsem to be held when
	 * reading lock_state.
	 */
417ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
418ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
419cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
420ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
421ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
422ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
423ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
424ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
	/* NOTE(review): presumably protects the two lists below -- confirm */
425e1fddc8fSIlya Dryomov 	spinlock_t		lock_lists_lock;
426637cd060SIlya Dryomov 	struct list_head	acquiring_list;
427e1fddc8fSIlya Dryomov 	struct list_head	running_list;
428637cd060SIlya Dryomov 	struct completion	acquire_wait;
429637cd060SIlya Dryomov 	int			acquire_err;
430e1fddc8fSIlya Dryomov 	struct completion	releasing_wait;
431ed95b21aSIlya Dryomov 
	/* NOTE(review): presumably protects the object_map* fields -- confirm */
43222e8bd51SIlya Dryomov 	spinlock_t		object_map_lock;
43322e8bd51SIlya Dryomov 	u8			*object_map;
43422e8bd51SIlya Dryomov 	u64			object_map_size;	/* in objects */
43522e8bd51SIlya Dryomov 	u64			object_map_flags;
436602adf40SYehuda Sadeh 
4371643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
438602adf40SYehuda Sadeh 
	/* Layering: parent identity, overlap and reference count */
43986b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
44086b00e0dSAlex Elder 	u64			parent_overlap;
	/*
	 * NOTE(review): presumably manipulated through
	 * atomic_inc_return_safe() / atomic_dec_return_safe() -- confirm.
	 */
441a2acd00eSAlex Elder 	atomic_t		parent_ref;
4422f82ee54SAlex Elder 	struct rbd_device	*parent;
44386b00e0dSAlex Elder 
4447ad18afaSChristoph Hellwig 	/* Block layer tags. */
4457ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4467ad18afaSChristoph Hellwig 
447c666601aSJosh Durgin 	/* protects updating the header */
448c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
449f84344f3SAlex Elder 
450f84344f3SAlex Elder 	struct rbd_mapping	mapping;
451602adf40SYehuda Sadeh 
	/* NOTE(review): presumably linkage on rbd_dev_list -- confirm */
452602adf40SYehuda Sadeh 	struct list_head	node;
453dfc5606dSYehuda Sadeh 
454dfc5606dSYehuda Sadeh 	/* sysfs related */
455dfc5606dSYehuda Sadeh 	struct device		dev;
456b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
457dfc5606dSYehuda Sadeh };
458dfc5606dSYehuda Sadeh 
459b82d167bSAlex Elder /*
46087c0fdedSIlya Dryomov  * Flag bits for rbd_dev->flags:
46187c0fdedSIlya Dryomov  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
46287c0fdedSIlya Dryomov  *   by rbd_dev->lock
463b82d167bSAlex Elder  */
/* The enumerators are bit numbers (0, 1), not masks. */
4646d292906SAlex Elder enum rbd_dev_flags {
4656d292906SAlex Elder 	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
466b82d167bSAlex Elder 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
4676d292906SAlex Elder };
4686d292906SAlex Elder 
469cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
470e124a82fSAlex Elder 
/* NOTE(review): each list below is presumably guarded by the spinlock declared right after it -- confirm */
471602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
472e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
473e124a82fSAlex Elder 
474602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
475432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
476602adf40SYehuda Sadeh 
47778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
47878c2a44aSAlex Elder 
4791c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
480868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4811c2a9dfeSAlex Elder 
/* NOTE(review): presumably the shared major number used in single-major mode -- confirm */
4829b60e70bSIlya Dryomov static int rbd_major;
/* ID allocator; NOTE(review): presumably the source of rbd_device.dev_id -- confirm */
483f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
484f8a22fc2SIlya Dryomov 
/* Driver-wide workqueue */
485f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
486f5ee37bdSIlya Dryomov 
/* Static snap context with only its reference count initialized (to 1) */
48789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
48889a59c1cSIlya Dryomov 	.nref = REFCOUNT_INIT(1),
48989a59c1cSIlya Dryomov };
49089a59c1cSIlya Dryomov 
4919b60e70bSIlya Dryomov /*
4923cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4939b60e70bSIlya Dryomov  */
/* Module parameter, mode 0444 (read-only in sysfs); defaults to true */
4943cfa3b16SIlya Dryomov static bool single_major = true;
4955657a819SJoe Perches module_param(single_major, bool, 0444);
4963cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4979b60e70bSIlya Dryomov 
4987e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
4997e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf,
500f0f8cef5SAlex Elder 			    size_t count);
5017e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
502f0f8cef5SAlex Elder 				      size_t count);
5037e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
5049b60e70bSIlya Dryomov 					 size_t count);
5056d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
506f0f8cef5SAlex Elder 
5079b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
5089b60e70bSIlya Dryomov {
5097e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
5109b60e70bSIlya Dryomov }
5119b60e70bSIlya Dryomov 
5129b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
5139b60e70bSIlya Dryomov {
5147e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
5159b60e70bSIlya Dryomov }
5169b60e70bSIlya Dryomov 
517f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev)
518f3c0e459SIlya Dryomov {
519f3c0e459SIlya Dryomov 	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
520f3c0e459SIlya Dryomov }
521f3c0e459SIlya Dryomov 
522ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
523ed95b21aSIlya Dryomov {
524637cd060SIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
525637cd060SIlya Dryomov 
526ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
527ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
528ed95b21aSIlya Dryomov }
529ed95b21aSIlya Dryomov 
530ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
531ed95b21aSIlya Dryomov {
532ed95b21aSIlya Dryomov 	bool is_lock_owner;
533ed95b21aSIlya Dryomov 
534ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
535ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
536ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
537ed95b21aSIlya Dryomov 	return is_lock_owner;
538ed95b21aSIlya Dryomov }
539ed95b21aSIlya Dryomov 
5407e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf)
5418767b293SIlya Dryomov {
5428767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5438767b293SIlya Dryomov }
5448767b293SIlya Dryomov 
/*
 * Bus-level attributes.  add, remove and their *_single_major variants
 * are declared write-only; supported_features is declared read-only.
 */
5457e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add);
5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove);
5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major);
5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major);
5497e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features);
550b15a21ddSGreg Kroah-Hartman 
/*
 * NULL-terminated attribute list for rbd_bus_group.  Visibility of
 * the *_single_major entries is decided by rbd_bus_is_visible().
 */
551b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
552b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
553b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5549b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5559b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5568767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
557b15a21ddSGreg Kroah-Hartman 	NULL,
558f0f8cef5SAlex Elder };
55992c76dc0SIlya Dryomov 
56092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
56192c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
56292c76dc0SIlya Dryomov {
5639b60e70bSIlya Dryomov 	if (!single_major &&
5649b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5659b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5669b60e70bSIlya Dryomov 		return 0;
5679b60e70bSIlya Dryomov 
56892c76dc0SIlya Dryomov 	return attr->mode;
56992c76dc0SIlya Dryomov }
57092c76dc0SIlya Dryomov 
/* Attribute group tying rbd_bus_attrs to the rbd_bus_is_visible() filter */
57192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
57292c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
57392c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
57492c76dc0SIlya Dryomov };
/* NOTE(review): presumably defines rbd_bus_groups (used below) from rbd_bus_group -- confirm */
57592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
576f0f8cef5SAlex Elder 
/* The "rbd" bus, carrying the bus-level attribute groups above */
577f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
578f0f8cef5SAlex Elder 	.name		= "rbd",
579b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
580f0f8cef5SAlex Elder };
581f0f8cef5SAlex Elder 
/*
 * Release callback installed on rbd_root_dev.  Intentionally empty:
 * rbd_root_dev is a static object in this file, so there is nothing
 * to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to release */
}
585f0f8cef5SAlex Elder 
/*
 * Statically allocated root device named "rbd".  Its release callback
 * is the empty rbd_root_dev_release().
 */
586f0f8cef5SAlex Elder static struct device rbd_root_dev = {
587f0f8cef5SAlex Elder 	.init_name =    "rbd",
588f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
589f0f8cef5SAlex Elder };
590f0f8cef5SAlex Elder 
59106ecc6cbSAlex Elder static __printf(2, 3)
59206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
59306ecc6cbSAlex Elder {
59406ecc6cbSAlex Elder 	struct va_format vaf;
59506ecc6cbSAlex Elder 	va_list args;
59606ecc6cbSAlex Elder 
59706ecc6cbSAlex Elder 	va_start(args, fmt);
59806ecc6cbSAlex Elder 	vaf.fmt = fmt;
59906ecc6cbSAlex Elder 	vaf.va = &args;
60006ecc6cbSAlex Elder 
60106ecc6cbSAlex Elder 	if (!rbd_dev)
60206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
60306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
60406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
60506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
60606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
60706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
60806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
60906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
61006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
61106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
61206ecc6cbSAlex Elder 	else	/* punt */
61306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
61406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
61506ecc6cbSAlex Elder 	va_end(args);
61606ecc6cbSAlex Elder }
61706ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - sanity check that is fatal when RBD_DEBUG is defined.
 *
 * With RBD_DEBUG defined, a false expr logs the enclosing function,
 * the line number and the stringified expression at KERN_ERR and then
 * calls BUG().  Without RBD_DEBUG the macro expands to a no-op and
 * expr is not evaluated, so expr must not have side effects.
 *
 * The checking variant is wrapped in do { } while (0) so that it acts
 * as a single statement and cannot capture an "else" that follows it.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
630dfc5606dSYehuda Sadeh 
63105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
6328b3e1a56SAlex Elder 
633cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
6342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
635a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
636e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
63754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
63854cac61fSAlex Elder 					u64 snap_id);
6392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
6402ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
6412ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
6422ad3d716SAlex Elder 		u64 *snap_features);
64322e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
64459c2be1eSYehuda Sadeh 
64554ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
6460192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
6470192ce2eSIlya Dryomov 
6480192ce2eSIlya Dryomov /*
6490192ce2eSIlya Dryomov  * Return true if nothing else is pending.
6500192ce2eSIlya Dryomov  */
6510192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result)
6520192ce2eSIlya Dryomov {
6530192ce2eSIlya Dryomov 	rbd_assert(pending->num_pending > 0);
6540192ce2eSIlya Dryomov 
6550192ce2eSIlya Dryomov 	if (*result && !pending->result)
6560192ce2eSIlya Dryomov 		pending->result = *result;
6570192ce2eSIlya Dryomov 	if (--pending->num_pending)
6580192ce2eSIlya Dryomov 		return false;
6590192ce2eSIlya Dryomov 
6600192ce2eSIlya Dryomov 	*result = pending->result;
6610192ce2eSIlya Dryomov 	return true;
6620192ce2eSIlya Dryomov }
663602adf40SYehuda Sadeh 
664602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
665602adf40SYehuda Sadeh {
666f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
667b82d167bSAlex Elder 	bool removing = false;
668602adf40SYehuda Sadeh 
669a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
670b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
671b82d167bSAlex Elder 		removing = true;
672b82d167bSAlex Elder 	else
673b82d167bSAlex Elder 		rbd_dev->open_count++;
674a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
675b82d167bSAlex Elder 	if (removing)
676b82d167bSAlex Elder 		return -ENOENT;
677b82d167bSAlex Elder 
678c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
679340c7a2bSAlex Elder 
680602adf40SYehuda Sadeh 	return 0;
681602adf40SYehuda Sadeh }
682602adf40SYehuda Sadeh 
683db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
684dfc5606dSYehuda Sadeh {
685dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
686b82d167bSAlex Elder 	unsigned long open_count_before;
687b82d167bSAlex Elder 
688a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
689b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
690a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
691b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
692dfc5606dSYehuda Sadeh 
693c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
694dfc5606dSYehuda Sadeh }
695dfc5606dSYehuda Sadeh 
696131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
697131fd9f6SGuangliang Zhao {
6981de797bbSIlya Dryomov 	int ro;
699131fd9f6SGuangliang Zhao 
7001de797bbSIlya Dryomov 	if (get_user(ro, (int __user *)arg))
701131fd9f6SGuangliang Zhao 		return -EFAULT;
702131fd9f6SGuangliang Zhao 
7031de797bbSIlya Dryomov 	/* Snapshots can't be marked read-write */
704f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev) && !ro)
705131fd9f6SGuangliang Zhao 		return -EROFS;
706131fd9f6SGuangliang Zhao 
7071de797bbSIlya Dryomov 	/* Let blkdev_roset() handle it */
7081de797bbSIlya Dryomov 	return -ENOTTY;
709131fd9f6SGuangliang Zhao }
710131fd9f6SGuangliang Zhao 
711131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
712131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
713131fd9f6SGuangliang Zhao {
714131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
7151de797bbSIlya Dryomov 	int ret;
716131fd9f6SGuangliang Zhao 
717131fd9f6SGuangliang Zhao 	switch (cmd) {
718131fd9f6SGuangliang Zhao 	case BLKROSET:
719131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
720131fd9f6SGuangliang Zhao 		break;
721131fd9f6SGuangliang Zhao 	default:
722131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
723131fd9f6SGuangliang Zhao 	}
724131fd9f6SGuangliang Zhao 
725131fd9f6SGuangliang Zhao 	return ret;
726131fd9f6SGuangliang Zhao }
727131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ->compat_ioctl handler: forwards unchanged to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret;

	ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
735131fd9f6SGuangliang Zhao 
736602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
737602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
738602adf40SYehuda Sadeh 	.open			= rbd_open,
739dfc5606dSYehuda Sadeh 	.release		= rbd_release,
740131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
741131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
742131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
743131fd9f6SGuangliang Zhao #endif
744602adf40SYehuda Sadeh };
745602adf40SYehuda Sadeh 
746602adf40SYehuda Sadeh /*
7477262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
748cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
749602adf40SYehuda Sadeh  */
750f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
751602adf40SYehuda Sadeh {
752602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
753602adf40SYehuda Sadeh 	int ret = -ENOMEM;
754602adf40SYehuda Sadeh 
75537206ee5SAlex Elder 	dout("%s:\n", __func__);
756602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
757602adf40SYehuda Sadeh 	if (!rbdc)
758602adf40SYehuda Sadeh 		goto out_opt;
759602adf40SYehuda Sadeh 
760602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
761602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
762602adf40SYehuda Sadeh 
76374da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
764602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
76508f75463SAlex Elder 		goto out_rbdc;
76643ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
767602adf40SYehuda Sadeh 
768602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
769602adf40SYehuda Sadeh 	if (ret < 0)
77008f75463SAlex Elder 		goto out_client;
771602adf40SYehuda Sadeh 
772432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
773602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
774432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
775602adf40SYehuda Sadeh 
77637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
777bc534d86SAlex Elder 
778602adf40SYehuda Sadeh 	return rbdc;
77908f75463SAlex Elder out_client:
780602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
78108f75463SAlex Elder out_rbdc:
782602adf40SYehuda Sadeh 	kfree(rbdc);
783602adf40SYehuda Sadeh out_opt:
78443ae4701SAlex Elder 	if (ceph_opts)
78543ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
78637206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
78737206ee5SAlex Elder 
78828f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
789602adf40SYehuda Sadeh }
790602adf40SYehuda Sadeh 
7912f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7922f82ee54SAlex Elder {
7932f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7942f82ee54SAlex Elder 
7952f82ee54SAlex Elder 	return rbdc;
7962f82ee54SAlex Elder }
7972f82ee54SAlex Elder 
798602adf40SYehuda Sadeh /*
7991f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
8001f7ba331SAlex Elder  * found, bump its reference count.
801602adf40SYehuda Sadeh  */
8021f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
803602adf40SYehuda Sadeh {
804602adf40SYehuda Sadeh 	struct rbd_client *client_node;
8051f7ba331SAlex Elder 	bool found = false;
806602adf40SYehuda Sadeh 
80743ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
808602adf40SYehuda Sadeh 		return NULL;
809602adf40SYehuda Sadeh 
8101f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
8111f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
8121f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
8132f82ee54SAlex Elder 			__rbd_get_client(client_node);
8142f82ee54SAlex Elder 
8151f7ba331SAlex Elder 			found = true;
8161f7ba331SAlex Elder 			break;
8171f7ba331SAlex Elder 		}
8181f7ba331SAlex Elder 	}
8191f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
8201f7ba331SAlex Elder 
8211f7ba331SAlex Elder 	return found ? client_node : NULL;
822602adf40SYehuda Sadeh }
823602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * The order matters: parse_rbd_opts_token() treats every token below
 * Opt_last_int as taking an int argument, and every token between
 * Opt_last_int and Opt_last_string as taking a string argument.
 */
enum {
	/* options with an int argument */
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,

	/* options with a string argument */
	Opt_pool_ns,
	Opt_last_string,

	/* options without an argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};
84359c2be1eSYehuda Sadeh 
84443ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
845b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
8460c93e1b7SIlya Dryomov 	{Opt_alloc_size, "alloc_size=%d"},
84734f55d0bSDongsheng Yang 	{Opt_lock_timeout, "lock_timeout=%d"},
84859c2be1eSYehuda Sadeh 	/* int args above */
849b26c047bSIlya Dryomov 	{Opt_pool_ns, "_pool_ns=%s"},
85059c2be1eSYehuda Sadeh 	/* string args above */
851be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
852cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
853cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
854cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
85580de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
856e010dd0aSIlya Dryomov 	{Opt_exclusive, "exclusive"},
857d9360540SIlya Dryomov 	{Opt_notrim, "notrim"},
858210c104cSIlya Dryomov 	{Opt_err, NULL}
85959c2be1eSYehuda Sadeh };
86059c2be1eSYehuda Sadeh 
/* Per-device map options, filled in by parse_rbd_opts_token(). */
struct rbd_options {
	int		queue_depth;	/* queue_depth=, at least 1 */
	int		alloc_size;	/* alloc_size=, power of 2, >= SECTOR_SIZE */
	unsigned long	lock_timeout;	/* lock_timeout=, stored in jiffies */
	bool		read_only;	/* read_only/ro vs. read_write/rw */
	bool		lock_on_read;	/* set by lock_on_read */
	bool		exclusive;	/* set by exclusive */
	bool		trim;		/* cleared by notrim */
};
87098571b5aSAlex Elder 
/* Default values for the fields of struct rbd_options */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
87898571b5aSAlex Elder 
/*
 * Context passed (as the opaque "private" pointer) to
 * parse_rbd_opts_token().
 */
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;	/* receives pool_ns */
	struct rbd_options	*opts;	/* receives all other options */
};
883c300156bSIlya Dryomov 
88459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
88559c2be1eSYehuda Sadeh {
886c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx *pctx = private;
88759c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
88859c2be1eSYehuda Sadeh 	int token, intval, ret;
88959c2be1eSYehuda Sadeh 
89043ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
89159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
89259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
89359c2be1eSYehuda Sadeh 		if (ret < 0) {
8942f56b6baSIlya Dryomov 			pr_err("bad option arg (not int) at '%s'\n", c);
89559c2be1eSYehuda Sadeh 			return ret;
89659c2be1eSYehuda Sadeh 		}
89759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
89859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
899210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
90059c2be1eSYehuda Sadeh 	} else {
90159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
90259c2be1eSYehuda Sadeh 	}
90359c2be1eSYehuda Sadeh 
90459c2be1eSYehuda Sadeh 	switch (token) {
905b5584180SIlya Dryomov 	case Opt_queue_depth:
906b5584180SIlya Dryomov 		if (intval < 1) {
907b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
908b5584180SIlya Dryomov 			return -EINVAL;
909b5584180SIlya Dryomov 		}
910c300156bSIlya Dryomov 		pctx->opts->queue_depth = intval;
911b5584180SIlya Dryomov 		break;
9120c93e1b7SIlya Dryomov 	case Opt_alloc_size:
91316d80c54SIlya Dryomov 		if (intval < SECTOR_SIZE) {
9140c93e1b7SIlya Dryomov 			pr_err("alloc_size out of range\n");
9150c93e1b7SIlya Dryomov 			return -EINVAL;
9160c93e1b7SIlya Dryomov 		}
9170c93e1b7SIlya Dryomov 		if (!is_power_of_2(intval)) {
9180c93e1b7SIlya Dryomov 			pr_err("alloc_size must be a power of 2\n");
9190c93e1b7SIlya Dryomov 			return -EINVAL;
9200c93e1b7SIlya Dryomov 		}
9210c93e1b7SIlya Dryomov 		pctx->opts->alloc_size = intval;
9220c93e1b7SIlya Dryomov 		break;
92334f55d0bSDongsheng Yang 	case Opt_lock_timeout:
92434f55d0bSDongsheng Yang 		/* 0 is "wait forever" (i.e. infinite timeout) */
92534f55d0bSDongsheng Yang 		if (intval < 0 || intval > INT_MAX / 1000) {
92634f55d0bSDongsheng Yang 			pr_err("lock_timeout out of range\n");
92734f55d0bSDongsheng Yang 			return -EINVAL;
92834f55d0bSDongsheng Yang 		}
929c300156bSIlya Dryomov 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
93034f55d0bSDongsheng Yang 		break;
931b26c047bSIlya Dryomov 	case Opt_pool_ns:
932b26c047bSIlya Dryomov 		kfree(pctx->spec->pool_ns);
933b26c047bSIlya Dryomov 		pctx->spec->pool_ns = match_strdup(argstr);
934b26c047bSIlya Dryomov 		if (!pctx->spec->pool_ns)
935b26c047bSIlya Dryomov 			return -ENOMEM;
93659c2be1eSYehuda Sadeh 		break;
937cc0538b6SAlex Elder 	case Opt_read_only:
938c300156bSIlya Dryomov 		pctx->opts->read_only = true;
939cc0538b6SAlex Elder 		break;
940cc0538b6SAlex Elder 	case Opt_read_write:
941c300156bSIlya Dryomov 		pctx->opts->read_only = false;
942cc0538b6SAlex Elder 		break;
94380de1912SIlya Dryomov 	case Opt_lock_on_read:
944c300156bSIlya Dryomov 		pctx->opts->lock_on_read = true;
94580de1912SIlya Dryomov 		break;
946e010dd0aSIlya Dryomov 	case Opt_exclusive:
947c300156bSIlya Dryomov 		pctx->opts->exclusive = true;
948e010dd0aSIlya Dryomov 		break;
949d9360540SIlya Dryomov 	case Opt_notrim:
950c300156bSIlya Dryomov 		pctx->opts->trim = false;
951d9360540SIlya Dryomov 		break;
95259c2be1eSYehuda Sadeh 	default:
953210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
954210c104cSIlya Dryomov 		return -EINVAL;
95559c2be1eSYehuda Sadeh 	}
956210c104cSIlya Dryomov 
95759c2be1eSYehuda Sadeh 	return 0;
95859c2be1eSYehuda Sadeh }
95959c2be1eSYehuda Sadeh 
9606d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
9616d2940c8SGuangliang Zhao {
9626d2940c8SGuangliang Zhao 	switch (op_type) {
9636d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
9646d2940c8SGuangliang Zhao 		return "read";
9656d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
9666d2940c8SGuangliang Zhao 		return "write";
96790e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
96890e98c52SGuangliang Zhao 		return "discard";
9696484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
9706484cbe9SIlya Dryomov 		return "zeroout";
9716d2940c8SGuangliang Zhao 	default:
9726d2940c8SGuangliang Zhao 		return "???";
9736d2940c8SGuangliang Zhao 	}
9746d2940c8SGuangliang Zhao }
9756d2940c8SGuangliang Zhao 
97659c2be1eSYehuda Sadeh /*
977602adf40SYehuda Sadeh  * Destroy ceph client
978d23a4b3fSAlex Elder  *
979432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
980602adf40SYehuda Sadeh  */
981602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
982602adf40SYehuda Sadeh {
983602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
984602adf40SYehuda Sadeh 
98537206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
986cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
987602adf40SYehuda Sadeh 	list_del(&rbdc->node);
988cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
989602adf40SYehuda Sadeh 
990602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
991602adf40SYehuda Sadeh 	kfree(rbdc);
992602adf40SYehuda Sadeh }
993602adf40SYehuda Sadeh 
994602adf40SYehuda Sadeh /*
995602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
996602adf40SYehuda Sadeh  * it.
997602adf40SYehuda Sadeh  */
9989d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
999602adf40SYehuda Sadeh {
1000c53d5893SAlex Elder 	if (rbdc)
10019d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
1002602adf40SYehuda Sadeh }
1003602adf40SYehuda Sadeh 
10045feb0d8dSIlya Dryomov /*
10055feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
10065feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
10075feb0d8dSIlya Dryomov  * function.
10085feb0d8dSIlya Dryomov  */
10095feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
10105feb0d8dSIlya Dryomov {
10115feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
1012dd435855SIlya Dryomov 	int ret;
10135feb0d8dSIlya Dryomov 
1014a32e4143SIlya Dryomov 	mutex_lock(&client_mutex);
10155feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
1016dd435855SIlya Dryomov 	if (rbdc) {
10175feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
1018dd435855SIlya Dryomov 
1019dd435855SIlya Dryomov 		/*
1020dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
1021dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
1022dd435855SIlya Dryomov 		 */
10239d4a227fSIlya Dryomov 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
10249d4a227fSIlya Dryomov 					rbdc->client->options->mount_timeout);
1025dd435855SIlya Dryomov 		if (ret) {
1026dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
1027dd435855SIlya Dryomov 			rbd_put_client(rbdc);
1028dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
1029dd435855SIlya Dryomov 		}
1030dd435855SIlya Dryomov 	} else {
10315feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
1032dd435855SIlya Dryomov 	}
10335feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
10345feb0d8dSIlya Dryomov 
10355feb0d8dSIlya Dryomov 	return rbdc;
10365feb0d8dSIlya Dryomov }
10375feb0d8dSIlya Dryomov 
1038a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
1039a30b71b9SAlex Elder {
1040a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
1041a30b71b9SAlex Elder }
1042a30b71b9SAlex Elder 
10438e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
10448e94af8eSAlex Elder {
1045103a150fSAlex Elder 	size_t size;
1046103a150fSAlex Elder 	u32 snap_count;
1047103a150fSAlex Elder 
1048103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
1049103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
1050103a150fSAlex Elder 		return false;
1051103a150fSAlex Elder 
1052db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
1053db2388b6SAlex Elder 
1054db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
1055db2388b6SAlex Elder 		return false;
1056db2388b6SAlex Elder 
1057db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
1058db2388b6SAlex Elder 
1059db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
1060db2388b6SAlex Elder 		return false;
1061db2388b6SAlex Elder 
1062103a150fSAlex Elder 	/*
1063103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
1064103a150fSAlex Elder 	 * that limits the number of snapshots.
1065103a150fSAlex Elder 	 */
1066103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
1067103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1068103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
1069103a150fSAlex Elder 		return false;
1070103a150fSAlex Elder 
1071103a150fSAlex Elder 	/*
1072103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
1073103a150fSAlex Elder 	 * header must also be representable in a size_t.
1074103a150fSAlex Elder 	 */
1075103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
1076103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1077103a150fSAlex Elder 		return false;
1078103a150fSAlex Elder 
1079103a150fSAlex Elder 	return true;
10808e94af8eSAlex Elder }
10818e94af8eSAlex Elder 
1082602adf40SYehuda Sadeh /*
10835bc3fb17SIlya Dryomov  * returns the size of an object in the image
10845bc3fb17SIlya Dryomov  */
10855bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
10865bc3fb17SIlya Dryomov {
10875bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
10885bc3fb17SIlya Dryomov }
10895bc3fb17SIlya Dryomov 
1090263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
1091263423f8SIlya Dryomov {
1092263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
1093263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
1094263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1095263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
1096263423f8SIlya Dryomov 	}
1097263423f8SIlya Dryomov 
1098263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1099263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1100263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
11017e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
11027e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1103263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1104263423f8SIlya Dryomov }
1105263423f8SIlya Dryomov 
11065bc3fb17SIlya Dryomov /*
1107bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1108bb23e37aSAlex Elder  * on-disk header.
1109602adf40SYehuda Sadeh  */
1110662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
11114156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1112602adf40SYehuda Sadeh {
1113662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1114bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1115bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1116bb23e37aSAlex Elder 	char *object_prefix = NULL;
1117bb23e37aSAlex Elder 	char *snap_names = NULL;
1118bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1119ccece235SAlex Elder 	u32 snap_count;
1120bb23e37aSAlex Elder 	int ret = -ENOMEM;
1121621901d6SAlex Elder 	u32 i;
1122602adf40SYehuda Sadeh 
1123bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1124103a150fSAlex Elder 
1125bb23e37aSAlex Elder 	if (first_time) {
1126848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1127848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1128848d796cSIlya Dryomov 					 GFP_KERNEL);
1129bb23e37aSAlex Elder 		if (!object_prefix)
1130602adf40SYehuda Sadeh 			return -ENOMEM;
1131bb23e37aSAlex Elder 	}
113200f1f36fSAlex Elder 
1133bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1134d2bb24e5SAlex Elder 
1135602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1136bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1137bb23e37aSAlex Elder 	if (!snapc)
1138bb23e37aSAlex Elder 		goto out_err;
1139bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1140602adf40SYehuda Sadeh 	if (snap_count) {
1141bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1142f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1143f785cc1dSAlex Elder 
1144bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1145621901d6SAlex Elder 
1146f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1147bb23e37aSAlex Elder 			goto out_2big;
1148bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1149bb23e37aSAlex Elder 		if (!snap_names)
1150602adf40SYehuda Sadeh 			goto out_err;
1151bb23e37aSAlex Elder 
1152bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
115388a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
115488a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
115588a25a5fSMarkus Elfring 					   GFP_KERNEL);
1156bb23e37aSAlex Elder 		if (!snap_sizes)
1157bb23e37aSAlex Elder 			goto out_err;
1158bb23e37aSAlex Elder 
1159f785cc1dSAlex Elder 		/*
1160bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1161bb23e37aSAlex Elder 		 * and size.
1162bb23e37aSAlex Elder 		 *
116399a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1164bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1165f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1166f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1167f785cc1dSAlex Elder 		 */
1168bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1169bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1170bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1171bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1172bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1173bb23e37aSAlex Elder 		}
1174602adf40SYehuda Sadeh 	}
1175849b4260SAlex Elder 
1176bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1177bb23e37aSAlex Elder 
1178bb23e37aSAlex Elder 	if (first_time) {
1179bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1180602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1181263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1182662518b1SAlex Elder 	} else {
1183662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1184662518b1SAlex Elder 		kfree(header->snap_names);
1185662518b1SAlex Elder 		kfree(header->snap_sizes);
1186bb23e37aSAlex Elder 	}
11876a52325fSAlex Elder 
1188bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1189621901d6SAlex Elder 
1190f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1191bb23e37aSAlex Elder 	header->snapc = snapc;
1192bb23e37aSAlex Elder 	header->snap_names = snap_names;
1193bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1194468521c1SAlex Elder 
1195602adf40SYehuda Sadeh 	return 0;
1196bb23e37aSAlex Elder out_2big:
1197bb23e37aSAlex Elder 	ret = -EIO;
11986a52325fSAlex Elder out_err:
1199bb23e37aSAlex Elder 	kfree(snap_sizes);
1200bb23e37aSAlex Elder 	kfree(snap_names);
1201bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1202bb23e37aSAlex Elder 	kfree(object_prefix);
1203ccece235SAlex Elder 
1204bb23e37aSAlex Elder 	return ret;
1205602adf40SYehuda Sadeh }
1206602adf40SYehuda Sadeh 
12079682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
12089682fc6dSAlex Elder {
12099682fc6dSAlex Elder 	const char *snap_name;
12109682fc6dSAlex Elder 
12119682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
12129682fc6dSAlex Elder 
12139682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
12149682fc6dSAlex Elder 
12159682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
12169682fc6dSAlex Elder 	while (which--)
12179682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
12189682fc6dSAlex Elder 
12199682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
12209682fc6dSAlex Elder }
12219682fc6dSAlex Elder 
122230d1cff8SAlex Elder /*
122330d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
122430d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
122530d1cff8SAlex Elder  */
122630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
122730d1cff8SAlex Elder {
122830d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
122930d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
123030d1cff8SAlex Elder 
123130d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
123230d1cff8SAlex Elder 		return 1;
123330d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
123430d1cff8SAlex Elder }
123530d1cff8SAlex Elder 
123630d1cff8SAlex Elder /*
123730d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
123830d1cff8SAlex Elder  * present.
123930d1cff8SAlex Elder  *
124030d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
124130d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
124230d1cff8SAlex Elder  *
124330d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
124430d1cff8SAlex Elder  * reverse order, highest snapshot id first.
124530d1cff8SAlex Elder  */
12469682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
12479682fc6dSAlex Elder {
12489682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
124930d1cff8SAlex Elder 	u64 *found;
12509682fc6dSAlex Elder 
125130d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
125230d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
12539682fc6dSAlex Elder 
125430d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
12559682fc6dSAlex Elder }
12569682fc6dSAlex Elder 
12572ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
12582ad3d716SAlex Elder 					u64 snap_id)
125954cac61fSAlex Elder {
126054cac61fSAlex Elder 	u32 which;
1261da6a6b63SJosh Durgin 	const char *snap_name;
126254cac61fSAlex Elder 
126354cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
126454cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1265da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
126654cac61fSAlex Elder 
1267da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1268da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
126954cac61fSAlex Elder }
127054cac61fSAlex Elder 
12719e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
12729e15b77dSAlex Elder {
12739e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
12749e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
12759e15b77dSAlex Elder 
127654cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
127754cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
127854cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
12799e15b77dSAlex Elder 
128054cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
12819e15b77dSAlex Elder }
12829e15b77dSAlex Elder 
12832ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
12842ad3d716SAlex Elder 				u64 *snap_size)
1285602adf40SYehuda Sadeh {
12862ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12872ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12882ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
12892ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12902ad3d716SAlex Elder 		u32 which;
129100f1f36fSAlex Elder 
12922ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
12932ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
12942ad3d716SAlex Elder 			return -ENOENT;
129500f1f36fSAlex Elder 
12962ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
12972ad3d716SAlex Elder 	} else {
12982ad3d716SAlex Elder 		u64 size = 0;
12992ad3d716SAlex Elder 		int ret;
13002ad3d716SAlex Elder 
13012ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
13022ad3d716SAlex Elder 		if (ret)
13032ad3d716SAlex Elder 			return ret;
13042ad3d716SAlex Elder 
13052ad3d716SAlex Elder 		*snap_size = size;
13062ad3d716SAlex Elder 	}
13072ad3d716SAlex Elder 	return 0;
13082ad3d716SAlex Elder }
13092ad3d716SAlex Elder 
13102ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
13112ad3d716SAlex Elder 			u64 *snap_features)
13122ad3d716SAlex Elder {
13132ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
13142ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
13152ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
13162ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
13172ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
13182ad3d716SAlex Elder 	} else {
13192ad3d716SAlex Elder 		u64 features = 0;
13202ad3d716SAlex Elder 		int ret;
13212ad3d716SAlex Elder 
13222ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
13232ad3d716SAlex Elder 		if (ret)
13242ad3d716SAlex Elder 			return ret;
13252ad3d716SAlex Elder 
13262ad3d716SAlex Elder 		*snap_features = features;
13272ad3d716SAlex Elder 	}
13282ad3d716SAlex Elder 	return 0;
132900f1f36fSAlex Elder }
1330602adf40SYehuda Sadeh 
1331d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1332602adf40SYehuda Sadeh {
13338f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
13342ad3d716SAlex Elder 	u64 size = 0;
13352ad3d716SAlex Elder 	u64 features = 0;
13362ad3d716SAlex Elder 	int ret;
13378b0241f8SAlex Elder 
13382ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
13392ad3d716SAlex Elder 	if (ret)
13402ad3d716SAlex Elder 		return ret;
13412ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
13422ad3d716SAlex Elder 	if (ret)
13432ad3d716SAlex Elder 		return ret;
13442ad3d716SAlex Elder 
13452ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
13462ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
13472ad3d716SAlex Elder 
13488b0241f8SAlex Elder 	return 0;
1349602adf40SYehuda Sadeh }
1350602adf40SYehuda Sadeh 
1351d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1352d1cf5788SAlex Elder {
1353d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1354d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1355200a6a8bSAlex Elder }
1356200a6a8bSAlex Elder 
13575359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv)
135865ccfe21SAlex Elder {
1359602adf40SYehuda Sadeh 	void *buf;
13605359a17dSIlya Dryomov 	unsigned long flags;
1361602adf40SYehuda Sadeh 
13625359a17dSIlya Dryomov 	buf = bvec_kmap_irq(bv, &flags);
13635359a17dSIlya Dryomov 	memset(buf, 0, bv->bv_len);
13645359a17dSIlya Dryomov 	flush_dcache_page(bv->bv_page);
136585b5aaa6SDan Carpenter 	bvec_kunmap_irq(buf, &flags);
1366602adf40SYehuda Sadeh }
1367602adf40SYehuda Sadeh 
13685359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1369b9434c5bSAlex Elder {
13705359a17dSIlya Dryomov 	struct ceph_bio_iter it = *bio_pos;
1371b9434c5bSAlex Elder 
13725359a17dSIlya Dryomov 	ceph_bio_iter_advance(&it, off);
13735359a17dSIlya Dryomov 	ceph_bio_iter_advance_step(&it, bytes, ({
13745359a17dSIlya Dryomov 		zero_bvec(&bv);
13755359a17dSIlya Dryomov 	}));
1376b9434c5bSAlex Elder }
1377b9434c5bSAlex Elder 
13787e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1379602adf40SYehuda Sadeh {
13807e07efb1SIlya Dryomov 	struct ceph_bvec_iter it = *bvec_pos;
1381602adf40SYehuda Sadeh 
13827e07efb1SIlya Dryomov 	ceph_bvec_iter_advance(&it, off);
13837e07efb1SIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
13847e07efb1SIlya Dryomov 		zero_bvec(&bv);
13857e07efb1SIlya Dryomov 	}));
1386602adf40SYehuda Sadeh }
1387602adf40SYehuda Sadeh 
1388f7760dadSAlex Elder /*
13893da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1390afb97888SIlya Dryomov  * (private) bio_vec array.
1391f7760dadSAlex Elder  *
13923da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1393f7760dadSAlex Elder  */
13943da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
13953da691bfSIlya Dryomov 			       u32 bytes)
1396f7760dadSAlex Elder {
139754ab3b24SIlya Dryomov 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
139854ab3b24SIlya Dryomov 
1399ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
14003da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
14013da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
14023da691bfSIlya Dryomov 		break;
14033da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1404afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
14053da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
14063da691bfSIlya Dryomov 		break;
14073da691bfSIlya Dryomov 	default:
140816809372SArnd Bergmann 		BUG();
1409f5400b7aSAlex Elder 	}
1410bf0d5f50SAlex Elder }
1411bf0d5f50SAlex Elder 
1412bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1413bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1414bf0d5f50SAlex Elder {
1415bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
141637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
14172c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1418bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1419bf0d5f50SAlex Elder }
1420bf0d5f50SAlex Elder 
1421bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1422bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1423bf0d5f50SAlex Elder {
1424bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
142537206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14262c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1427bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1428bf0d5f50SAlex Elder }
1429bf0d5f50SAlex Elder 
1430bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1431bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1432bf0d5f50SAlex Elder {
143325dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
143425dcf954SAlex Elder 
1435b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1436bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
143715961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1438bf0d5f50SAlex Elder }
1439bf0d5f50SAlex Elder 
1440bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1441bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1442bf0d5f50SAlex Elder {
144315961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
144443df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1445bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1446bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1447bf0d5f50SAlex Elder }
1448bf0d5f50SAlex Elder 
1449a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1450bf0d5f50SAlex Elder {
1451a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1452980917fcSIlya Dryomov 
1453a086a1b8SIlya Dryomov 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1454a086a1b8SIlya Dryomov 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1455a086a1b8SIlya Dryomov 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1456980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1457bf0d5f50SAlex Elder }
1458bf0d5f50SAlex Elder 
14590c425248SAlex Elder /*
14600c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14610c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
14620c425248SAlex Elder  * and currently never change thereafter.
14630c425248SAlex Elder  */
1464d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1465d0b2e944SAlex Elder {
1466d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1467d0b2e944SAlex Elder 	smp_mb();
1468d0b2e944SAlex Elder }
1469d0b2e944SAlex Elder 
1470a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1471a2acd00eSAlex Elder {
1472a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1473a2acd00eSAlex Elder 	smp_mb();
1474a2acd00eSAlex Elder }
1475a2acd00eSAlex Elder 
1476d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1477d0b2e944SAlex Elder {
1478d0b2e944SAlex Elder 	smp_mb();
1479d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1480d0b2e944SAlex Elder }
1481d0b2e944SAlex Elder 
14823da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
14833b434a2aSJosh Durgin {
14843da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
14853da691bfSIlya Dryomov 
148643df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
148743df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
14883b434a2aSJosh Durgin }
14893b434a2aSJosh Durgin 
14903da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
14916e2a4505SAlex Elder {
14923da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1493b9434c5bSAlex Elder 
149443df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
14953da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
14966e2a4505SAlex Elder }
14976e2a4505SAlex Elder 
149813488d53SIlya Dryomov /*
149913488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
150013488d53SIlya Dryomov  */
150113488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
150213488d53SIlya Dryomov {
150313488d53SIlya Dryomov 	if (!obj_req->num_img_extents ||
15049b17eb2cSIlya Dryomov 	    (rbd_obj_is_entire(obj_req) &&
15059b17eb2cSIlya Dryomov 	     !obj_req->img_request->snapc->num_snaps))
150613488d53SIlya Dryomov 		return false;
150713488d53SIlya Dryomov 
150813488d53SIlya Dryomov 	return true;
150913488d53SIlya Dryomov }
151013488d53SIlya Dryomov 
151186bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1512bf0d5f50SAlex Elder {
151386bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
151486bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1515bf0d5f50SAlex Elder }
1516bf0d5f50SAlex Elder 
15173da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
15180dcc685eSIlya Dryomov {
15199bb0248dSIlya Dryomov 	switch (img_req->op_type) {
15203da691bfSIlya Dryomov 	case OBJ_OP_READ:
15213da691bfSIlya Dryomov 		return false;
15223da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
15233da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
15246484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
15253da691bfSIlya Dryomov 		return true;
15263da691bfSIlya Dryomov 	default:
1527c6244b3bSArnd Bergmann 		BUG();
15280dcc685eSIlya Dryomov 	}
15290dcc685eSIlya Dryomov }
15300dcc685eSIlya Dryomov 
153185e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1532bf0d5f50SAlex Elder {
15333da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
153454ab3b24SIlya Dryomov 	int result;
1535bf0d5f50SAlex Elder 
15363da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
15373da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
1538bf0d5f50SAlex Elder 
1539c47f9371SAlex Elder 	/*
15403da691bfSIlya Dryomov 	 * Writes aren't allowed to return a data payload.  In some
15413da691bfSIlya Dryomov 	 * guarded write cases (e.g. stat + zero on an empty object)
15423da691bfSIlya Dryomov 	 * a stat response makes it through, but we don't care.
1543c47f9371SAlex Elder 	 */
154454ab3b24SIlya Dryomov 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
154554ab3b24SIlya Dryomov 		result = 0;
154654ab3b24SIlya Dryomov 	else
154754ab3b24SIlya Dryomov 		result = osd_req->r_result;
15480ccd5926SIlya Dryomov 
154954ab3b24SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
1550bf0d5f50SAlex Elder }
1551bf0d5f50SAlex Elder 
1552bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1553430c28c3SAlex Elder {
1554bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1555430c28c3SAlex Elder 
1556a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
15577c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
15589d4df01fSAlex Elder }
15599d4df01fSAlex Elder 
1560bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
15619d4df01fSAlex Elder {
1562bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
15639d4df01fSAlex Elder 
1564a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1565fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
156643df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1567430c28c3SAlex Elder }
1568430c28c3SAlex Elder 
1569bc81207eSIlya Dryomov static struct ceph_osd_request *
1570bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1571bcbab1dbSIlya Dryomov 			  struct ceph_snap_context *snapc, int num_ops)
1572bc81207eSIlya Dryomov {
1573e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1574bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1575bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1576a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1577a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1578bcbab1dbSIlya Dryomov 	int ret;
1579bc81207eSIlya Dryomov 
1580e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1581bc81207eSIlya Dryomov 	if (!req)
1582bcbab1dbSIlya Dryomov 		return ERR_PTR(-ENOMEM);
1583bc81207eSIlya Dryomov 
1584bcbab1dbSIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1585bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1586a162b308SIlya Dryomov 	req->r_priv = obj_req;
1587bc81207eSIlya Dryomov 
1588b26c047bSIlya Dryomov 	/*
1589b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1590b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1591b26c047bSIlya Dryomov 	 */
1592b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1593bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1594b26c047bSIlya Dryomov 
1595bcbab1dbSIlya Dryomov 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1596bcbab1dbSIlya Dryomov 			       rbd_dev->header.object_prefix,
1597bcbab1dbSIlya Dryomov 			       obj_req->ex.oe_objno);
1598bcbab1dbSIlya Dryomov 	if (ret)
1599bcbab1dbSIlya Dryomov 		return ERR_PTR(ret);
1600bc81207eSIlya Dryomov 
1601bc81207eSIlya Dryomov 	return req;
1602bc81207eSIlya Dryomov }
1603bc81207eSIlya Dryomov 
1604e28eded5SIlya Dryomov static struct ceph_osd_request *
1605bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1606e28eded5SIlya Dryomov {
1607bcbab1dbSIlya Dryomov 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1608e28eded5SIlya Dryomov 					 num_ops);
1609e28eded5SIlya Dryomov }
1610e28eded5SIlya Dryomov 
1611ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1612bf0d5f50SAlex Elder {
1613bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1614bf0d5f50SAlex Elder 
16155a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
16166c696d85SIlya Dryomov 	if (!obj_request)
1617f907ad55SAlex Elder 		return NULL;
1618f907ad55SAlex Elder 
161943df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1620bcbab1dbSIlya Dryomov 	INIT_LIST_HEAD(&obj_request->osd_reqs);
162185b5e6d1SIlya Dryomov 	mutex_init(&obj_request->state_mutex);
1622bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1623bf0d5f50SAlex Elder 
162467e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1625bf0d5f50SAlex Elder 	return obj_request;
1626bf0d5f50SAlex Elder }
1627bf0d5f50SAlex Elder 
1628bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1629bf0d5f50SAlex Elder {
1630bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1631bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
16327e07efb1SIlya Dryomov 	u32 i;
1633bf0d5f50SAlex Elder 
1634bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1635bf0d5f50SAlex Elder 
163637206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
163737206ee5SAlex Elder 
1638bcbab1dbSIlya Dryomov 	while (!list_empty(&obj_request->osd_reqs)) {
1639bcbab1dbSIlya Dryomov 		osd_req = list_first_entry(&obj_request->osd_reqs,
1640bcbab1dbSIlya Dryomov 				    struct ceph_osd_request, r_private_item);
1641bcbab1dbSIlya Dryomov 		list_del_init(&osd_req->r_private_item);
1642bcbab1dbSIlya Dryomov 		ceph_osdc_put_request(osd_req);
1643bcbab1dbSIlya Dryomov 	}
1644bf0d5f50SAlex Elder 
1645ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
16469969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1647bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
16487e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
16495359a17dSIlya Dryomov 		break;		/* Nothing to do */
1650afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1651afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1652bf0d5f50SAlex Elder 		break;
16537e07efb1SIlya Dryomov 	default:
165416809372SArnd Bergmann 		BUG();
1655bf0d5f50SAlex Elder 	}
1656bf0d5f50SAlex Elder 
165786bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
16587e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
16597e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
16607e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
16617e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
16627e07efb1SIlya Dryomov 		}
16637e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1664bf0d5f50SAlex Elder 	}
1665bf0d5f50SAlex Elder 
1666868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1667bf0d5f50SAlex Elder }
1668bf0d5f50SAlex Elder 
1669fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1670fb65d228SAlex Elder 
1671fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1672fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1673fb65d228SAlex Elder {
1674fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1675fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1676fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1677fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1678fb65d228SAlex Elder }
1679fb65d228SAlex Elder 
1680bf0d5f50SAlex Elder /*
1681a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1682a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1683a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1684a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1685a2acd00eSAlex Elder  */
1686a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1687a2acd00eSAlex Elder {
1688a2acd00eSAlex Elder 	int counter;
1689a2acd00eSAlex Elder 
1690a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1691a2acd00eSAlex Elder 		return;
1692a2acd00eSAlex Elder 
1693a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1694a2acd00eSAlex Elder 	if (counter > 0)
1695a2acd00eSAlex Elder 		return;
1696a2acd00eSAlex Elder 
1697a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1698a2acd00eSAlex Elder 
1699a2acd00eSAlex Elder 	if (!counter)
1700a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1701a2acd00eSAlex Elder 	else
17029584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1703a2acd00eSAlex Elder }
1704a2acd00eSAlex Elder 
1705a2acd00eSAlex Elder /*
1706a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1707a2acd00eSAlex Elder  * parent.
1708a2acd00eSAlex Elder  *
1709a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1710a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1711a2acd00eSAlex Elder  * false otherwise.
1712a2acd00eSAlex Elder  */
1713a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1714a2acd00eSAlex Elder {
1715ae43e9d0SIlya Dryomov 	int counter = 0;
1716a2acd00eSAlex Elder 
1717a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1718a2acd00eSAlex Elder 		return false;
1719a2acd00eSAlex Elder 
1720ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
1721ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1722a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1723ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
1724a2acd00eSAlex Elder 
1725a2acd00eSAlex Elder 	if (counter < 0)
17269584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1727a2acd00eSAlex Elder 
1728ae43e9d0SIlya Dryomov 	return counter > 0;
1729a2acd00eSAlex Elder }
1730a2acd00eSAlex Elder 
1731bf0d5f50SAlex Elder /*
1732bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1733bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1734bf0d5f50SAlex Elder  * (if there is one).
1735bf0d5f50SAlex Elder  */
1736cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1737cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
17386d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
17394e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
1740bf0d5f50SAlex Elder {
1741bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1742bf0d5f50SAlex Elder 
1743a0c5895bSIlya Dryomov 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1744bf0d5f50SAlex Elder 	if (!img_request)
1745bf0d5f50SAlex Elder 		return NULL;
1746bf0d5f50SAlex Elder 
1747bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
17489bb0248dSIlya Dryomov 	img_request->op_type = op_type;
17499bb0248dSIlya Dryomov 	if (!rbd_img_is_write(img_request))
1750bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
17519bb0248dSIlya Dryomov 	else
17529bb0248dSIlya Dryomov 		img_request->snapc = snapc;
17539bb0248dSIlya Dryomov 
1754a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1755d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1756a0c5895bSIlya Dryomov 
1757e1fddc8fSIlya Dryomov 	INIT_LIST_HEAD(&img_request->lock_item);
175843df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
17590192ce2eSIlya Dryomov 	mutex_init(&img_request->state_mutex);
1760bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1761bf0d5f50SAlex Elder 
1762bf0d5f50SAlex Elder 	return img_request;
1763bf0d5f50SAlex Elder }
1764bf0d5f50SAlex Elder 
1765bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1766bf0d5f50SAlex Elder {
1767bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1768bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1769bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1770bf0d5f50SAlex Elder 
1771bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1772bf0d5f50SAlex Elder 
177337206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
177437206ee5SAlex Elder 
1775e1fddc8fSIlya Dryomov 	WARN_ON(!list_empty(&img_request->lock_item));
1776bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1777bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1778bf0d5f50SAlex Elder 
1779a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
1780a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
1781a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1782a2acd00eSAlex Elder 	}
1783a2acd00eSAlex Elder 
17849bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1785812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1786bf0d5f50SAlex Elder 
17871c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1788bf0d5f50SAlex Elder }
1789bf0d5f50SAlex Elder 
/*
 * Object map packing: each object's state takes BITS_PER_OBJ bits,
 * so one byte of the map holds OBJS_PER_BYTE states.  OBJ_MASK
 * selects a single state once it has been shifted down to bit 0.
 */
#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
179322e8bd51SIlya Dryomov 
179422e8bd51SIlya Dryomov static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
179522e8bd51SIlya Dryomov 				   u64 *index, u8 *shift)
179622e8bd51SIlya Dryomov {
179722e8bd51SIlya Dryomov 	u32 off;
179822e8bd51SIlya Dryomov 
179922e8bd51SIlya Dryomov 	rbd_assert(objno < rbd_dev->object_map_size);
180022e8bd51SIlya Dryomov 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
180122e8bd51SIlya Dryomov 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
180222e8bd51SIlya Dryomov }
180322e8bd51SIlya Dryomov 
180422e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
180522e8bd51SIlya Dryomov {
180622e8bd51SIlya Dryomov 	u64 index;
180722e8bd51SIlya Dryomov 	u8 shift;
180822e8bd51SIlya Dryomov 
180922e8bd51SIlya Dryomov 	lockdep_assert_held(&rbd_dev->object_map_lock);
181022e8bd51SIlya Dryomov 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
181122e8bd51SIlya Dryomov 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
181222e8bd51SIlya Dryomov }
181322e8bd51SIlya Dryomov 
181422e8bd51SIlya Dryomov static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
181522e8bd51SIlya Dryomov {
181622e8bd51SIlya Dryomov 	u64 index;
181722e8bd51SIlya Dryomov 	u8 shift;
181822e8bd51SIlya Dryomov 	u8 *p;
181922e8bd51SIlya Dryomov 
182022e8bd51SIlya Dryomov 	lockdep_assert_held(&rbd_dev->object_map_lock);
182122e8bd51SIlya Dryomov 	rbd_assert(!(val & ~OBJ_MASK));
182222e8bd51SIlya Dryomov 
182322e8bd51SIlya Dryomov 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
182422e8bd51SIlya Dryomov 	p = &rbd_dev->object_map[index];
182522e8bd51SIlya Dryomov 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
182622e8bd51SIlya Dryomov }
182722e8bd51SIlya Dryomov 
182822e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
182922e8bd51SIlya Dryomov {
183022e8bd51SIlya Dryomov 	u8 state;
183122e8bd51SIlya Dryomov 
183222e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
183322e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
183422e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
183522e8bd51SIlya Dryomov 	return state;
183622e8bd51SIlya Dryomov }
183722e8bd51SIlya Dryomov 
183822e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev)
183922e8bd51SIlya Dryomov {
184022e8bd51SIlya Dryomov 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
184122e8bd51SIlya Dryomov 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
184222e8bd51SIlya Dryomov }
184322e8bd51SIlya Dryomov 
184422e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
184522e8bd51SIlya Dryomov {
184622e8bd51SIlya Dryomov 	u8 state;
184722e8bd51SIlya Dryomov 
184822e8bd51SIlya Dryomov 	/* fall back to default logic if object map is disabled or invalid */
184922e8bd51SIlya Dryomov 	if (!use_object_map(rbd_dev))
185022e8bd51SIlya Dryomov 		return true;
185122e8bd51SIlya Dryomov 
185222e8bd51SIlya Dryomov 	state = rbd_object_map_get(rbd_dev, objno);
185322e8bd51SIlya Dryomov 	return state != OBJECT_NONEXISTENT;
185422e8bd51SIlya Dryomov }
185522e8bd51SIlya Dryomov 
185622e8bd51SIlya Dryomov static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
185722e8bd51SIlya Dryomov 				struct ceph_object_id *oid)
185822e8bd51SIlya Dryomov {
185922e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP)
186022e8bd51SIlya Dryomov 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
186122e8bd51SIlya Dryomov 				rbd_dev->spec->image_id);
186222e8bd51SIlya Dryomov 	else
186322e8bd51SIlya Dryomov 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
186422e8bd51SIlya Dryomov 				rbd_dev->spec->image_id, snap_id);
186522e8bd51SIlya Dryomov }
186622e8bd51SIlya Dryomov 
186722e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev)
186822e8bd51SIlya Dryomov {
186922e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
187022e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
187122e8bd51SIlya Dryomov 	u8 lock_type;
187222e8bd51SIlya Dryomov 	char *lock_tag;
187322e8bd51SIlya Dryomov 	struct ceph_locker *lockers;
187422e8bd51SIlya Dryomov 	u32 num_lockers;
187522e8bd51SIlya Dryomov 	bool broke_lock = false;
187622e8bd51SIlya Dryomov 	int ret;
187722e8bd51SIlya Dryomov 
187822e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
187922e8bd51SIlya Dryomov 
188022e8bd51SIlya Dryomov again:
188122e8bd51SIlya Dryomov 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
188222e8bd51SIlya Dryomov 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
188322e8bd51SIlya Dryomov 	if (ret != -EBUSY || broke_lock) {
188422e8bd51SIlya Dryomov 		if (ret == -EEXIST)
188522e8bd51SIlya Dryomov 			ret = 0; /* already locked by myself */
188622e8bd51SIlya Dryomov 		if (ret)
188722e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
188822e8bd51SIlya Dryomov 		return ret;
188922e8bd51SIlya Dryomov 	}
189022e8bd51SIlya Dryomov 
189122e8bd51SIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
189222e8bd51SIlya Dryomov 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
189322e8bd51SIlya Dryomov 				 &lockers, &num_lockers);
189422e8bd51SIlya Dryomov 	if (ret) {
189522e8bd51SIlya Dryomov 		if (ret == -ENOENT)
189622e8bd51SIlya Dryomov 			goto again;
189722e8bd51SIlya Dryomov 
189822e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
189922e8bd51SIlya Dryomov 		return ret;
190022e8bd51SIlya Dryomov 	}
190122e8bd51SIlya Dryomov 
190222e8bd51SIlya Dryomov 	kfree(lock_tag);
190322e8bd51SIlya Dryomov 	if (num_lockers == 0)
190422e8bd51SIlya Dryomov 		goto again;
190522e8bd51SIlya Dryomov 
190622e8bd51SIlya Dryomov 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
190722e8bd51SIlya Dryomov 		 ENTITY_NAME(lockers[0].id.name));
190822e8bd51SIlya Dryomov 
190922e8bd51SIlya Dryomov 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
191022e8bd51SIlya Dryomov 				  RBD_LOCK_NAME, lockers[0].id.cookie,
191122e8bd51SIlya Dryomov 				  &lockers[0].id.name);
191222e8bd51SIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
191322e8bd51SIlya Dryomov 	if (ret) {
191422e8bd51SIlya Dryomov 		if (ret == -ENOENT)
191522e8bd51SIlya Dryomov 			goto again;
191622e8bd51SIlya Dryomov 
191722e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
191822e8bd51SIlya Dryomov 		return ret;
191922e8bd51SIlya Dryomov 	}
192022e8bd51SIlya Dryomov 
192122e8bd51SIlya Dryomov 	broke_lock = true;
192222e8bd51SIlya Dryomov 	goto again;
192322e8bd51SIlya Dryomov }
192422e8bd51SIlya Dryomov 
192522e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
192622e8bd51SIlya Dryomov {
192722e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
192822e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
192922e8bd51SIlya Dryomov 	int ret;
193022e8bd51SIlya Dryomov 
193122e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
193222e8bd51SIlya Dryomov 
193322e8bd51SIlya Dryomov 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
193422e8bd51SIlya Dryomov 			      "");
193522e8bd51SIlya Dryomov 	if (ret && ret != -ENOENT)
193622e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
193722e8bd51SIlya Dryomov }
193822e8bd51SIlya Dryomov 
193922e8bd51SIlya Dryomov static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
194022e8bd51SIlya Dryomov {
194122e8bd51SIlya Dryomov 	u8 struct_v;
194222e8bd51SIlya Dryomov 	u32 struct_len;
194322e8bd51SIlya Dryomov 	u32 header_len;
194422e8bd51SIlya Dryomov 	void *header_end;
194522e8bd51SIlya Dryomov 	int ret;
194622e8bd51SIlya Dryomov 
194722e8bd51SIlya Dryomov 	ceph_decode_32_safe(p, end, header_len, e_inval);
194822e8bd51SIlya Dryomov 	header_end = *p + header_len;
194922e8bd51SIlya Dryomov 
195022e8bd51SIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
195122e8bd51SIlya Dryomov 				  &struct_len);
195222e8bd51SIlya Dryomov 	if (ret)
195322e8bd51SIlya Dryomov 		return ret;
195422e8bd51SIlya Dryomov 
195522e8bd51SIlya Dryomov 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
195622e8bd51SIlya Dryomov 
195722e8bd51SIlya Dryomov 	*p = header_end;
195822e8bd51SIlya Dryomov 	return 0;
195922e8bd51SIlya Dryomov 
196022e8bd51SIlya Dryomov e_inval:
196122e8bd51SIlya Dryomov 	return -EINVAL;
196222e8bd51SIlya Dryomov }
196322e8bd51SIlya Dryomov 
196422e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev)
196522e8bd51SIlya Dryomov {
196622e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
196722e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
196822e8bd51SIlya Dryomov 	struct page **pages;
196922e8bd51SIlya Dryomov 	void *p, *end;
197022e8bd51SIlya Dryomov 	size_t reply_len;
197122e8bd51SIlya Dryomov 	u64 num_objects;
197222e8bd51SIlya Dryomov 	u64 object_map_bytes;
197322e8bd51SIlya Dryomov 	u64 object_map_size;
197422e8bd51SIlya Dryomov 	int num_pages;
197522e8bd51SIlya Dryomov 	int ret;
197622e8bd51SIlya Dryomov 
197722e8bd51SIlya Dryomov 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
197822e8bd51SIlya Dryomov 
197922e8bd51SIlya Dryomov 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
198022e8bd51SIlya Dryomov 					   rbd_dev->mapping.size);
198122e8bd51SIlya Dryomov 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
198222e8bd51SIlya Dryomov 					    BITS_PER_BYTE);
198322e8bd51SIlya Dryomov 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
198422e8bd51SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
198522e8bd51SIlya Dryomov 	if (IS_ERR(pages))
198622e8bd51SIlya Dryomov 		return PTR_ERR(pages);
198722e8bd51SIlya Dryomov 
198822e8bd51SIlya Dryomov 	reply_len = num_pages * PAGE_SIZE;
198922e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
199022e8bd51SIlya Dryomov 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
199122e8bd51SIlya Dryomov 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
199222e8bd51SIlya Dryomov 			     NULL, 0, pages, &reply_len);
199322e8bd51SIlya Dryomov 	if (ret)
199422e8bd51SIlya Dryomov 		goto out;
199522e8bd51SIlya Dryomov 
199622e8bd51SIlya Dryomov 	p = page_address(pages[0]);
199722e8bd51SIlya Dryomov 	end = p + min(reply_len, (size_t)PAGE_SIZE);
199822e8bd51SIlya Dryomov 	ret = decode_object_map_header(&p, end, &object_map_size);
199922e8bd51SIlya Dryomov 	if (ret)
200022e8bd51SIlya Dryomov 		goto out;
200122e8bd51SIlya Dryomov 
200222e8bd51SIlya Dryomov 	if (object_map_size != num_objects) {
200322e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
200422e8bd51SIlya Dryomov 			 object_map_size, num_objects);
200522e8bd51SIlya Dryomov 		ret = -EINVAL;
200622e8bd51SIlya Dryomov 		goto out;
200722e8bd51SIlya Dryomov 	}
200822e8bd51SIlya Dryomov 
200922e8bd51SIlya Dryomov 	if (offset_in_page(p) + object_map_bytes > reply_len) {
201022e8bd51SIlya Dryomov 		ret = -EINVAL;
201122e8bd51SIlya Dryomov 		goto out;
201222e8bd51SIlya Dryomov 	}
201322e8bd51SIlya Dryomov 
201422e8bd51SIlya Dryomov 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
201522e8bd51SIlya Dryomov 	if (!rbd_dev->object_map) {
201622e8bd51SIlya Dryomov 		ret = -ENOMEM;
201722e8bd51SIlya Dryomov 		goto out;
201822e8bd51SIlya Dryomov 	}
201922e8bd51SIlya Dryomov 
202022e8bd51SIlya Dryomov 	rbd_dev->object_map_size = object_map_size;
202122e8bd51SIlya Dryomov 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
202222e8bd51SIlya Dryomov 				   offset_in_page(p), object_map_bytes);
202322e8bd51SIlya Dryomov 
202422e8bd51SIlya Dryomov out:
202522e8bd51SIlya Dryomov 	ceph_release_page_vector(pages, num_pages);
202622e8bd51SIlya Dryomov 	return ret;
202722e8bd51SIlya Dryomov }
202822e8bd51SIlya Dryomov 
202922e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev)
203022e8bd51SIlya Dryomov {
203122e8bd51SIlya Dryomov 	kvfree(rbd_dev->object_map);
203222e8bd51SIlya Dryomov 	rbd_dev->object_map = NULL;
203322e8bd51SIlya Dryomov 	rbd_dev->object_map_size = 0;
203422e8bd51SIlya Dryomov }
203522e8bd51SIlya Dryomov 
203622e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev)
203722e8bd51SIlya Dryomov {
203822e8bd51SIlya Dryomov 	int ret;
203922e8bd51SIlya Dryomov 
204022e8bd51SIlya Dryomov 	ret = __rbd_object_map_load(rbd_dev);
204122e8bd51SIlya Dryomov 	if (ret)
204222e8bd51SIlya Dryomov 		return ret;
204322e8bd51SIlya Dryomov 
204422e8bd51SIlya Dryomov 	ret = rbd_dev_v2_get_flags(rbd_dev);
204522e8bd51SIlya Dryomov 	if (ret) {
204622e8bd51SIlya Dryomov 		rbd_object_map_free(rbd_dev);
204722e8bd51SIlya Dryomov 		return ret;
204822e8bd51SIlya Dryomov 	}
204922e8bd51SIlya Dryomov 
205022e8bd51SIlya Dryomov 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
205122e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map is invalid");
205222e8bd51SIlya Dryomov 
205322e8bd51SIlya Dryomov 	return 0;
205422e8bd51SIlya Dryomov }
205522e8bd51SIlya Dryomov 
/*
 * Lock the HEAD object map object and load the object map.  The lock is
 * dropped again if loading fails.
 *
 * Returns 0 on success or a negative error code.
 */
static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int err;

	err = rbd_object_map_lock(rbd_dev);
	if (err)
		return err;

	err = rbd_object_map_load(rbd_dev);
	if (err)
		rbd_object_map_unlock(rbd_dev);

	return err;
}
207222e8bd51SIlya Dryomov 
/*
 * Counterpart of rbd_object_map_open(): discard the in-memory object map
 * first, then release the lock on the object map object.
 */
static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}
207822e8bd51SIlya Dryomov 
207922e8bd51SIlya Dryomov /*
208022e8bd51SIlya Dryomov  * This function needs snap_id (or more precisely just something to
208122e8bd51SIlya Dryomov  * distinguish between HEAD and snapshot object maps), new_state and
208222e8bd51SIlya Dryomov  * current_state that were passed to rbd_object_map_update().
208322e8bd51SIlya Dryomov  *
208422e8bd51SIlya Dryomov  * To avoid allocating and stashing a context we piggyback on the OSD
208522e8bd51SIlya Dryomov  * request.  A HEAD update has two ops (assert_locked).  For new_state
208622e8bd51SIlya Dryomov  * and current_state we decode our own object_map_update op, encoded in
208722e8bd51SIlya Dryomov  * rbd_cls_object_map_update().
208822e8bd51SIlya Dryomov  */
208922e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
209022e8bd51SIlya Dryomov 					struct ceph_osd_request *osd_req)
209122e8bd51SIlya Dryomov {
209222e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
209322e8bd51SIlya Dryomov 	struct ceph_osd_data *osd_data;
209422e8bd51SIlya Dryomov 	u64 objno;
2095633739b2SIlya Dryomov 	u8 state, new_state, uninitialized_var(current_state);
209622e8bd51SIlya Dryomov 	bool has_current_state;
209722e8bd51SIlya Dryomov 	void *p;
209822e8bd51SIlya Dryomov 
209922e8bd51SIlya Dryomov 	if (osd_req->r_result)
210022e8bd51SIlya Dryomov 		return osd_req->r_result;
210122e8bd51SIlya Dryomov 
210222e8bd51SIlya Dryomov 	/*
210322e8bd51SIlya Dryomov 	 * Nothing to do for a snapshot object map.
210422e8bd51SIlya Dryomov 	 */
210522e8bd51SIlya Dryomov 	if (osd_req->r_num_ops == 1)
210622e8bd51SIlya Dryomov 		return 0;
210722e8bd51SIlya Dryomov 
210822e8bd51SIlya Dryomov 	/*
210922e8bd51SIlya Dryomov 	 * Update in-memory HEAD object map.
211022e8bd51SIlya Dryomov 	 */
211122e8bd51SIlya Dryomov 	rbd_assert(osd_req->r_num_ops == 2);
211222e8bd51SIlya Dryomov 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
211322e8bd51SIlya Dryomov 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
211422e8bd51SIlya Dryomov 
211522e8bd51SIlya Dryomov 	p = page_address(osd_data->pages[0]);
211622e8bd51SIlya Dryomov 	objno = ceph_decode_64(&p);
211722e8bd51SIlya Dryomov 	rbd_assert(objno == obj_req->ex.oe_objno);
211822e8bd51SIlya Dryomov 	rbd_assert(ceph_decode_64(&p) == objno + 1);
211922e8bd51SIlya Dryomov 	new_state = ceph_decode_8(&p);
212022e8bd51SIlya Dryomov 	has_current_state = ceph_decode_8(&p);
212122e8bd51SIlya Dryomov 	if (has_current_state)
212222e8bd51SIlya Dryomov 		current_state = ceph_decode_8(&p);
212322e8bd51SIlya Dryomov 
212422e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
212522e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
212622e8bd51SIlya Dryomov 	if (!has_current_state || current_state == state ||
212722e8bd51SIlya Dryomov 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
212822e8bd51SIlya Dryomov 		__rbd_object_map_set(rbd_dev, objno, new_state);
212922e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
213022e8bd51SIlya Dryomov 
213122e8bd51SIlya Dryomov 	return 0;
213222e8bd51SIlya Dryomov }
213322e8bd51SIlya Dryomov 
213422e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
213522e8bd51SIlya Dryomov {
213622e8bd51SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
213722e8bd51SIlya Dryomov 	int result;
213822e8bd51SIlya Dryomov 
213922e8bd51SIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
214022e8bd51SIlya Dryomov 	     osd_req->r_result, obj_req);
214122e8bd51SIlya Dryomov 
214222e8bd51SIlya Dryomov 	result = rbd_object_map_update_finish(obj_req, osd_req);
214322e8bd51SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
214422e8bd51SIlya Dryomov }
214522e8bd51SIlya Dryomov 
214622e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
214722e8bd51SIlya Dryomov {
214822e8bd51SIlya Dryomov 	u8 state = rbd_object_map_get(rbd_dev, objno);
214922e8bd51SIlya Dryomov 
215022e8bd51SIlya Dryomov 	if (state == new_state ||
215122e8bd51SIlya Dryomov 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
215222e8bd51SIlya Dryomov 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
215322e8bd51SIlya Dryomov 		return false;
215422e8bd51SIlya Dryomov 
215522e8bd51SIlya Dryomov 	return true;
215622e8bd51SIlya Dryomov }
215722e8bd51SIlya Dryomov 
215822e8bd51SIlya Dryomov static int rbd_cls_object_map_update(struct ceph_osd_request *req,
215922e8bd51SIlya Dryomov 				     int which, u64 objno, u8 new_state,
216022e8bd51SIlya Dryomov 				     const u8 *current_state)
216122e8bd51SIlya Dryomov {
216222e8bd51SIlya Dryomov 	struct page **pages;
216322e8bd51SIlya Dryomov 	void *p, *start;
216422e8bd51SIlya Dryomov 	int ret;
216522e8bd51SIlya Dryomov 
216622e8bd51SIlya Dryomov 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
216722e8bd51SIlya Dryomov 	if (ret)
216822e8bd51SIlya Dryomov 		return ret;
216922e8bd51SIlya Dryomov 
217022e8bd51SIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
217122e8bd51SIlya Dryomov 	if (IS_ERR(pages))
217222e8bd51SIlya Dryomov 		return PTR_ERR(pages);
217322e8bd51SIlya Dryomov 
217422e8bd51SIlya Dryomov 	p = start = page_address(pages[0]);
217522e8bd51SIlya Dryomov 	ceph_encode_64(&p, objno);
217622e8bd51SIlya Dryomov 	ceph_encode_64(&p, objno + 1);
217722e8bd51SIlya Dryomov 	ceph_encode_8(&p, new_state);
217822e8bd51SIlya Dryomov 	if (current_state) {
217922e8bd51SIlya Dryomov 		ceph_encode_8(&p, 1);
218022e8bd51SIlya Dryomov 		ceph_encode_8(&p, *current_state);
218122e8bd51SIlya Dryomov 	} else {
218222e8bd51SIlya Dryomov 		ceph_encode_8(&p, 0);
218322e8bd51SIlya Dryomov 	}
218422e8bd51SIlya Dryomov 
218522e8bd51SIlya Dryomov 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
218622e8bd51SIlya Dryomov 					  false, true);
218722e8bd51SIlya Dryomov 	return 0;
218822e8bd51SIlya Dryomov }
218922e8bd51SIlya Dryomov 
219022e8bd51SIlya Dryomov /*
219122e8bd51SIlya Dryomov  * Return:
219222e8bd51SIlya Dryomov  *   0 - object map update sent
219322e8bd51SIlya Dryomov  *   1 - object map update isn't needed
219422e8bd51SIlya Dryomov  *  <0 - error
219522e8bd51SIlya Dryomov  */
219622e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
219722e8bd51SIlya Dryomov 				 u8 new_state, const u8 *current_state)
219822e8bd51SIlya Dryomov {
219922e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
220022e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
220122e8bd51SIlya Dryomov 	struct ceph_osd_request *req;
220222e8bd51SIlya Dryomov 	int num_ops = 1;
220322e8bd51SIlya Dryomov 	int which = 0;
220422e8bd51SIlya Dryomov 	int ret;
220522e8bd51SIlya Dryomov 
220622e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
220722e8bd51SIlya Dryomov 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
220822e8bd51SIlya Dryomov 			return 1;
220922e8bd51SIlya Dryomov 
221022e8bd51SIlya Dryomov 		num_ops++; /* assert_locked */
221122e8bd51SIlya Dryomov 	}
221222e8bd51SIlya Dryomov 
221322e8bd51SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
221422e8bd51SIlya Dryomov 	if (!req)
221522e8bd51SIlya Dryomov 		return -ENOMEM;
221622e8bd51SIlya Dryomov 
221722e8bd51SIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
221822e8bd51SIlya Dryomov 	req->r_callback = rbd_object_map_callback;
221922e8bd51SIlya Dryomov 	req->r_priv = obj_req;
222022e8bd51SIlya Dryomov 
222122e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
222222e8bd51SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
222322e8bd51SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_WRITE;
222422e8bd51SIlya Dryomov 	ktime_get_real_ts64(&req->r_mtime);
222522e8bd51SIlya Dryomov 
222622e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
222722e8bd51SIlya Dryomov 		/*
222822e8bd51SIlya Dryomov 		 * Protect against possible race conditions during lock
222922e8bd51SIlya Dryomov 		 * ownership transitions.
223022e8bd51SIlya Dryomov 		 */
223122e8bd51SIlya Dryomov 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
223222e8bd51SIlya Dryomov 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
223322e8bd51SIlya Dryomov 		if (ret)
223422e8bd51SIlya Dryomov 			return ret;
223522e8bd51SIlya Dryomov 	}
223622e8bd51SIlya Dryomov 
223722e8bd51SIlya Dryomov 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
223822e8bd51SIlya Dryomov 					new_state, current_state);
223922e8bd51SIlya Dryomov 	if (ret)
224022e8bd51SIlya Dryomov 		return ret;
224122e8bd51SIlya Dryomov 
224222e8bd51SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
224322e8bd51SIlya Dryomov 	if (ret)
224422e8bd51SIlya Dryomov 		return ret;
224522e8bd51SIlya Dryomov 
224622e8bd51SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
224722e8bd51SIlya Dryomov 	return 0;
224822e8bd51SIlya Dryomov }
224922e8bd51SIlya Dryomov 
225086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents,
225186bd7998SIlya Dryomov 			  u32 *num_img_extents, u64 overlap)
2252e93f3152SAlex Elder {
225386bd7998SIlya Dryomov 	u32 cnt = *num_img_extents;
2254e93f3152SAlex Elder 
225586bd7998SIlya Dryomov 	/* drop extents completely beyond the overlap */
225686bd7998SIlya Dryomov 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
225786bd7998SIlya Dryomov 		cnt--;
2258e93f3152SAlex Elder 
225986bd7998SIlya Dryomov 	if (cnt) {
226086bd7998SIlya Dryomov 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2261e93f3152SAlex Elder 
226286bd7998SIlya Dryomov 		/* trim final overlapping extent */
226386bd7998SIlya Dryomov 		if (ex->fe_off + ex->fe_len > overlap)
226486bd7998SIlya Dryomov 			ex->fe_len = overlap - ex->fe_off;
2265e93f3152SAlex Elder 	}
2266e93f3152SAlex Elder 
226786bd7998SIlya Dryomov 	*num_img_extents = cnt;
226886bd7998SIlya Dryomov }
226986bd7998SIlya Dryomov 
227086bd7998SIlya Dryomov /*
227186bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
227286bd7998SIlya Dryomov  * or the entire object in the parent image.
227386bd7998SIlya Dryomov  */
227486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
227586bd7998SIlya Dryomov 				    bool entire)
2276e93f3152SAlex Elder {
227786bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2278c5b5ef6cSAlex Elder 	int ret;
2279c5b5ef6cSAlex Elder 
228086bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
228186bd7998SIlya Dryomov 		return 0;
228286bd7998SIlya Dryomov 
228386bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
228486bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
228586bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
228686bd7998SIlya Dryomov 							obj_req->ex.oe_len,
228786bd7998SIlya Dryomov 				  &obj_req->img_extents,
228886bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
228986bd7998SIlya Dryomov 	if (ret)
229086bd7998SIlya Dryomov 		return ret;
229186bd7998SIlya Dryomov 
229286bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
229386bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
229486bd7998SIlya Dryomov 	return 0;
229586bd7998SIlya Dryomov }
229686bd7998SIlya Dryomov 
2297bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
22983da691bfSIlya Dryomov {
2299bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2300bcbab1dbSIlya Dryomov 
2301ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
23023da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
2303bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, which,
23043da691bfSIlya Dryomov 					       &obj_req->bio_pos,
230543df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
23063da691bfSIlya Dryomov 		break;
23073da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
2308afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
23093da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
231043df3d35SIlya Dryomov 							obj_req->ex.oe_len);
2311afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2312bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
23133da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
23143da691bfSIlya Dryomov 		break;
23153da691bfSIlya Dryomov 	default:
231616809372SArnd Bergmann 		BUG();
23173da691bfSIlya Dryomov 	}
23183da691bfSIlya Dryomov }
23193da691bfSIlya Dryomov 
2320bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
23213da691bfSIlya Dryomov {
23223da691bfSIlya Dryomov 	struct page **pages;
23233da691bfSIlya Dryomov 
2324c5b5ef6cSAlex Elder 	/*
2325c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2326c5b5ef6cSAlex Elder 	 *     le64 length;
2327c5b5ef6cSAlex Elder 	 *     struct {
2328c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2329c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2330c5b5ef6cSAlex Elder 	 *     } mtime;
2331c5b5ef6cSAlex Elder 	 */
23323da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
23333da691bfSIlya Dryomov 	if (IS_ERR(pages))
23343da691bfSIlya Dryomov 		return PTR_ERR(pages);
23353da691bfSIlya Dryomov 
2336bcbab1dbSIlya Dryomov 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2337bcbab1dbSIlya Dryomov 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
23383da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
23393da691bfSIlya Dryomov 				     0, false, true);
23403da691bfSIlya Dryomov 	return 0;
2341710214e3SIlya Dryomov }
2342c5b5ef6cSAlex Elder 
2343b5ae8cbcSIlya Dryomov static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2344b5ae8cbcSIlya Dryomov 				u32 bytes)
234513488d53SIlya Dryomov {
2346b5ae8cbcSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2347b5ae8cbcSIlya Dryomov 	int ret;
2348b5ae8cbcSIlya Dryomov 
2349b5ae8cbcSIlya Dryomov 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2350b5ae8cbcSIlya Dryomov 	if (ret)
2351b5ae8cbcSIlya Dryomov 		return ret;
2352b5ae8cbcSIlya Dryomov 
2353b5ae8cbcSIlya Dryomov 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2354b5ae8cbcSIlya Dryomov 					  obj_req->copyup_bvec_count, bytes);
2355b5ae8cbcSIlya Dryomov 	return 0;
235613488d53SIlya Dryomov }
235713488d53SIlya Dryomov 
2358ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
23593da691bfSIlya Dryomov {
2360ea9b743cSIlya Dryomov 	obj_req->read_state = RBD_OBJ_READ_START;
2361ea9b743cSIlya Dryomov 	return 0;
2362ea9b743cSIlya Dryomov }
2363ea9b743cSIlya Dryomov 
2364bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2365bcbab1dbSIlya Dryomov 				      int which)
23663da691bfSIlya Dryomov {
2367bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
23683da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
23693da691bfSIlya Dryomov 	u16 opcode;
2370c5b5ef6cSAlex Elder 
23718b5bec5cSIlya Dryomov 	if (!use_object_map(rbd_dev) ||
23728b5bec5cSIlya Dryomov 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2373bcbab1dbSIlya Dryomov 		osd_req_op_alloc_hint_init(osd_req, which++,
23743da691bfSIlya Dryomov 					   rbd_dev->layout.object_size,
23753da691bfSIlya Dryomov 					   rbd_dev->layout.object_size);
23768b5bec5cSIlya Dryomov 	}
2377c5b5ef6cSAlex Elder 
23783da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
23793da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
23803da691bfSIlya Dryomov 	else
23813da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
2382c5b5ef6cSAlex Elder 
2383bcbab1dbSIlya Dryomov 	osd_req_op_extent_init(osd_req, which, opcode,
238443df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2385bcbab1dbSIlya Dryomov 	rbd_osd_setup_data(osd_req, which);
23863da691bfSIlya Dryomov }
23873da691bfSIlya Dryomov 
2388ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
23893da691bfSIlya Dryomov {
23903da691bfSIlya Dryomov 	int ret;
23913da691bfSIlya Dryomov 
239286bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
239386bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
239486bd7998SIlya Dryomov 	if (ret)
239586bd7998SIlya Dryomov 		return ret;
239686bd7998SIlya Dryomov 
23970ad5d953SIlya Dryomov 	if (rbd_obj_copyup_enabled(obj_req))
23980ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
23993da691bfSIlya Dryomov 
240085b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
24013da691bfSIlya Dryomov 	return 0;
240270d045f6SIlya Dryomov }
240370d045f6SIlya Dryomov 
24046484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
24056484cbe9SIlya Dryomov {
24066484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
24076484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
24086484cbe9SIlya Dryomov }
24096484cbe9SIlya Dryomov 
241027bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
241127bbd911SIlya Dryomov 					int which)
241227bbd911SIlya Dryomov {
241327bbd911SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
241427bbd911SIlya Dryomov 
241527bbd911SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
241627bbd911SIlya Dryomov 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
241727bbd911SIlya Dryomov 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
241827bbd911SIlya Dryomov 	} else {
241927bbd911SIlya Dryomov 		osd_req_op_extent_init(osd_req, which,
242027bbd911SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
242127bbd911SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
242227bbd911SIlya Dryomov 				       0, 0);
242327bbd911SIlya Dryomov 	}
242427bbd911SIlya Dryomov }
242527bbd911SIlya Dryomov 
2426ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
24276484cbe9SIlya Dryomov {
24280c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
242927bbd911SIlya Dryomov 	u64 off, next_off;
24306484cbe9SIlya Dryomov 	int ret;
24316484cbe9SIlya Dryomov 
24320c93e1b7SIlya Dryomov 	/*
24330c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
24340c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
24350c93e1b7SIlya Dryomov 	 *
24360c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
24370c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
24380c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
24390c93e1b7SIlya Dryomov 	 */
24400c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
24410c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
244227bbd911SIlya Dryomov 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
244327bbd911SIlya Dryomov 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
244427bbd911SIlya Dryomov 				      rbd_dev->opts->alloc_size);
24450c93e1b7SIlya Dryomov 		if (off >= next_off)
24460c93e1b7SIlya Dryomov 			return 1;
244727bbd911SIlya Dryomov 
244827bbd911SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
244927bbd911SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
245027bbd911SIlya Dryomov 		     off, next_off - off);
245127bbd911SIlya Dryomov 		obj_req->ex.oe_off = off;
245227bbd911SIlya Dryomov 		obj_req->ex.oe_len = next_off - off;
24530c93e1b7SIlya Dryomov 	}
24540c93e1b7SIlya Dryomov 
24556484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
24566484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
24576484cbe9SIlya Dryomov 	if (ret)
24586484cbe9SIlya Dryomov 		return ret;
24596484cbe9SIlya Dryomov 
246022e8bd51SIlya Dryomov 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
24610ad5d953SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
24620ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
24636484cbe9SIlya Dryomov 
246485b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
24656484cbe9SIlya Dryomov 	return 0;
24666484cbe9SIlya Dryomov }
24676484cbe9SIlya Dryomov 
2468bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2469bcbab1dbSIlya Dryomov 					int which)
247013488d53SIlya Dryomov {
2471bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
24723da691bfSIlya Dryomov 	u16 opcode;
2473058aa991SIlya Dryomov 
24743da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
247586bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
24760ad5d953SIlya Dryomov 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2477bcbab1dbSIlya Dryomov 				osd_req_op_init(osd_req, which++,
24782bb1e56eSIlya Dryomov 						CEPH_OSD_OP_CREATE, 0);
24793da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
24803da691bfSIlya Dryomov 		} else {
24810ad5d953SIlya Dryomov 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2482bcbab1dbSIlya Dryomov 			osd_req_op_init(osd_req, which++,
24833da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
24843da691bfSIlya Dryomov 			opcode = 0;
24853da691bfSIlya Dryomov 		}
24863da691bfSIlya Dryomov 	} else {
24876484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
24883da691bfSIlya Dryomov 	}
24893da691bfSIlya Dryomov 
24903da691bfSIlya Dryomov 	if (opcode)
2491bcbab1dbSIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode,
249243df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
24933da691bfSIlya Dryomov 				       0, 0);
24943da691bfSIlya Dryomov }
24953da691bfSIlya Dryomov 
2496ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
24973da691bfSIlya Dryomov {
24983da691bfSIlya Dryomov 	int ret;
24993da691bfSIlya Dryomov 
250086bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
250186bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
250286bd7998SIlya Dryomov 	if (ret)
250386bd7998SIlya Dryomov 		return ret;
250486bd7998SIlya Dryomov 
25050ad5d953SIlya Dryomov 	if (rbd_obj_copyup_enabled(obj_req))
25060ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
25070ad5d953SIlya Dryomov 	if (!obj_req->num_img_extents) {
250822e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
25090ad5d953SIlya Dryomov 		if (rbd_obj_is_entire(obj_req))
25100ad5d953SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
25113da691bfSIlya Dryomov 	}
25123da691bfSIlya Dryomov 
251385b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
2514980917fcSIlya Dryomov 	return 0;
2515b454e36dSAlex Elder }
2516b454e36dSAlex Elder 
2517a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req)
2518a086a1b8SIlya Dryomov {
25198b5bec5cSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
25208b5bec5cSIlya Dryomov 
25218b5bec5cSIlya Dryomov 	switch (img_req->op_type) {
2522a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
25238b5bec5cSIlya Dryomov 		if (!use_object_map(img_req->rbd_dev) ||
25248b5bec5cSIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2525a086a1b8SIlya Dryomov 			return 2; /* setallochint + write/writefull */
25268b5bec5cSIlya Dryomov 
25278b5bec5cSIlya Dryomov 		return 1; /* write/writefull */
2528a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2529a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2530a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2531a086a1b8SIlya Dryomov 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2532a086a1b8SIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2533a086a1b8SIlya Dryomov 			return 2; /* create + truncate */
2534a086a1b8SIlya Dryomov 
2535a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2536a086a1b8SIlya Dryomov 	default:
2537a086a1b8SIlya Dryomov 		BUG();
2538a086a1b8SIlya Dryomov 	}
2539a086a1b8SIlya Dryomov }
2540a086a1b8SIlya Dryomov 
2541a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2542a086a1b8SIlya Dryomov 				    int which)
2543a086a1b8SIlya Dryomov {
2544a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2545a086a1b8SIlya Dryomov 
2546a086a1b8SIlya Dryomov 	switch (obj_req->img_request->op_type) {
2547a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
2548a086a1b8SIlya Dryomov 		__rbd_osd_setup_write_ops(osd_req, which);
2549a086a1b8SIlya Dryomov 		break;
2550a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2551a086a1b8SIlya Dryomov 		__rbd_osd_setup_discard_ops(osd_req, which);
2552a086a1b8SIlya Dryomov 		break;
2553a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2554a086a1b8SIlya Dryomov 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2555a086a1b8SIlya Dryomov 		break;
2556a086a1b8SIlya Dryomov 	default:
2557a086a1b8SIlya Dryomov 		BUG();
2558a086a1b8SIlya Dryomov 	}
2559a086a1b8SIlya Dryomov }
2560a086a1b8SIlya Dryomov 
2561b454e36dSAlex Elder /*
2562a086a1b8SIlya Dryomov  * Prune the list of object requests (adjust offset and/or length, drop
2563a086a1b8SIlya Dryomov  * redundant requests).  Prepare object request state machines and image
2564a086a1b8SIlya Dryomov  * request state machine for execution.
2565b454e36dSAlex Elder  */
25663da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
25673da691bfSIlya Dryomov {
25680c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
25693da691bfSIlya Dryomov 	int ret;
25703d7efd18SAlex Elder 
25710c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
25729bb0248dSIlya Dryomov 		switch (img_req->op_type) {
25733da691bfSIlya Dryomov 		case OBJ_OP_READ:
2574ea9b743cSIlya Dryomov 			ret = rbd_obj_init_read(obj_req);
25753da691bfSIlya Dryomov 			break;
25763da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
2577ea9b743cSIlya Dryomov 			ret = rbd_obj_init_write(obj_req);
25783da691bfSIlya Dryomov 			break;
25793da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
2580ea9b743cSIlya Dryomov 			ret = rbd_obj_init_discard(obj_req);
25813da691bfSIlya Dryomov 			break;
25826484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
2583ea9b743cSIlya Dryomov 			ret = rbd_obj_init_zeroout(obj_req);
25846484cbe9SIlya Dryomov 			break;
25853da691bfSIlya Dryomov 		default:
258616809372SArnd Bergmann 			BUG();
25873da691bfSIlya Dryomov 		}
25880c93e1b7SIlya Dryomov 		if (ret < 0)
25893da691bfSIlya Dryomov 			return ret;
25900c93e1b7SIlya Dryomov 		if (ret > 0) {
25910c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
25920c93e1b7SIlya Dryomov 			continue;
25930c93e1b7SIlya Dryomov 		}
2594b454e36dSAlex Elder 	}
2595b454e36dSAlex Elder 
25960192ce2eSIlya Dryomov 	img_req->state = RBD_IMG_START;
25973da691bfSIlya Dryomov 	return 0;
25983da691bfSIlya Dryomov }
25993da691bfSIlya Dryomov 
26005a237819SIlya Dryomov union rbd_img_fill_iter {
26015a237819SIlya Dryomov 	struct ceph_bio_iter	bio_iter;
26025a237819SIlya Dryomov 	struct ceph_bvec_iter	bvec_iter;
26035a237819SIlya Dryomov };
26045a237819SIlya Dryomov 
26055a237819SIlya Dryomov struct rbd_img_fill_ctx {
26065a237819SIlya Dryomov 	enum obj_request_type	pos_type;
26075a237819SIlya Dryomov 	union rbd_img_fill_iter	*pos;
26085a237819SIlya Dryomov 	union rbd_img_fill_iter	iter;
26095a237819SIlya Dryomov 	ceph_object_extent_fn_t	set_pos_fn;
2610afb97888SIlya Dryomov 	ceph_object_extent_fn_t	count_fn;
2611afb97888SIlya Dryomov 	ceph_object_extent_fn_t	copy_fn;
26125a237819SIlya Dryomov };
26135a237819SIlya Dryomov 
26145a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
26155a237819SIlya Dryomov {
26165a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
26175a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
26185a237819SIlya Dryomov 
26195a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
26205a237819SIlya Dryomov 	if (!obj_req)
26215a237819SIlya Dryomov 		return NULL;
26225a237819SIlya Dryomov 
26235a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
26245a237819SIlya Dryomov 	return &obj_req->ex;
26255a237819SIlya Dryomov }
26265a237819SIlya Dryomov 
26275a237819SIlya Dryomov /*
2628afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2629afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2630afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2631afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2632afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
26335a237819SIlya Dryomov  */
2634afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2635afb97888SIlya Dryomov {
2636afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2637afb97888SIlya Dryomov }
2638afb97888SIlya Dryomov 
2639afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
26405a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
26415a237819SIlya Dryomov 				       u32 num_img_extents,
26425a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
26435a237819SIlya Dryomov {
26445a237819SIlya Dryomov 	u32 i;
26455a237819SIlya Dryomov 	int ret;
26465a237819SIlya Dryomov 
26475a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
26485a237819SIlya Dryomov 
26495a237819SIlya Dryomov 	/*
26505a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
26515a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
26525a237819SIlya Dryomov 	 */
26535a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
26545a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
26555a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
26565a237819SIlya Dryomov 					   img_extents[i].fe_off,
26575a237819SIlya Dryomov 					   img_extents[i].fe_len,
26585a237819SIlya Dryomov 					   &img_req->object_extents,
26595a237819SIlya Dryomov 					   alloc_object_extent, img_req,
26605a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
26615a237819SIlya Dryomov 		if (ret)
26625a237819SIlya Dryomov 			return ret;
26635a237819SIlya Dryomov 	}
26645a237819SIlya Dryomov 
26655a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
26665a237819SIlya Dryomov }
26675a237819SIlya Dryomov 
2668afb97888SIlya Dryomov /*
2669afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2670afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2671afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2672afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2673afb97888SIlya Dryomov  * @fctx->pos data buffer.
2674afb97888SIlya Dryomov  *
2675afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2676afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2677afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2678afb97888SIlya Dryomov  *
2679afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
2680afb97888SIlya Dryomov  */
2681afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2682afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2683afb97888SIlya Dryomov 				u32 num_img_extents,
2684afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2685afb97888SIlya Dryomov {
2686afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2687afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2688afb97888SIlya Dryomov 	u32 i;
2689afb97888SIlya Dryomov 	int ret;
2690afb97888SIlya Dryomov 
2691afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2692afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2693afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2694afb97888SIlya Dryomov 						   num_img_extents, fctx);
2695afb97888SIlya Dryomov 
2696afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2697afb97888SIlya Dryomov 
2698afb97888SIlya Dryomov 	/*
2699afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2700afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2701afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2702afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2703afb97888SIlya Dryomov 	 * stripe unit boundaries.
2704afb97888SIlya Dryomov 	 */
2705afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2706afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2707afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2708afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2709afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2710afb97888SIlya Dryomov 					   &img_req->object_extents,
2711afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2712afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2713afb97888SIlya Dryomov 		if (ret)
2714afb97888SIlya Dryomov 			return ret;
2715afb97888SIlya Dryomov 	}
2716afb97888SIlya Dryomov 
2717afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2718afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2719afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2720afb97888SIlya Dryomov 					      GFP_NOIO);
2721afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2722afb97888SIlya Dryomov 			return -ENOMEM;
2723afb97888SIlya Dryomov 	}
2724afb97888SIlya Dryomov 
2725afb97888SIlya Dryomov 	/*
2726afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2727afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2728afb97888SIlya Dryomov 	 */
2729afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2730afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2731afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2732afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2733afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2734afb97888SIlya Dryomov 					   &img_req->object_extents,
2735afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2736afb97888SIlya Dryomov 		if (ret)
2737afb97888SIlya Dryomov 			return ret;
2738afb97888SIlya Dryomov 	}
2739afb97888SIlya Dryomov 
2740afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2741afb97888SIlya Dryomov }
2742afb97888SIlya Dryomov 
27435a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
27445a237819SIlya Dryomov 			       u64 off, u64 len)
27455a237819SIlya Dryomov {
27465a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
27475a237819SIlya Dryomov 	union rbd_img_fill_iter dummy;
27485a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
27495a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
27505a237819SIlya Dryomov 		.pos = &dummy,
27515a237819SIlya Dryomov 	};
27525a237819SIlya Dryomov 
27535a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
27545a237819SIlya Dryomov }
27555a237819SIlya Dryomov 
27565a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
27575a237819SIlya Dryomov {
27585a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
27595a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
27605a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
27615a237819SIlya Dryomov 
27625a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
27635a237819SIlya Dryomov 	obj_req->bio_pos = *it;
27645a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
27655a237819SIlya Dryomov }
27665a237819SIlya Dryomov 
2767afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2768afb97888SIlya Dryomov {
2769afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2770afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2771afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2772afb97888SIlya Dryomov 
2773afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2774afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2775afb97888SIlya Dryomov 		obj_req->bvec_count++;
2776afb97888SIlya Dryomov 	}));
2777afb97888SIlya Dryomov 
2778afb97888SIlya Dryomov }
2779afb97888SIlya Dryomov 
2780afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2781afb97888SIlya Dryomov {
2782afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2783afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2784afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2785afb97888SIlya Dryomov 
2786afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2787afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2788afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2789afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2790afb97888SIlya Dryomov 	}));
2791afb97888SIlya Dryomov }
2792afb97888SIlya Dryomov 
27935a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
27945a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
27955a237819SIlya Dryomov 				   u32 num_img_extents,
27965a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
27975a237819SIlya Dryomov {
27985a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
27995a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
28005a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
28015a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2802afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2803afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
28045a237819SIlya Dryomov 	};
28055a237819SIlya Dryomov 
28065a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
28075a237819SIlya Dryomov 				    &fctx);
28085a237819SIlya Dryomov }
28095a237819SIlya Dryomov 
28105a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
28115a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
28125a237819SIlya Dryomov {
28135a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
28145a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
28155a237819SIlya Dryomov 
28165a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
28175a237819SIlya Dryomov }
28185a237819SIlya Dryomov 
28195a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
28205a237819SIlya Dryomov {
28215a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
28225a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
28235a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
28245a237819SIlya Dryomov 
28255a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
28265a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
28275a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
28285a237819SIlya Dryomov }
28295a237819SIlya Dryomov 
2830afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2831afb97888SIlya Dryomov {
2832afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2833afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2834afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2835afb97888SIlya Dryomov 
2836afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2837afb97888SIlya Dryomov 		obj_req->bvec_count++;
2838afb97888SIlya Dryomov 	}));
2839afb97888SIlya Dryomov }
2840afb97888SIlya Dryomov 
2841afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2842afb97888SIlya Dryomov {
2843afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2844afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2845afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2846afb97888SIlya Dryomov 
2847afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2848afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2849afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2850afb97888SIlya Dryomov 	}));
2851afb97888SIlya Dryomov }
2852afb97888SIlya Dryomov 
28535a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
28545a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
28555a237819SIlya Dryomov 				     u32 num_img_extents,
28565a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
28575a237819SIlya Dryomov {
28585a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
28595a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
28605a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
28615a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2862afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2863afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
28645a237819SIlya Dryomov 	};
28655a237819SIlya Dryomov 
28665a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
28675a237819SIlya Dryomov 				    &fctx);
28685a237819SIlya Dryomov }
28695a237819SIlya Dryomov 
28705a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
28715a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
28725a237819SIlya Dryomov 				   u32 num_img_extents,
28735a237819SIlya Dryomov 				   struct bio_vec *bvecs)
28745a237819SIlya Dryomov {
28755a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
28765a237819SIlya Dryomov 		.bvecs = bvecs,
28775a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
28785a237819SIlya Dryomov 							     num_img_extents) },
28795a237819SIlya Dryomov 	};
28805a237819SIlya Dryomov 
28815a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
28825a237819SIlya Dryomov 					 &it);
28835a237819SIlya Dryomov }
28845a237819SIlya Dryomov 
28850192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work)
2886bf0d5f50SAlex Elder {
28870192ce2eSIlya Dryomov 	struct rbd_img_request *img_req =
28880192ce2eSIlya Dryomov 	    container_of(work, struct rbd_img_request, work);
2889bf0d5f50SAlex Elder 
28900192ce2eSIlya Dryomov 	rbd_img_handle_request(img_req, img_req->work_result);
28910192ce2eSIlya Dryomov }
2892bf0d5f50SAlex Elder 
28930192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
28940192ce2eSIlya Dryomov {
28950192ce2eSIlya Dryomov 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
28960192ce2eSIlya Dryomov 	img_req->work_result = result;
28970192ce2eSIlya Dryomov 	queue_work(rbd_wq, &img_req->work);
2898bf0d5f50SAlex Elder }
2899bf0d5f50SAlex Elder 
290022e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
290122e8bd51SIlya Dryomov {
290222e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
290322e8bd51SIlya Dryomov 
290422e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
290522e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
290622e8bd51SIlya Dryomov 		return true;
290722e8bd51SIlya Dryomov 	}
290822e8bd51SIlya Dryomov 
290922e8bd51SIlya Dryomov 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
291022e8bd51SIlya Dryomov 	     obj_req->ex.oe_objno);
291122e8bd51SIlya Dryomov 	return false;
291222e8bd51SIlya Dryomov }
291322e8bd51SIlya Dryomov 
291485b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
291585b5e6d1SIlya Dryomov {
2916a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
2917a086a1b8SIlya Dryomov 	int ret;
2918a086a1b8SIlya Dryomov 
2919a086a1b8SIlya Dryomov 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2920a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
2921a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
2922a086a1b8SIlya Dryomov 
2923a086a1b8SIlya Dryomov 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2924a086a1b8SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2925a086a1b8SIlya Dryomov 	rbd_osd_setup_data(osd_req, 0);
2926a086a1b8SIlya Dryomov 	rbd_osd_format_read(osd_req);
2927a086a1b8SIlya Dryomov 
2928a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2929a086a1b8SIlya Dryomov 	if (ret)
2930a086a1b8SIlya Dryomov 		return ret;
2931a086a1b8SIlya Dryomov 
2932a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
293385b5e6d1SIlya Dryomov 	return 0;
2934bf0d5f50SAlex Elder }
2935bf0d5f50SAlex Elder 
293686bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
29373da691bfSIlya Dryomov {
29383da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
29393da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
29403da691bfSIlya Dryomov 	int ret;
29413da691bfSIlya Dryomov 
2942e93aca0aSIlya Dryomov 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2943e93aca0aSIlya Dryomov 					       OBJ_OP_READ, NULL);
29443da691bfSIlya Dryomov 	if (!child_img_req)
29453da691bfSIlya Dryomov 		return -ENOMEM;
29463da691bfSIlya Dryomov 
2947e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2948e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2949e93aca0aSIlya Dryomov 
295021ed05a8SIlya Dryomov 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
295121ed05a8SIlya Dryomov 	     obj_req);
295221ed05a8SIlya Dryomov 
29533da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2954ecc633caSIlya Dryomov 		switch (img_req->data_type) {
29553da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
29565a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
29575a237819SIlya Dryomov 						      obj_req->img_extents,
29585a237819SIlya Dryomov 						      obj_req->num_img_extents,
29593da691bfSIlya Dryomov 						      &obj_req->bio_pos);
29603da691bfSIlya Dryomov 			break;
29613da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2962afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
29635a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
29645a237819SIlya Dryomov 						      obj_req->img_extents,
29655a237819SIlya Dryomov 						      obj_req->num_img_extents,
29663da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
29673da691bfSIlya Dryomov 			break;
29683da691bfSIlya Dryomov 		default:
2969d342a15bSArnd Bergmann 			BUG();
29703da691bfSIlya Dryomov 		}
29713da691bfSIlya Dryomov 	} else {
29725a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
29735a237819SIlya Dryomov 					      obj_req->img_extents,
29745a237819SIlya Dryomov 					      obj_req->num_img_extents,
29755a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
29763da691bfSIlya Dryomov 	}
29773da691bfSIlya Dryomov 	if (ret) {
29783da691bfSIlya Dryomov 		rbd_img_request_put(child_img_req);
2979663ae2ccSIlya Dryomov 		return ret;
2980bf0d5f50SAlex Elder 	}
2981bf0d5f50SAlex Elder 
29820192ce2eSIlya Dryomov 	/* avoid parent chain recursion */
29830192ce2eSIlya Dryomov 	rbd_img_schedule(child_img_req, 0);
29843da691bfSIlya Dryomov 	return 0;
29853da691bfSIlya Dryomov }
29863da691bfSIlya Dryomov 
298785b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
29888b3e1a56SAlex Elder {
29893da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
29903da691bfSIlya Dryomov 	int ret;
29918b3e1a56SAlex Elder 
299222e8bd51SIlya Dryomov again:
2993a9b67e69SIlya Dryomov 	switch (obj_req->read_state) {
299485b5e6d1SIlya Dryomov 	case RBD_OBJ_READ_START:
299585b5e6d1SIlya Dryomov 		rbd_assert(!*result);
299685b5e6d1SIlya Dryomov 
299722e8bd51SIlya Dryomov 		if (!rbd_obj_may_exist(obj_req)) {
299822e8bd51SIlya Dryomov 			*result = -ENOENT;
299922e8bd51SIlya Dryomov 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
300022e8bd51SIlya Dryomov 			goto again;
300122e8bd51SIlya Dryomov 		}
300222e8bd51SIlya Dryomov 
300385b5e6d1SIlya Dryomov 		ret = rbd_obj_read_object(obj_req);
300485b5e6d1SIlya Dryomov 		if (ret) {
300585b5e6d1SIlya Dryomov 			*result = ret;
300685b5e6d1SIlya Dryomov 			return true;
300785b5e6d1SIlya Dryomov 		}
300885b5e6d1SIlya Dryomov 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
300985b5e6d1SIlya Dryomov 		return false;
3010a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_OBJECT:
3011a9b67e69SIlya Dryomov 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
301286bd7998SIlya Dryomov 			/* reverse map this object extent onto the parent */
301386bd7998SIlya Dryomov 			ret = rbd_obj_calc_img_extents(obj_req, false);
301486bd7998SIlya Dryomov 			if (ret) {
301554ab3b24SIlya Dryomov 				*result = ret;
301686bd7998SIlya Dryomov 				return true;
301786bd7998SIlya Dryomov 			}
301886bd7998SIlya Dryomov 			if (obj_req->num_img_extents) {
301986bd7998SIlya Dryomov 				ret = rbd_obj_read_from_parent(obj_req);
30203da691bfSIlya Dryomov 				if (ret) {
302154ab3b24SIlya Dryomov 					*result = ret;
30223da691bfSIlya Dryomov 					return true;
30233da691bfSIlya Dryomov 				}
3024a9b67e69SIlya Dryomov 				obj_req->read_state = RBD_OBJ_READ_PARENT;
30253da691bfSIlya Dryomov 				return false;
30263da691bfSIlya Dryomov 			}
302786bd7998SIlya Dryomov 		}
302802c74fbaSAlex Elder 
302902c74fbaSAlex Elder 		/*
30303da691bfSIlya Dryomov 		 * -ENOENT means a hole in the image -- zero-fill the entire
30313da691bfSIlya Dryomov 		 * length of the request.  A short read also implies zero-fill
303254ab3b24SIlya Dryomov 		 * to the end of the request.
303302c74fbaSAlex Elder 		 */
303454ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
303554ab3b24SIlya Dryomov 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
303654ab3b24SIlya Dryomov 			*result = 0;
303754ab3b24SIlya Dryomov 		} else if (*result >= 0) {
303854ab3b24SIlya Dryomov 			if (*result < obj_req->ex.oe_len)
303954ab3b24SIlya Dryomov 				rbd_obj_zero_range(obj_req, *result,
304054ab3b24SIlya Dryomov 						obj_req->ex.oe_len - *result);
304154ab3b24SIlya Dryomov 			else
304254ab3b24SIlya Dryomov 				rbd_assert(*result == obj_req->ex.oe_len);
304354ab3b24SIlya Dryomov 			*result = 0;
30443da691bfSIlya Dryomov 		}
30453da691bfSIlya Dryomov 		return true;
3046a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_PARENT:
3047d435c9a7SIlya Dryomov 		/*
3048d435c9a7SIlya Dryomov 		 * The parent image is read only up to the overlap -- zero-fill
3049d435c9a7SIlya Dryomov 		 * from the overlap to the end of the request.
3050d435c9a7SIlya Dryomov 		 */
3051d435c9a7SIlya Dryomov 		if (!*result) {
3052d435c9a7SIlya Dryomov 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
3053d435c9a7SIlya Dryomov 
3054d435c9a7SIlya Dryomov 			if (obj_overlap < obj_req->ex.oe_len)
3055d435c9a7SIlya Dryomov 				rbd_obj_zero_range(obj_req, obj_overlap,
3056d435c9a7SIlya Dryomov 					    obj_req->ex.oe_len - obj_overlap);
3057d435c9a7SIlya Dryomov 		}
3058a9b67e69SIlya Dryomov 		return true;
3059a9b67e69SIlya Dryomov 	default:
3060a9b67e69SIlya Dryomov 		BUG();
3061a9b67e69SIlya Dryomov 	}
30623da691bfSIlya Dryomov }
30633da691bfSIlya Dryomov 
306422e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
306522e8bd51SIlya Dryomov {
306622e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
306722e8bd51SIlya Dryomov 
306822e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
306922e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
307022e8bd51SIlya Dryomov 
307122e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
307222e8bd51SIlya Dryomov 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
307322e8bd51SIlya Dryomov 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
30743da691bfSIlya Dryomov 		return true;
30753da691bfSIlya Dryomov 	}
30763da691bfSIlya Dryomov 
307722e8bd51SIlya Dryomov 	return false;
307822e8bd51SIlya Dryomov }
307922e8bd51SIlya Dryomov 
308022e8bd51SIlya Dryomov /*
308122e8bd51SIlya Dryomov  * Return:
308222e8bd51SIlya Dryomov  *   0 - object map update sent
308322e8bd51SIlya Dryomov  *   1 - object map update isn't needed
308422e8bd51SIlya Dryomov  *  <0 - error
308522e8bd51SIlya Dryomov  */
308622e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
308722e8bd51SIlya Dryomov {
308822e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
308922e8bd51SIlya Dryomov 	u8 new_state;
309022e8bd51SIlya Dryomov 
309122e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
309222e8bd51SIlya Dryomov 		return 1;
309322e8bd51SIlya Dryomov 
309422e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
309522e8bd51SIlya Dryomov 		new_state = OBJECT_PENDING;
309622e8bd51SIlya Dryomov 	else
309722e8bd51SIlya Dryomov 		new_state = OBJECT_EXISTS;
309822e8bd51SIlya Dryomov 
309922e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
310022e8bd51SIlya Dryomov }
310122e8bd51SIlya Dryomov 
310285b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
310385b5e6d1SIlya Dryomov {
3104a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
3105a086a1b8SIlya Dryomov 	int num_ops = count_write_ops(obj_req);
3106a086a1b8SIlya Dryomov 	int which = 0;
3107a086a1b8SIlya Dryomov 	int ret;
3108a086a1b8SIlya Dryomov 
3109a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3110a086a1b8SIlya Dryomov 		num_ops++; /* stat */
3111a086a1b8SIlya Dryomov 
3112a086a1b8SIlya Dryomov 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3113a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
3114a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
3115a086a1b8SIlya Dryomov 
3116a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3117a086a1b8SIlya Dryomov 		ret = rbd_osd_setup_stat(osd_req, which++);
3118a086a1b8SIlya Dryomov 		if (ret)
3119a086a1b8SIlya Dryomov 			return ret;
3120a086a1b8SIlya Dryomov 	}
3121a086a1b8SIlya Dryomov 
3122a086a1b8SIlya Dryomov 	rbd_osd_setup_write_ops(osd_req, which);
3123a086a1b8SIlya Dryomov 	rbd_osd_format_write(osd_req);
3124a086a1b8SIlya Dryomov 
3125a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3126a086a1b8SIlya Dryomov 	if (ret)
3127a086a1b8SIlya Dryomov 		return ret;
3128a086a1b8SIlya Dryomov 
3129a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
313085b5e6d1SIlya Dryomov 	return 0;
313185b5e6d1SIlya Dryomov }
313285b5e6d1SIlya Dryomov 
31333da691bfSIlya Dryomov /*
31343da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
31353da691bfSIlya Dryomov  */
31363da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
31373da691bfSIlya Dryomov {
31383da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
31393da691bfSIlya Dryomov 		.bvecs = bvecs,
31403da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
31413da691bfSIlya Dryomov 	};
31423da691bfSIlya Dryomov 
31433da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
31443da691bfSIlya Dryomov 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
31453da691bfSIlya Dryomov 			       bv.bv_len))
31463da691bfSIlya Dryomov 			return false;
31473da691bfSIlya Dryomov 	}));
31483da691bfSIlya Dryomov 	return true;
31493da691bfSIlya Dryomov }
31503da691bfSIlya Dryomov 
31513a482501SIlya Dryomov #define MODS_ONLY	U32_MAX
31523a482501SIlya Dryomov 
3153793333a3SIlya Dryomov static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
315489a59c1cSIlya Dryomov 				      u32 bytes)
31553da691bfSIlya Dryomov {
3156bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
3157fe943d50SChengguang Xu 	int ret;
31583da691bfSIlya Dryomov 
31593da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
316089a59c1cSIlya Dryomov 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
31613da691bfSIlya Dryomov 
3162bcbab1dbSIlya Dryomov 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3163bcbab1dbSIlya Dryomov 	if (IS_ERR(osd_req))
3164bcbab1dbSIlya Dryomov 		return PTR_ERR(osd_req);
31653da691bfSIlya Dryomov 
3166b5ae8cbcSIlya Dryomov 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3167fe943d50SChengguang Xu 	if (ret)
3168fe943d50SChengguang Xu 		return ret;
3169fe943d50SChengguang Xu 
3170bcbab1dbSIlya Dryomov 	rbd_osd_format_write(osd_req);
31713da691bfSIlya Dryomov 
3172bcbab1dbSIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
317389a59c1cSIlya Dryomov 	if (ret)
317489a59c1cSIlya Dryomov 		return ret;
317589a59c1cSIlya Dryomov 
3176a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
317789a59c1cSIlya Dryomov 	return 0;
317889a59c1cSIlya Dryomov }
317989a59c1cSIlya Dryomov 
3180793333a3SIlya Dryomov static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3181793333a3SIlya Dryomov 					u32 bytes)
31823da691bfSIlya Dryomov {
3183bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
3184a086a1b8SIlya Dryomov 	int num_ops = count_write_ops(obj_req);
3185a086a1b8SIlya Dryomov 	int which = 0;
31863da691bfSIlya Dryomov 	int ret;
31873da691bfSIlya Dryomov 
31883da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
31893da691bfSIlya Dryomov 
3190a086a1b8SIlya Dryomov 	if (bytes != MODS_ONLY)
3191a086a1b8SIlya Dryomov 		num_ops++; /* copyup */
319213488d53SIlya Dryomov 
3193a086a1b8SIlya Dryomov 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3194bcbab1dbSIlya Dryomov 	if (IS_ERR(osd_req))
3195bcbab1dbSIlya Dryomov 		return PTR_ERR(osd_req);
31963da691bfSIlya Dryomov 
31973a482501SIlya Dryomov 	if (bytes != MODS_ONLY) {
3198b5ae8cbcSIlya Dryomov 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
31993da691bfSIlya Dryomov 		if (ret)
32003da691bfSIlya Dryomov 			return ret;
32013a482501SIlya Dryomov 	}
32023da691bfSIlya Dryomov 
3203a086a1b8SIlya Dryomov 	rbd_osd_setup_write_ops(osd_req, which);
3204a086a1b8SIlya Dryomov 	rbd_osd_format_write(osd_req);
32053da691bfSIlya Dryomov 
3206bcbab1dbSIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
32073da691bfSIlya Dryomov 	if (ret)
32083da691bfSIlya Dryomov 		return ret;
32093da691bfSIlya Dryomov 
3210a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
32113da691bfSIlya Dryomov 	return 0;
32123da691bfSIlya Dryomov }
32133da691bfSIlya Dryomov 
32147e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
32157e07efb1SIlya Dryomov {
32167e07efb1SIlya Dryomov 	u32 i;
32177e07efb1SIlya Dryomov 
32187e07efb1SIlya Dryomov 	rbd_assert(!obj_req->copyup_bvecs);
32197e07efb1SIlya Dryomov 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
32207e07efb1SIlya Dryomov 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
32217e07efb1SIlya Dryomov 					sizeof(*obj_req->copyup_bvecs),
32227e07efb1SIlya Dryomov 					GFP_NOIO);
32237e07efb1SIlya Dryomov 	if (!obj_req->copyup_bvecs)
32247e07efb1SIlya Dryomov 		return -ENOMEM;
32257e07efb1SIlya Dryomov 
32267e07efb1SIlya Dryomov 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
32277e07efb1SIlya Dryomov 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
32287e07efb1SIlya Dryomov 
32297e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
32307e07efb1SIlya Dryomov 		if (!obj_req->copyup_bvecs[i].bv_page)
32317e07efb1SIlya Dryomov 			return -ENOMEM;
32327e07efb1SIlya Dryomov 
32337e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_offset = 0;
32347e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_len = len;
32357e07efb1SIlya Dryomov 		obj_overlap -= len;
32367e07efb1SIlya Dryomov 	}
32377e07efb1SIlya Dryomov 
32387e07efb1SIlya Dryomov 	rbd_assert(!obj_overlap);
32397e07efb1SIlya Dryomov 	return 0;
32407e07efb1SIlya Dryomov }
32417e07efb1SIlya Dryomov 
32420ad5d953SIlya Dryomov /*
32430ad5d953SIlya Dryomov  * The target object doesn't exist.  Read the data for the entire
32440ad5d953SIlya Dryomov  * target object up to the overlap point (if any) from the parent,
32450ad5d953SIlya Dryomov  * so we can use it for a copyup.
32460ad5d953SIlya Dryomov  */
3247793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
32483da691bfSIlya Dryomov {
32493da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
32503da691bfSIlya Dryomov 	int ret;
32513da691bfSIlya Dryomov 
325286bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
325386bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
325486bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
325586bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
32563da691bfSIlya Dryomov 		/*
32573da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
32583a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
32593a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
32603a482501SIlya Dryomov 		 * anymore.
32613da691bfSIlya Dryomov 		 */
3262793333a3SIlya Dryomov 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
32633da691bfSIlya Dryomov 	}
32643da691bfSIlya Dryomov 
326586bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
32663da691bfSIlya Dryomov 	if (ret)
32673da691bfSIlya Dryomov 		return ret;
32683da691bfSIlya Dryomov 
326986bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
32703da691bfSIlya Dryomov }
32713da691bfSIlya Dryomov 
327222e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
32733da691bfSIlya Dryomov {
327422e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
327522e8bd51SIlya Dryomov 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
327622e8bd51SIlya Dryomov 	u8 new_state;
327722e8bd51SIlya Dryomov 	u32 i;
32783da691bfSIlya Dryomov 	int ret;
32793da691bfSIlya Dryomov 
328022e8bd51SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
32813da691bfSIlya Dryomov 
328222e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
328322e8bd51SIlya Dryomov 		return;
328489a59c1cSIlya Dryomov 
328522e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
328622e8bd51SIlya Dryomov 		return;
32873da691bfSIlya Dryomov 
328822e8bd51SIlya Dryomov 	for (i = 0; i < snapc->num_snaps; i++) {
328922e8bd51SIlya Dryomov 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
329022e8bd51SIlya Dryomov 		    i + 1 < snapc->num_snaps)
329122e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS_CLEAN;
329222e8bd51SIlya Dryomov 		else
329322e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS;
32943da691bfSIlya Dryomov 
329522e8bd51SIlya Dryomov 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
329622e8bd51SIlya Dryomov 					    new_state, NULL);
329722e8bd51SIlya Dryomov 		if (ret < 0) {
329822e8bd51SIlya Dryomov 			obj_req->pending.result = ret;
329902c74fbaSAlex Elder 			return;
330002c74fbaSAlex Elder 		}
330102c74fbaSAlex Elder 
330222e8bd51SIlya Dryomov 		rbd_assert(!ret);
330322e8bd51SIlya Dryomov 		obj_req->pending.num_pending++;
3304a9e8ba2cSAlex Elder 	}
33058b3e1a56SAlex Elder }
33068b3e1a56SAlex Elder 
3307793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
33088b3e1a56SAlex Elder {
3309793333a3SIlya Dryomov 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3310793333a3SIlya Dryomov 	int ret;
33118b3e1a56SAlex Elder 
3312793333a3SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
33138b3e1a56SAlex Elder 
3314793333a3SIlya Dryomov 	/*
3315793333a3SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
3316793333a3SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
3317793333a3SIlya Dryomov 	 * existing.
3318793333a3SIlya Dryomov 	 */
3319793333a3SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3320793333a3SIlya Dryomov 		bytes = 0;
3321793333a3SIlya Dryomov 
3322793333a3SIlya Dryomov 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3323793333a3SIlya Dryomov 		/*
3324793333a3SIlya Dryomov 		 * Send a copyup request with an empty snapshot context to
3325793333a3SIlya Dryomov 		 * deep-copyup the object through all existing snapshots.
3326793333a3SIlya Dryomov 		 * A second request with the current snapshot context will be
3327793333a3SIlya Dryomov 		 * sent for the actual modification.
3328793333a3SIlya Dryomov 		 */
3329793333a3SIlya Dryomov 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3330793333a3SIlya Dryomov 		if (ret) {
3331793333a3SIlya Dryomov 			obj_req->pending.result = ret;
3332793333a3SIlya Dryomov 			return;
33337114edacSIlya Dryomov 		}
33348b3e1a56SAlex Elder 
3335793333a3SIlya Dryomov 		obj_req->pending.num_pending++;
3336793333a3SIlya Dryomov 		bytes = MODS_ONLY;
33373da691bfSIlya Dryomov 	}
33388b3e1a56SAlex Elder 
3339793333a3SIlya Dryomov 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3340793333a3SIlya Dryomov 	if (ret) {
3341793333a3SIlya Dryomov 		obj_req->pending.result = ret;
3342793333a3SIlya Dryomov 		return;
3343793333a3SIlya Dryomov 	}
3344793333a3SIlya Dryomov 
3345793333a3SIlya Dryomov 	obj_req->pending.num_pending++;
3346793333a3SIlya Dryomov }
3347793333a3SIlya Dryomov 
3348793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
33493da691bfSIlya Dryomov {
335022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3351793333a3SIlya Dryomov 	int ret;
33527114edacSIlya Dryomov 
33537114edacSIlya Dryomov again:
3354793333a3SIlya Dryomov 	switch (obj_req->copyup_state) {
3355793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_START:
3356793333a3SIlya Dryomov 		rbd_assert(!*result);
33573da691bfSIlya Dryomov 
3358793333a3SIlya Dryomov 		ret = rbd_obj_copyup_read_parent(obj_req);
3359793333a3SIlya Dryomov 		if (ret) {
3360793333a3SIlya Dryomov 			*result = ret;
3361793333a3SIlya Dryomov 			return true;
3362793333a3SIlya Dryomov 		}
3363793333a3SIlya Dryomov 		if (obj_req->num_img_extents)
3364793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3365793333a3SIlya Dryomov 		else
3366793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3367793333a3SIlya Dryomov 		return false;
3368793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_READ_PARENT:
3369793333a3SIlya Dryomov 		if (*result)
3370793333a3SIlya Dryomov 			return true;
3371793333a3SIlya Dryomov 
3372793333a3SIlya Dryomov 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3373793333a3SIlya Dryomov 				  rbd_obj_img_extents_bytes(obj_req))) {
3374793333a3SIlya Dryomov 			dout("%s %p detected zeros\n", __func__, obj_req);
3375793333a3SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
33767114edacSIlya Dryomov 		}
33777114edacSIlya Dryomov 
337822e8bd51SIlya Dryomov 		rbd_obj_copyup_object_maps(obj_req);
337922e8bd51SIlya Dryomov 		if (!obj_req->pending.num_pending) {
338022e8bd51SIlya Dryomov 			*result = obj_req->pending.result;
338122e8bd51SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
33827114edacSIlya Dryomov 			goto again;
33837114edacSIlya Dryomov 		}
338422e8bd51SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
338522e8bd51SIlya Dryomov 		return false;
338622e8bd51SIlya Dryomov 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
338722e8bd51SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
338822e8bd51SIlya Dryomov 			return false;
338922e8bd51SIlya Dryomov 		/* fall through */
339022e8bd51SIlya Dryomov 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
339122e8bd51SIlya Dryomov 		if (*result) {
339222e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "snap object map update failed: %d",
339322e8bd51SIlya Dryomov 				 *result);
339422e8bd51SIlya Dryomov 			return true;
339522e8bd51SIlya Dryomov 		}
339622e8bd51SIlya Dryomov 
3397793333a3SIlya Dryomov 		rbd_obj_copyup_write_object(obj_req);
3398793333a3SIlya Dryomov 		if (!obj_req->pending.num_pending) {
3399793333a3SIlya Dryomov 			*result = obj_req->pending.result;
3400793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3401793333a3SIlya Dryomov 			goto again;
3402793333a3SIlya Dryomov 		}
3403793333a3SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3404793333a3SIlya Dryomov 		return false;
3405793333a3SIlya Dryomov 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3406793333a3SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
3407793333a3SIlya Dryomov 			return false;
3408793333a3SIlya Dryomov 		/* fall through */
3409793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3410793333a3SIlya Dryomov 		return true;
3411793333a3SIlya Dryomov 	default:
3412793333a3SIlya Dryomov 		BUG();
3413793333a3SIlya Dryomov 	}
3414793333a3SIlya Dryomov }
3415793333a3SIlya Dryomov 
341622e8bd51SIlya Dryomov /*
341722e8bd51SIlya Dryomov  * Return:
341822e8bd51SIlya Dryomov  *   0 - object map update sent
341922e8bd51SIlya Dryomov  *   1 - object map update isn't needed
342022e8bd51SIlya Dryomov  *  <0 - error
342122e8bd51SIlya Dryomov  */
342222e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
342322e8bd51SIlya Dryomov {
342422e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
342522e8bd51SIlya Dryomov 	u8 current_state = OBJECT_PENDING;
342622e8bd51SIlya Dryomov 
342722e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
342822e8bd51SIlya Dryomov 		return 1;
342922e8bd51SIlya Dryomov 
343022e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
343122e8bd51SIlya Dryomov 		return 1;
343222e8bd51SIlya Dryomov 
343322e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
343422e8bd51SIlya Dryomov 				     &current_state);
343522e8bd51SIlya Dryomov }
343622e8bd51SIlya Dryomov 
343785b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3438b8d70035SAlex Elder {
3439793333a3SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3440b8d70035SAlex Elder 	int ret;
3441b8d70035SAlex Elder 
3442793333a3SIlya Dryomov again:
3443cf81b60eSAlex Elder 	switch (obj_req->write_state) {
344485b5e6d1SIlya Dryomov 	case RBD_OBJ_WRITE_START:
344585b5e6d1SIlya Dryomov 		rbd_assert(!*result);
344685b5e6d1SIlya Dryomov 
344722e8bd51SIlya Dryomov 		if (rbd_obj_write_is_noop(obj_req))
344822e8bd51SIlya Dryomov 			return true;
344922e8bd51SIlya Dryomov 
345022e8bd51SIlya Dryomov 		ret = rbd_obj_write_pre_object_map(obj_req);
345122e8bd51SIlya Dryomov 		if (ret < 0) {
345222e8bd51SIlya Dryomov 			*result = ret;
345322e8bd51SIlya Dryomov 			return true;
345422e8bd51SIlya Dryomov 		}
345522e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
345622e8bd51SIlya Dryomov 		if (ret > 0)
345722e8bd51SIlya Dryomov 			goto again;
345822e8bd51SIlya Dryomov 		return false;
345922e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
346022e8bd51SIlya Dryomov 		if (*result) {
346122e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "pre object map update failed: %d",
346222e8bd51SIlya Dryomov 				 *result);
346322e8bd51SIlya Dryomov 			return true;
346422e8bd51SIlya Dryomov 		}
346585b5e6d1SIlya Dryomov 		ret = rbd_obj_write_object(obj_req);
346685b5e6d1SIlya Dryomov 		if (ret) {
346785b5e6d1SIlya Dryomov 			*result = ret;
346885b5e6d1SIlya Dryomov 			return true;
346985b5e6d1SIlya Dryomov 		}
347085b5e6d1SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
347185b5e6d1SIlya Dryomov 		return false;
34720ad5d953SIlya Dryomov 	case RBD_OBJ_WRITE_OBJECT:
347354ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
34740ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3475793333a3SIlya Dryomov 				*result = 0;
3476793333a3SIlya Dryomov 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3477793333a3SIlya Dryomov 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3478793333a3SIlya Dryomov 				goto again;
3479b8d70035SAlex Elder 			}
34800ad5d953SIlya Dryomov 			/*
34810ad5d953SIlya Dryomov 			 * On a non-existent object:
34820ad5d953SIlya Dryomov 			 *   delete - -ENOENT, truncate/zero - 0
34830ad5d953SIlya Dryomov 			 */
34840ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
34850ad5d953SIlya Dryomov 				*result = 0;
34860ad5d953SIlya Dryomov 		}
3487793333a3SIlya Dryomov 		if (*result)
3488793333a3SIlya Dryomov 			return true;
3489793333a3SIlya Dryomov 
3490793333a3SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3491793333a3SIlya Dryomov 		goto again;
3492793333a3SIlya Dryomov 	case __RBD_OBJ_WRITE_COPYUP:
3493793333a3SIlya Dryomov 		if (!rbd_obj_advance_copyup(obj_req, result))
3494793333a3SIlya Dryomov 			return false;
34959969ebc5SAlex Elder 		/* fall through */
3496793333a3SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP:
349722e8bd51SIlya Dryomov 		if (*result) {
3498793333a3SIlya Dryomov 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3499bf0d5f50SAlex Elder 			return true;
350022e8bd51SIlya Dryomov 		}
350122e8bd51SIlya Dryomov 		ret = rbd_obj_write_post_object_map(obj_req);
350222e8bd51SIlya Dryomov 		if (ret < 0) {
350322e8bd51SIlya Dryomov 			*result = ret;
350422e8bd51SIlya Dryomov 			return true;
350522e8bd51SIlya Dryomov 		}
350622e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
350722e8bd51SIlya Dryomov 		if (ret > 0)
350822e8bd51SIlya Dryomov 			goto again;
350922e8bd51SIlya Dryomov 		return false;
351022e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
351122e8bd51SIlya Dryomov 		if (*result)
351222e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "post object map update failed: %d",
351322e8bd51SIlya Dryomov 				 *result);
351422e8bd51SIlya Dryomov 		return true;
3515bf0d5f50SAlex Elder 	default:
3516bf0d5f50SAlex Elder 		BUG();
3517bf0d5f50SAlex Elder 	}
3518bf0d5f50SAlex Elder }
3519bf0d5f50SAlex Elder 
3520bf0d5f50SAlex Elder /*
35210ad5d953SIlya Dryomov  * Return true if @obj_req is completed.
3522bf0d5f50SAlex Elder  */
352354ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
352454ab3b24SIlya Dryomov 				     int *result)
3525bf0d5f50SAlex Elder {
35260ad5d953SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
35270192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
35280ad5d953SIlya Dryomov 	bool done;
35290ad5d953SIlya Dryomov 
353085b5e6d1SIlya Dryomov 	mutex_lock(&obj_req->state_mutex);
35310ad5d953SIlya Dryomov 	if (!rbd_img_is_write(img_req))
353285b5e6d1SIlya Dryomov 		done = rbd_obj_advance_read(obj_req, result);
35330ad5d953SIlya Dryomov 	else
353485b5e6d1SIlya Dryomov 		done = rbd_obj_advance_write(obj_req, result);
353585b5e6d1SIlya Dryomov 	mutex_unlock(&obj_req->state_mutex);
35360ad5d953SIlya Dryomov 
35370192ce2eSIlya Dryomov 	if (done && *result) {
35380192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
35390192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
35400192ce2eSIlya Dryomov 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
35410192ce2eSIlya Dryomov 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
35420192ce2eSIlya Dryomov 	}
35430ad5d953SIlya Dryomov 	return done;
35449969ebc5SAlex Elder }
35459969ebc5SAlex Elder 
35460192ce2eSIlya Dryomov /*
35470192ce2eSIlya Dryomov  * This is open-coded in rbd_img_handle_request() to avoid parent chain
35480192ce2eSIlya Dryomov  * recursion.
35490192ce2eSIlya Dryomov  */
355054ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
35519969ebc5SAlex Elder {
35520192ce2eSIlya Dryomov 	if (__rbd_obj_handle_request(obj_req, &result))
35530192ce2eSIlya Dryomov 		rbd_img_handle_request(obj_req->img_request, result);
35547114edacSIlya Dryomov }
35557114edacSIlya Dryomov 
3556e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req)
3557e1fddc8fSIlya Dryomov {
3558e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3559e1fddc8fSIlya Dryomov 
3560e1fddc8fSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3561e1fddc8fSIlya Dryomov 		return false;
3562e1fddc8fSIlya Dryomov 
3563f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev))
3564e1fddc8fSIlya Dryomov 		return false;
3565e1fddc8fSIlya Dryomov 
3566e1fddc8fSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
356722e8bd51SIlya Dryomov 	if (rbd_dev->opts->lock_on_read ||
356822e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3569e1fddc8fSIlya Dryomov 		return true;
3570e1fddc8fSIlya Dryomov 
3571e1fddc8fSIlya Dryomov 	return rbd_img_is_write(img_req);
3572e1fddc8fSIlya Dryomov }
3573e1fddc8fSIlya Dryomov 
3574637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3575e1fddc8fSIlya Dryomov {
3576e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3577637cd060SIlya Dryomov 	bool locked;
3578e1fddc8fSIlya Dryomov 
3579e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3580637cd060SIlya Dryomov 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3581e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3582e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&img_req->lock_item));
3583637cd060SIlya Dryomov 	if (!locked)
3584637cd060SIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3585637cd060SIlya Dryomov 	else
3586e1fddc8fSIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3587e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3588637cd060SIlya Dryomov 	return locked;
3589e1fddc8fSIlya Dryomov }
3590e1fddc8fSIlya Dryomov 
3591e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req)
3592e1fddc8fSIlya Dryomov {
3593e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3594e1fddc8fSIlya Dryomov 	bool need_wakeup;
3595e1fddc8fSIlya Dryomov 
3596e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3597e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3598e1fddc8fSIlya Dryomov 	rbd_assert(!list_empty(&img_req->lock_item));
3599e1fddc8fSIlya Dryomov 	list_del_init(&img_req->lock_item);
3600e1fddc8fSIlya Dryomov 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3601e1fddc8fSIlya Dryomov 		       list_empty(&rbd_dev->running_list));
3602e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3603e1fddc8fSIlya Dryomov 	if (need_wakeup)
3604e1fddc8fSIlya Dryomov 		complete(&rbd_dev->releasing_wait);
3605e1fddc8fSIlya Dryomov }
3606e1fddc8fSIlya Dryomov 
3607637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3608637cd060SIlya Dryomov {
3609637cd060SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3610637cd060SIlya Dryomov 
3611637cd060SIlya Dryomov 	if (!need_exclusive_lock(img_req))
3612637cd060SIlya Dryomov 		return 1;
3613637cd060SIlya Dryomov 
3614637cd060SIlya Dryomov 	if (rbd_lock_add_request(img_req))
3615637cd060SIlya Dryomov 		return 1;
3616637cd060SIlya Dryomov 
3617637cd060SIlya Dryomov 	if (rbd_dev->opts->exclusive) {
3618637cd060SIlya Dryomov 		WARN_ON(1); /* lock got released? */
3619637cd060SIlya Dryomov 		return -EROFS;
3620637cd060SIlya Dryomov 	}
3621637cd060SIlya Dryomov 
3622637cd060SIlya Dryomov 	/*
3623637cd060SIlya Dryomov 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3624637cd060SIlya Dryomov 	 * and cancel_delayed_work() in wake_lock_waiters().
3625637cd060SIlya Dryomov 	 */
3626637cd060SIlya Dryomov 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3627637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3628637cd060SIlya Dryomov 	return 0;
3629637cd060SIlya Dryomov }
3630637cd060SIlya Dryomov 
36310192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req)
36320192ce2eSIlya Dryomov {
36330192ce2eSIlya Dryomov 	struct rbd_obj_request *obj_req;
36340192ce2eSIlya Dryomov 
36350192ce2eSIlya Dryomov 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
36360192ce2eSIlya Dryomov 
36370192ce2eSIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
36380192ce2eSIlya Dryomov 		int result = 0;
36390192ce2eSIlya Dryomov 
36400192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
36410192ce2eSIlya Dryomov 			if (result) {
36420192ce2eSIlya Dryomov 				img_req->pending.result = result;
36430192ce2eSIlya Dryomov 				return;
36440192ce2eSIlya Dryomov 			}
36450192ce2eSIlya Dryomov 		} else {
36460192ce2eSIlya Dryomov 			img_req->pending.num_pending++;
36470192ce2eSIlya Dryomov 		}
36480192ce2eSIlya Dryomov 	}
36490192ce2eSIlya Dryomov }
36500192ce2eSIlya Dryomov 
36510192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
36520192ce2eSIlya Dryomov {
3653637cd060SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3654637cd060SIlya Dryomov 	int ret;
3655637cd060SIlya Dryomov 
36560192ce2eSIlya Dryomov again:
36570192ce2eSIlya Dryomov 	switch (img_req->state) {
36580192ce2eSIlya Dryomov 	case RBD_IMG_START:
36590192ce2eSIlya Dryomov 		rbd_assert(!*result);
36600192ce2eSIlya Dryomov 
3661637cd060SIlya Dryomov 		ret = rbd_img_exclusive_lock(img_req);
3662637cd060SIlya Dryomov 		if (ret < 0) {
3663637cd060SIlya Dryomov 			*result = ret;
3664637cd060SIlya Dryomov 			return true;
3665637cd060SIlya Dryomov 		}
3666637cd060SIlya Dryomov 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3667637cd060SIlya Dryomov 		if (ret > 0)
3668637cd060SIlya Dryomov 			goto again;
3669637cd060SIlya Dryomov 		return false;
3670637cd060SIlya Dryomov 	case RBD_IMG_EXCLUSIVE_LOCK:
3671637cd060SIlya Dryomov 		if (*result)
3672637cd060SIlya Dryomov 			return true;
3673637cd060SIlya Dryomov 
3674637cd060SIlya Dryomov 		rbd_assert(!need_exclusive_lock(img_req) ||
3675637cd060SIlya Dryomov 			   __rbd_is_lock_owner(rbd_dev));
3676637cd060SIlya Dryomov 
36770192ce2eSIlya Dryomov 		rbd_img_object_requests(img_req);
36780192ce2eSIlya Dryomov 		if (!img_req->pending.num_pending) {
36790192ce2eSIlya Dryomov 			*result = img_req->pending.result;
36800192ce2eSIlya Dryomov 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
36817114edacSIlya Dryomov 			goto again;
36827114edacSIlya Dryomov 		}
36830192ce2eSIlya Dryomov 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
36840192ce2eSIlya Dryomov 		return false;
36850192ce2eSIlya Dryomov 	case __RBD_IMG_OBJECT_REQUESTS:
36860192ce2eSIlya Dryomov 		if (!pending_result_dec(&img_req->pending, result))
36870192ce2eSIlya Dryomov 			return false;
36880192ce2eSIlya Dryomov 		/* fall through */
36890192ce2eSIlya Dryomov 	case RBD_IMG_OBJECT_REQUESTS:
36900192ce2eSIlya Dryomov 		return true;
36910192ce2eSIlya Dryomov 	default:
36920192ce2eSIlya Dryomov 		BUG();
36930192ce2eSIlya Dryomov 	}
36940192ce2eSIlya Dryomov }
36950192ce2eSIlya Dryomov 
36960192ce2eSIlya Dryomov /*
36970192ce2eSIlya Dryomov  * Return true if @img_req is completed.
36980192ce2eSIlya Dryomov  */
36990192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
37000192ce2eSIlya Dryomov 				     int *result)
37010192ce2eSIlya Dryomov {
37020192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
37030192ce2eSIlya Dryomov 	bool done;
37040192ce2eSIlya Dryomov 
3705e1fddc8fSIlya Dryomov 	if (need_exclusive_lock(img_req)) {
3706e1fddc8fSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3707e1fddc8fSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3708e1fddc8fSIlya Dryomov 		done = rbd_img_advance(img_req, result);
3709e1fddc8fSIlya Dryomov 		if (done)
3710e1fddc8fSIlya Dryomov 			rbd_lock_del_request(img_req);
3711e1fddc8fSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3712e1fddc8fSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3713e1fddc8fSIlya Dryomov 	} else {
37140192ce2eSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
37150192ce2eSIlya Dryomov 		done = rbd_img_advance(img_req, result);
37160192ce2eSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3717e1fddc8fSIlya Dryomov 	}
37180192ce2eSIlya Dryomov 
37190192ce2eSIlya Dryomov 	if (done && *result) {
37200192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
37210192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s%s result %d",
37220192ce2eSIlya Dryomov 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
37230192ce2eSIlya Dryomov 		      obj_op_name(img_req->op_type), *result);
37240192ce2eSIlya Dryomov 	}
37250192ce2eSIlya Dryomov 	return done;
37260192ce2eSIlya Dryomov }
37270192ce2eSIlya Dryomov 
37280192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
37290192ce2eSIlya Dryomov {
37300192ce2eSIlya Dryomov again:
37310192ce2eSIlya Dryomov 	if (!__rbd_img_handle_request(img_req, &result))
37320192ce2eSIlya Dryomov 		return;
37330192ce2eSIlya Dryomov 
37340192ce2eSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
37350192ce2eSIlya Dryomov 		struct rbd_obj_request *obj_req = img_req->obj_request;
37360192ce2eSIlya Dryomov 
37370192ce2eSIlya Dryomov 		rbd_img_request_put(img_req);
37380192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
37390192ce2eSIlya Dryomov 			img_req = obj_req->img_request;
37400192ce2eSIlya Dryomov 			goto again;
37410192ce2eSIlya Dryomov 		}
37420192ce2eSIlya Dryomov 	} else {
37430192ce2eSIlya Dryomov 		struct request *rq = img_req->rq;
37440192ce2eSIlya Dryomov 
37450192ce2eSIlya Dryomov 		rbd_img_request_put(img_req);
37460192ce2eSIlya Dryomov 		blk_mq_end_request(rq, errno_to_blk_status(result));
37470192ce2eSIlya Dryomov 	}
37489969ebc5SAlex Elder }
37499969ebc5SAlex Elder 
3750ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3751ed95b21aSIlya Dryomov 
3752ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3753ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3754ed95b21aSIlya Dryomov {
3755ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3756ed95b21aSIlya Dryomov }
3757ed95b21aSIlya Dryomov 
3758ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3759ed95b21aSIlya Dryomov {
3760ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3761ed95b21aSIlya Dryomov 
3762ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3763ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3764ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3765ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3766ed95b21aSIlya Dryomov 	return cid;
3767ed95b21aSIlya Dryomov }
3768ed95b21aSIlya Dryomov 
3769ed95b21aSIlya Dryomov /*
3770ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3771ed95b21aSIlya Dryomov  */
3772ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3773ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3774ed95b21aSIlya Dryomov {
3775ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3776ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3777ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3778ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3779ed95b21aSIlya Dryomov }
3780ed95b21aSIlya Dryomov 
3781ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3782ed95b21aSIlya Dryomov {
3783ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3784ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3785ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3786ed95b21aSIlya Dryomov }
3787ed95b21aSIlya Dryomov 
3788edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3789edd8ca80SFlorian Margaine {
3790edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3791edd8ca80SFlorian Margaine 
3792a2b1da09SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3793edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
3794edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
3795edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3796edd8ca80SFlorian Margaine }
3797edd8ca80SFlorian Margaine 
3798ed95b21aSIlya Dryomov /*
3799ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3800ed95b21aSIlya Dryomov  */
3801ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3802ed95b21aSIlya Dryomov {
3803ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3804ed95b21aSIlya Dryomov 	char cookie[32];
3805ed95b21aSIlya Dryomov 	int ret;
3806ed95b21aSIlya Dryomov 
3807cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3808cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
3809ed95b21aSIlya Dryomov 
3810ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3811ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3812ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3813ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3814ed95b21aSIlya Dryomov 	if (ret)
3815ed95b21aSIlya Dryomov 		return ret;
3816ed95b21aSIlya Dryomov 
3817edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
3818ed95b21aSIlya Dryomov 	return 0;
3819ed95b21aSIlya Dryomov }
3820ed95b21aSIlya Dryomov 
3821ed95b21aSIlya Dryomov /*
3822ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3823ed95b21aSIlya Dryomov  */
3824bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
3825ed95b21aSIlya Dryomov {
3826ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3827ed95b21aSIlya Dryomov 	int ret;
3828ed95b21aSIlya Dryomov 
3829cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3830cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
3831ed95b21aSIlya Dryomov 
3832ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3833cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3834bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
3835637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3836ed95b21aSIlya Dryomov 
3837bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
3838bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3839cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
3840ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3841ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3842ed95b21aSIlya Dryomov }
3843ed95b21aSIlya Dryomov 
3844ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3845ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3846ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3847ed95b21aSIlya Dryomov 				size_t *preply_len)
3848ed95b21aSIlya Dryomov {
3849ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3850ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
385108a79102SKyle Spiers 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
385208a79102SKyle Spiers 	int buf_size = sizeof(buf);
3853ed95b21aSIlya Dryomov 	void *p = buf;
3854ed95b21aSIlya Dryomov 
3855ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3856ed95b21aSIlya Dryomov 
3857ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3858ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3859ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3860ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3861ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3862ed95b21aSIlya Dryomov 
3863ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3864ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3865ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3866ed95b21aSIlya Dryomov }
3867ed95b21aSIlya Dryomov 
3868ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3869ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3870ed95b21aSIlya Dryomov {
3871ed95b21aSIlya Dryomov 	struct page **reply_pages;
3872ed95b21aSIlya Dryomov 	size_t reply_len;
3873ed95b21aSIlya Dryomov 
3874ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3875ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3876ed95b21aSIlya Dryomov }
3877ed95b21aSIlya Dryomov 
3878ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3879ed95b21aSIlya Dryomov {
3880ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3881ed95b21aSIlya Dryomov 						  acquired_lock_work);
3882ed95b21aSIlya Dryomov 
3883ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3884ed95b21aSIlya Dryomov }
3885ed95b21aSIlya Dryomov 
3886ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3887ed95b21aSIlya Dryomov {
3888ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3889ed95b21aSIlya Dryomov 						  released_lock_work);
3890ed95b21aSIlya Dryomov 
3891ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3892ed95b21aSIlya Dryomov }
3893ed95b21aSIlya Dryomov 
3894ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3895ed95b21aSIlya Dryomov {
3896ed95b21aSIlya Dryomov 	struct page **reply_pages;
3897ed95b21aSIlya Dryomov 	size_t reply_len;
3898ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3899ed95b21aSIlya Dryomov 	int ret;
3900ed95b21aSIlya Dryomov 
3901ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3902ed95b21aSIlya Dryomov 
3903ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3904ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3905ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3906ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3907ed95b21aSIlya Dryomov 		goto out;
3908ed95b21aSIlya Dryomov 	}
3909ed95b21aSIlya Dryomov 
3910ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3911ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3912ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3913ed95b21aSIlya Dryomov 		u32 n;
3914ed95b21aSIlya Dryomov 
3915ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3916ed95b21aSIlya Dryomov 		while (n--) {
3917ed95b21aSIlya Dryomov 			u8 struct_v;
3918ed95b21aSIlya Dryomov 			u32 len;
3919ed95b21aSIlya Dryomov 
3920ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3921ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3922ed95b21aSIlya Dryomov 
3923ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3924ed95b21aSIlya Dryomov 			if (!len)
3925ed95b21aSIlya Dryomov 				continue;
3926ed95b21aSIlya Dryomov 
3927ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3928ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3929ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3930ed95b21aSIlya Dryomov 				ret = -EIO;
3931ed95b21aSIlya Dryomov 				goto out;
3932ed95b21aSIlya Dryomov 			}
3933ed95b21aSIlya Dryomov 
3934ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3935ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3936ed95b21aSIlya Dryomov 						  &struct_v, &len);
3937ed95b21aSIlya Dryomov 			if (ret) {
3938ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3939ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3940ed95b21aSIlya Dryomov 					 ret);
3941ed95b21aSIlya Dryomov 				goto e_inval;
3942ed95b21aSIlya Dryomov 			}
3943ed95b21aSIlya Dryomov 
3944ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3945ed95b21aSIlya Dryomov 		}
3946ed95b21aSIlya Dryomov 	}
3947ed95b21aSIlya Dryomov 
3948ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3949ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3950ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3951ed95b21aSIlya Dryomov 	}
3952ed95b21aSIlya Dryomov 
3953ed95b21aSIlya Dryomov out:
3954ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3955ed95b21aSIlya Dryomov 	return ret;
3956ed95b21aSIlya Dryomov 
3957ed95b21aSIlya Dryomov e_inval:
3958ed95b21aSIlya Dryomov 	ret = -EINVAL;
3959ed95b21aSIlya Dryomov 	goto out;
3960ed95b21aSIlya Dryomov }
3961ed95b21aSIlya Dryomov 
3962637cd060SIlya Dryomov /*
3963637cd060SIlya Dryomov  * Either image request state machine(s) or rbd_add_acquire_lock()
3964637cd060SIlya Dryomov  * (i.e. "rbd map").
3965637cd060SIlya Dryomov  */
3966637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3967ed95b21aSIlya Dryomov {
3968637cd060SIlya Dryomov 	struct rbd_img_request *img_req;
3969637cd060SIlya Dryomov 
3970637cd060SIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3971d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3972ed95b21aSIlya Dryomov 
3973ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3974637cd060SIlya Dryomov 	if (!completion_done(&rbd_dev->acquire_wait)) {
3975637cd060SIlya Dryomov 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3976637cd060SIlya Dryomov 			   list_empty(&rbd_dev->running_list));
3977637cd060SIlya Dryomov 		rbd_dev->acquire_err = result;
3978637cd060SIlya Dryomov 		complete_all(&rbd_dev->acquire_wait);
3979637cd060SIlya Dryomov 		return;
3980637cd060SIlya Dryomov 	}
3981637cd060SIlya Dryomov 
3982637cd060SIlya Dryomov 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3983637cd060SIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3984637cd060SIlya Dryomov 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3985637cd060SIlya Dryomov 		rbd_img_schedule(img_req, result);
3986637cd060SIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3987637cd060SIlya Dryomov 	}
3988637cd060SIlya Dryomov 
3989637cd060SIlya Dryomov 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3990ed95b21aSIlya Dryomov }
3991ed95b21aSIlya Dryomov 
3992ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3993ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3994ed95b21aSIlya Dryomov {
3995ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3996ed95b21aSIlya Dryomov 	u8 lock_type;
3997ed95b21aSIlya Dryomov 	char *lock_tag;
3998ed95b21aSIlya Dryomov 	int ret;
3999ed95b21aSIlya Dryomov 
4000ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4001ed95b21aSIlya Dryomov 
4002ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
4003ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4004ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
4005ed95b21aSIlya Dryomov 	if (ret)
4006ed95b21aSIlya Dryomov 		return ret;
4007ed95b21aSIlya Dryomov 
4008ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
4009ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
4010ed95b21aSIlya Dryomov 		goto out;
4011ed95b21aSIlya Dryomov 	}
4012ed95b21aSIlya Dryomov 
4013ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
4014ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
4015ed95b21aSIlya Dryomov 			 lock_tag);
4016ed95b21aSIlya Dryomov 		ret = -EBUSY;
4017ed95b21aSIlya Dryomov 		goto out;
4018ed95b21aSIlya Dryomov 	}
4019ed95b21aSIlya Dryomov 
4020ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
4021ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
4022ed95b21aSIlya Dryomov 		ret = -EBUSY;
4023ed95b21aSIlya Dryomov 		goto out;
4024ed95b21aSIlya Dryomov 	}
4025ed95b21aSIlya Dryomov 
4026ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
4027ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
4028ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
4029ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
4030ed95b21aSIlya Dryomov 		ret = -EBUSY;
4031ed95b21aSIlya Dryomov 		goto out;
4032ed95b21aSIlya Dryomov 	}
4033ed95b21aSIlya Dryomov 
4034ed95b21aSIlya Dryomov out:
4035ed95b21aSIlya Dryomov 	kfree(lock_tag);
4036ed95b21aSIlya Dryomov 	return ret;
4037ed95b21aSIlya Dryomov }
4038ed95b21aSIlya Dryomov 
4039ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
4040ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
4041ed95b21aSIlya Dryomov {
4042ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4043ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
4044ed95b21aSIlya Dryomov 	u32 num_watchers;
4045ed95b21aSIlya Dryomov 	u64 cookie;
4046ed95b21aSIlya Dryomov 	int i;
4047ed95b21aSIlya Dryomov 	int ret;
4048ed95b21aSIlya Dryomov 
4049ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
4050ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
4051ed95b21aSIlya Dryomov 				      &num_watchers);
4052ed95b21aSIlya Dryomov 	if (ret)
4053ed95b21aSIlya Dryomov 		return ret;
4054ed95b21aSIlya Dryomov 
4055ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
4056ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
4057ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
4058ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
4059ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
4060ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
4061ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
4062ed95b21aSIlya Dryomov 				.handle = cookie,
4063ed95b21aSIlya Dryomov 			};
4064ed95b21aSIlya Dryomov 
4065ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
4066ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
4067ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
4068ed95b21aSIlya Dryomov 			ret = 1;
4069ed95b21aSIlya Dryomov 			goto out;
4070ed95b21aSIlya Dryomov 		}
4071ed95b21aSIlya Dryomov 	}
4072ed95b21aSIlya Dryomov 
4073ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
4074ed95b21aSIlya Dryomov 	ret = 0;
4075ed95b21aSIlya Dryomov out:
4076ed95b21aSIlya Dryomov 	kfree(watchers);
4077ed95b21aSIlya Dryomov 	return ret;
4078ed95b21aSIlya Dryomov }
4079ed95b21aSIlya Dryomov 
4080ed95b21aSIlya Dryomov /*
4081ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
4082ed95b21aSIlya Dryomov  */
4083ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
4084ed95b21aSIlya Dryomov {
4085ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
4086ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
4087ed95b21aSIlya Dryomov 	u32 num_lockers;
4088ed95b21aSIlya Dryomov 	int ret;
4089ed95b21aSIlya Dryomov 
4090ed95b21aSIlya Dryomov 	for (;;) {
4091ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
4092ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
4093ed95b21aSIlya Dryomov 			return ret;
4094ed95b21aSIlya Dryomov 
4095ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
4096ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4097ed95b21aSIlya Dryomov 		if (ret)
4098ed95b21aSIlya Dryomov 			return ret;
4099ed95b21aSIlya Dryomov 
4100ed95b21aSIlya Dryomov 		if (num_lockers == 0)
4101ed95b21aSIlya Dryomov 			goto again;
4102ed95b21aSIlya Dryomov 
4103ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
4104637cd060SIlya Dryomov 		if (ret)
4105637cd060SIlya Dryomov 			goto out; /* request lock or error */
4106ed95b21aSIlya Dryomov 
410722e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4108ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
4109ed95b21aSIlya Dryomov 
4110ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
4111ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
4112ed95b21aSIlya Dryomov 		if (ret) {
4113ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4114ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
4115ed95b21aSIlya Dryomov 			goto out;
4116ed95b21aSIlya Dryomov 		}
4117ed95b21aSIlya Dryomov 
4118ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4119ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4120ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
4121ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
4122ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
4123ed95b21aSIlya Dryomov 			goto out;
4124ed95b21aSIlya Dryomov 
4125ed95b21aSIlya Dryomov again:
4126ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
4127ed95b21aSIlya Dryomov 	}
4128ed95b21aSIlya Dryomov 
4129ed95b21aSIlya Dryomov out:
4130ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
4131ed95b21aSIlya Dryomov 	return ret;
4132ed95b21aSIlya Dryomov }
4133ed95b21aSIlya Dryomov 
413422e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4135ed95b21aSIlya Dryomov {
413622e8bd51SIlya Dryomov 	int ret;
413722e8bd51SIlya Dryomov 
413822e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
413922e8bd51SIlya Dryomov 		ret = rbd_object_map_open(rbd_dev);
414022e8bd51SIlya Dryomov 		if (ret)
414122e8bd51SIlya Dryomov 			return ret;
414222e8bd51SIlya Dryomov 	}
414322e8bd51SIlya Dryomov 
414422e8bd51SIlya Dryomov 	return 0;
414522e8bd51SIlya Dryomov }
414622e8bd51SIlya Dryomov 
4147ed95b21aSIlya Dryomov /*
4148637cd060SIlya Dryomov  * Return:
4149637cd060SIlya Dryomov  *   0 - lock acquired
4150637cd060SIlya Dryomov  *   1 - caller should call rbd_request_lock()
4151637cd060SIlya Dryomov  *  <0 - error
4152ed95b21aSIlya Dryomov  */
4153637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4154ed95b21aSIlya Dryomov {
4155637cd060SIlya Dryomov 	int ret;
4156ed95b21aSIlya Dryomov 
4157ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
4158ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4159ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4160ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4161ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4162637cd060SIlya Dryomov 		return 0;
4163ed95b21aSIlya Dryomov 	}
4164ed95b21aSIlya Dryomov 
4165ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4166ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4167ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4168ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4169637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4170637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4171637cd060SIlya Dryomov 		return 0;
4172ed95b21aSIlya Dryomov 	}
4173ed95b21aSIlya Dryomov 
4174637cd060SIlya Dryomov 	ret = rbd_try_lock(rbd_dev);
4175637cd060SIlya Dryomov 	if (ret < 0) {
4176637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4177637cd060SIlya Dryomov 		if (ret == -EBLACKLISTED)
4178637cd060SIlya Dryomov 			goto out;
4179637cd060SIlya Dryomov 
4180637cd060SIlya Dryomov 		ret = 1; /* request lock anyway */
4181637cd060SIlya Dryomov 	}
4182637cd060SIlya Dryomov 	if (ret > 0) {
4183ed95b21aSIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4184637cd060SIlya Dryomov 		return ret;
4185637cd060SIlya Dryomov 	}
4186637cd060SIlya Dryomov 
4187637cd060SIlya Dryomov 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4188637cd060SIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4189637cd060SIlya Dryomov 
419022e8bd51SIlya Dryomov 	ret = rbd_post_acquire_action(rbd_dev);
419122e8bd51SIlya Dryomov 	if (ret) {
419222e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
419322e8bd51SIlya Dryomov 		/*
419422e8bd51SIlya Dryomov 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
419522e8bd51SIlya Dryomov 		 * rbd_lock_add_request() would let the request through,
419622e8bd51SIlya Dryomov 		 * assuming that e.g. object map is locked and loaded.
419722e8bd51SIlya Dryomov 		 */
419822e8bd51SIlya Dryomov 		rbd_unlock(rbd_dev);
419922e8bd51SIlya Dryomov 	}
420022e8bd51SIlya Dryomov 
4201637cd060SIlya Dryomov out:
4202637cd060SIlya Dryomov 	wake_lock_waiters(rbd_dev, ret);
4203637cd060SIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4204637cd060SIlya Dryomov 	return ret;
4205ed95b21aSIlya Dryomov }
4206ed95b21aSIlya Dryomov 
4207ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
4208ed95b21aSIlya Dryomov {
4209ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4210ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
4211637cd060SIlya Dryomov 	int ret;
4212ed95b21aSIlya Dryomov 
4213ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4214ed95b21aSIlya Dryomov again:
4215637cd060SIlya Dryomov 	ret = rbd_try_acquire_lock(rbd_dev);
4216637cd060SIlya Dryomov 	if (ret <= 0) {
4217637cd060SIlya Dryomov 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4218ed95b21aSIlya Dryomov 		return;
4219ed95b21aSIlya Dryomov 	}
4220ed95b21aSIlya Dryomov 
4221ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
4222ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
4223ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
4224e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
4225e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
4226637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4227637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4228637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4229ed95b21aSIlya Dryomov 	} else if (ret < 0) {
4230ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4231ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4232ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
4233ed95b21aSIlya Dryomov 	} else {
4234ed95b21aSIlya Dryomov 		/*
4235ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
4236ed95b21aSIlya Dryomov 		 * release the lock
4237ed95b21aSIlya Dryomov 		 */
42386b0a8774SColin Ian King 		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4239ed95b21aSIlya Dryomov 		     rbd_dev);
4240ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4241ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4242ed95b21aSIlya Dryomov 	}
4243ed95b21aSIlya Dryomov }
4244ed95b21aSIlya Dryomov 
4245a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4246ed95b21aSIlya Dryomov {
4247e1fddc8fSIlya Dryomov 	bool need_wait;
4248e1fddc8fSIlya Dryomov 
4249a2b1da09SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4250d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4251a2b1da09SIlya Dryomov 
4252ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4253ed95b21aSIlya Dryomov 		return false;
4254ed95b21aSIlya Dryomov 
4255ed95b21aSIlya Dryomov 	/*
4256ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
4257ed95b21aSIlya Dryomov 	 */
4258e1fddc8fSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4259e1fddc8fSIlya Dryomov 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4260e1fddc8fSIlya Dryomov 	need_wait = !list_empty(&rbd_dev->running_list);
4261e1fddc8fSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
4262e1fddc8fSIlya Dryomov 	if (need_wait)
4263e1fddc8fSIlya Dryomov 		wait_for_completion(&rbd_dev->releasing_wait);
4264ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4265ed95b21aSIlya Dryomov 
4266ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4267ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4268ed95b21aSIlya Dryomov 		return false;
4269ed95b21aSIlya Dryomov 
4270e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4271a2b1da09SIlya Dryomov 	return true;
4272a2b1da09SIlya Dryomov }
4273a2b1da09SIlya Dryomov 
427422e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev)
427522e8bd51SIlya Dryomov {
427622e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
427722e8bd51SIlya Dryomov 		rbd_object_map_close(rbd_dev);
427822e8bd51SIlya Dryomov }
427922e8bd51SIlya Dryomov 
4280e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev)
4281e1fddc8fSIlya Dryomov {
4282e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4283e1fddc8fSIlya Dryomov 
428422e8bd51SIlya Dryomov 	rbd_pre_release_action(rbd_dev);
4285bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
4286e1fddc8fSIlya Dryomov }
4287e1fddc8fSIlya Dryomov 
4288a2b1da09SIlya Dryomov /*
4289a2b1da09SIlya Dryomov  * lock_rwsem must be held for write
4290a2b1da09SIlya Dryomov  */
4291a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev)
4292a2b1da09SIlya Dryomov {
4293a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4294a2b1da09SIlya Dryomov 		return;
4295a2b1da09SIlya Dryomov 
4296e1fddc8fSIlya Dryomov 	__rbd_release_lock(rbd_dev);
4297a2b1da09SIlya Dryomov 
4298ed95b21aSIlya Dryomov 	/*
4299ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
4300637cd060SIlya Dryomov 	 * almost immediately if we got new IO while draining the running
4301637cd060SIlya Dryomov 	 * list otherwise.  We need to ack our own notifications, so this
4302637cd060SIlya Dryomov 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4303637cd060SIlya Dryomov 	 * way of maybe_kick_acquire().
4304ed95b21aSIlya Dryomov 	 */
4305ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
4306ed95b21aSIlya Dryomov }
4307ed95b21aSIlya Dryomov 
4308ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
4309ed95b21aSIlya Dryomov {
4310ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4311ed95b21aSIlya Dryomov 						  unlock_work);
4312ed95b21aSIlya Dryomov 
4313ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4314ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
4315ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4316ed95b21aSIlya Dryomov }
4317ed95b21aSIlya Dryomov 
4318637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4319637cd060SIlya Dryomov {
4320637cd060SIlya Dryomov 	bool have_requests;
4321637cd060SIlya Dryomov 
4322637cd060SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4323637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
4324637cd060SIlya Dryomov 		return;
4325637cd060SIlya Dryomov 
4326637cd060SIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
4327637cd060SIlya Dryomov 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4328637cd060SIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
4329637cd060SIlya Dryomov 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4330637cd060SIlya Dryomov 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4331637cd060SIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4332637cd060SIlya Dryomov 	}
4333637cd060SIlya Dryomov }
4334637cd060SIlya Dryomov 
4335ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4336ed95b21aSIlya Dryomov 				     void **p)
4337ed95b21aSIlya Dryomov {
4338ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
4339ed95b21aSIlya Dryomov 
4340ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4341ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4342ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4343ed95b21aSIlya Dryomov 	}
4344ed95b21aSIlya Dryomov 
4345ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4346ed95b21aSIlya Dryomov 	     cid.handle);
4347ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4348ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4349ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4350ed95b21aSIlya Dryomov 			/*
4351ed95b21aSIlya Dryomov 			 * we already know that the remote client is
4352ed95b21aSIlya Dryomov 			 * the owner
4353ed95b21aSIlya Dryomov 			 */
4354ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
4355ed95b21aSIlya Dryomov 			return;
4356ed95b21aSIlya Dryomov 		}
4357ed95b21aSIlya Dryomov 
4358ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
4359ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
4360ed95b21aSIlya Dryomov 	} else {
4361ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4362ed95b21aSIlya Dryomov 	}
4363ed95b21aSIlya Dryomov 
4364637cd060SIlya Dryomov 	maybe_kick_acquire(rbd_dev);
4365ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4366ed95b21aSIlya Dryomov }
4367ed95b21aSIlya Dryomov 
4368ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4369ed95b21aSIlya Dryomov 				     void **p)
4370ed95b21aSIlya Dryomov {
4371ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
4372ed95b21aSIlya Dryomov 
4373ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4374ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4375ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4376ed95b21aSIlya Dryomov 	}
4377ed95b21aSIlya Dryomov 
4378ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4379ed95b21aSIlya Dryomov 	     cid.handle);
4380ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4381ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4382ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4383ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4384ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
4385ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4386ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
4387ed95b21aSIlya Dryomov 			return;
4388ed95b21aSIlya Dryomov 		}
4389ed95b21aSIlya Dryomov 
4390ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4391ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
4392ed95b21aSIlya Dryomov 	} else {
4393ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4394ed95b21aSIlya Dryomov 	}
4395ed95b21aSIlya Dryomov 
4396637cd060SIlya Dryomov 	maybe_kick_acquire(rbd_dev);
4397ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4398ed95b21aSIlya Dryomov }
4399ed95b21aSIlya Dryomov 
44003b77faa0SIlya Dryomov /*
44013b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
44023b77faa0SIlya Dryomov  * ResponseMessage is needed.
44033b77faa0SIlya Dryomov  */
44043b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4405ed95b21aSIlya Dryomov 				   void **p)
4406ed95b21aSIlya Dryomov {
4407ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4408ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
44093b77faa0SIlya Dryomov 	int result = 1;
4410ed95b21aSIlya Dryomov 
4411ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4412ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4413ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4414ed95b21aSIlya Dryomov 	}
4415ed95b21aSIlya Dryomov 
4416ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4417ed95b21aSIlya Dryomov 	     cid.handle);
4418ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
44193b77faa0SIlya Dryomov 		return result;
4420ed95b21aSIlya Dryomov 
4421ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
44223b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
44233b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
44243b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
44253b77faa0SIlya Dryomov 			goto out_unlock;
44263b77faa0SIlya Dryomov 
44273b77faa0SIlya Dryomov 		/*
44283b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
44293b77faa0SIlya Dryomov 		 * a missing owner
44303b77faa0SIlya Dryomov 		 */
44313b77faa0SIlya Dryomov 		result = 0;
44323b77faa0SIlya Dryomov 
4433ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4434e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
4435e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
4436e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
4437e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
4438e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
4439e010dd0aSIlya Dryomov 			} else {
4440e010dd0aSIlya Dryomov 				/* refuse to release the lock */
4441e010dd0aSIlya Dryomov 				result = -EROFS;
4442ed95b21aSIlya Dryomov 			}
4443ed95b21aSIlya Dryomov 		}
4444ed95b21aSIlya Dryomov 	}
44453b77faa0SIlya Dryomov 
44463b77faa0SIlya Dryomov out_unlock:
4447ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
44483b77faa0SIlya Dryomov 	return result;
4449ed95b21aSIlya Dryomov }
4450ed95b21aSIlya Dryomov 
4451ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4452ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
4453ed95b21aSIlya Dryomov {
4454ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
445508a79102SKyle Spiers 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
445608a79102SKyle Spiers 	int buf_size = sizeof(buf);
4457ed95b21aSIlya Dryomov 	int ret;
4458ed95b21aSIlya Dryomov 
4459ed95b21aSIlya Dryomov 	if (result) {
4460ed95b21aSIlya Dryomov 		void *p = buf;
4461ed95b21aSIlya Dryomov 
4462ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
4463ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
4464ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4465ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
4466ed95b21aSIlya Dryomov 	} else {
4467ed95b21aSIlya Dryomov 		buf_size = 0;
4468ed95b21aSIlya Dryomov 	}
4469ed95b21aSIlya Dryomov 
4470ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4471ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
4472ed95b21aSIlya Dryomov 				   buf, buf_size);
4473ed95b21aSIlya Dryomov 	if (ret)
4474ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4475ed95b21aSIlya Dryomov }
4476ed95b21aSIlya Dryomov 
4477ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4478ed95b21aSIlya Dryomov 				   u64 cookie)
4479ed95b21aSIlya Dryomov {
4480ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4481ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4482ed95b21aSIlya Dryomov }
4483ed95b21aSIlya Dryomov 
4484ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4485ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
4486ed95b21aSIlya Dryomov {
4487ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4488ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4489ed95b21aSIlya Dryomov }
4490922dab61SIlya Dryomov 
4491922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4492922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
4493bf0d5f50SAlex Elder {
4494922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
4495ed95b21aSIlya Dryomov 	void *p = data;
4496ed95b21aSIlya Dryomov 	void *const end = p + data_len;
4497d4c2269bSIlya Dryomov 	u8 struct_v = 0;
4498ed95b21aSIlya Dryomov 	u32 len;
4499ed95b21aSIlya Dryomov 	u32 notify_op;
4500bf0d5f50SAlex Elder 	int ret;
4501bf0d5f50SAlex Elder 
4502ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4503ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
4504ed95b21aSIlya Dryomov 	if (data_len) {
4505ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4506ed95b21aSIlya Dryomov 					  &struct_v, &len);
4507ed95b21aSIlya Dryomov 		if (ret) {
4508ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4509ed95b21aSIlya Dryomov 				 ret);
4510ed95b21aSIlya Dryomov 			return;
4511ed95b21aSIlya Dryomov 		}
451252bb1f9bSIlya Dryomov 
4513ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
4514ed95b21aSIlya Dryomov 	} else {
4515ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
4516ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4517ed95b21aSIlya Dryomov 		len = 0;
4518ed95b21aSIlya Dryomov 	}
4519ed95b21aSIlya Dryomov 
4520ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4521ed95b21aSIlya Dryomov 	switch (notify_op) {
4522ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4523ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4524ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4525ed95b21aSIlya Dryomov 		break;
4526ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4527ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4528ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4529ed95b21aSIlya Dryomov 		break;
4530ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
45313b77faa0SIlya Dryomov 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
45323b77faa0SIlya Dryomov 		if (ret <= 0)
4533ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
45343b77faa0SIlya Dryomov 						      cookie, ret);
4535ed95b21aSIlya Dryomov 		else
4536ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4537ed95b21aSIlya Dryomov 		break;
4538ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4539e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
4540e627db08SAlex Elder 		if (ret)
45419584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4542bf0d5f50SAlex Elder 
4543ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4544ed95b21aSIlya Dryomov 		break;
4545ed95b21aSIlya Dryomov 	default:
4546ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
4547ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4548ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
4549ed95b21aSIlya Dryomov 		else
4550ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4551ed95b21aSIlya Dryomov 		break;
45529969ebc5SAlex Elder 	}
45539969ebc5SAlex Elder }
45549969ebc5SAlex Elder 
455599d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
45569969ebc5SAlex Elder 
4557922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4558bb040aa0SIlya Dryomov {
4559922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
4560bb040aa0SIlya Dryomov 
4561922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4562bb040aa0SIlya Dryomov 
4563ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4564ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4565ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4566bb040aa0SIlya Dryomov 
456799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
456899d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
456999d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
457099d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4571bb040aa0SIlya Dryomov 
457299d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4573bb040aa0SIlya Dryomov 	}
457499d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
4575bb040aa0SIlya Dryomov }
4576bb040aa0SIlya Dryomov 
4577bb040aa0SIlya Dryomov /*
457899d16943SIlya Dryomov  * watch_mutex must be locked
45799969ebc5SAlex Elder  */
458099d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
45819969ebc5SAlex Elder {
45829969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4583922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
45849969ebc5SAlex Elder 
4585922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
458699d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
45879969ebc5SAlex Elder 
4588922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4589922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
4590922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
4591922dab61SIlya Dryomov 	if (IS_ERR(handle))
4592922dab61SIlya Dryomov 		return PTR_ERR(handle);
45939969ebc5SAlex Elder 
4594922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
45958eb87565SAlex Elder 	return 0;
45969969ebc5SAlex Elder }
45979969ebc5SAlex Elder 
459899d16943SIlya Dryomov /*
459999d16943SIlya Dryomov  * watch_mutex must be locked
460099d16943SIlya Dryomov  */
460199d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4602fca27065SIlya Dryomov {
4603922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4604922dab61SIlya Dryomov 	int ret;
4605b30a01f2SIlya Dryomov 
460699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
460799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4608b30a01f2SIlya Dryomov 
4609922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4610922dab61SIlya Dryomov 	if (ret)
4611922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4612b30a01f2SIlya Dryomov 
4613922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
4614c525f036SIlya Dryomov }
4615c525f036SIlya Dryomov 
461699d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
4617c525f036SIlya Dryomov {
461899d16943SIlya Dryomov 	int ret;
4619811c6688SIlya Dryomov 
462099d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
462199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
462299d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
462399d16943SIlya Dryomov 	if (ret)
462499d16943SIlya Dryomov 		goto out;
462599d16943SIlya Dryomov 
462699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
462799d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
462899d16943SIlya Dryomov 
462999d16943SIlya Dryomov out:
463099d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
463199d16943SIlya Dryomov 	return ret;
463299d16943SIlya Dryomov }
463399d16943SIlya Dryomov 
463499d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
463599d16943SIlya Dryomov {
463699d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
463799d16943SIlya Dryomov 
4638ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4639ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
4640ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4641ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
464299d16943SIlya Dryomov }
464399d16943SIlya Dryomov 
464499d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
464599d16943SIlya Dryomov {
464699d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
464799d16943SIlya Dryomov 
464899d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
464999d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
465099d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
465199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
465299d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
465399d16943SIlya Dryomov 
465423edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4655811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4656fca27065SIlya Dryomov }
4657fca27065SIlya Dryomov 
465814bb211dSIlya Dryomov /*
465914bb211dSIlya Dryomov  * lock_rwsem must be held for write
466014bb211dSIlya Dryomov  */
466114bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
466214bb211dSIlya Dryomov {
466314bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
466414bb211dSIlya Dryomov 	char cookie[32];
466514bb211dSIlya Dryomov 	int ret;
466614bb211dSIlya Dryomov 
4667a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4668a2b1da09SIlya Dryomov 		return;
466914bb211dSIlya Dryomov 
467014bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
467114bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
467214bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
467314bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
467414bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
467514bb211dSIlya Dryomov 	if (ret) {
467614bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
467714bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
467814bb211dSIlya Dryomov 				 ret);
467914bb211dSIlya Dryomov 
468014bb211dSIlya Dryomov 		/*
468114bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
468214bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
468314bb211dSIlya Dryomov 		 */
4684e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
4685a2b1da09SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
468614bb211dSIlya Dryomov 	} else {
4687edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
4688637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, 0);
468914bb211dSIlya Dryomov 	}
469014bb211dSIlya Dryomov }
469114bb211dSIlya Dryomov 
469299d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
469399d16943SIlya Dryomov {
469499d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
469599d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
469699d16943SIlya Dryomov 	int ret;
469799d16943SIlya Dryomov 
469899d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
469999d16943SIlya Dryomov 
470099d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
470187c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
470287c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
470314bb211dSIlya Dryomov 		return;
470487c0fdedSIlya Dryomov 	}
470599d16943SIlya Dryomov 
470699d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
470799d16943SIlya Dryomov 	if (ret) {
470899d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4709637cd060SIlya Dryomov 		if (ret != -EBLACKLISTED && ret != -ENOENT) {
471099d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
471199d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
471299d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
471387c0fdedSIlya Dryomov 			mutex_unlock(&rbd_dev->watch_mutex);
471414bb211dSIlya Dryomov 			return;
471599d16943SIlya Dryomov 		}
471699d16943SIlya Dryomov 
4717637cd060SIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
4718637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4719637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4720637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4721637cd060SIlya Dryomov 		return;
4722637cd060SIlya Dryomov 	}
4723637cd060SIlya Dryomov 
472499d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
472599d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
472699d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
472799d16943SIlya Dryomov 
472814bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
472914bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
473014bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
473114bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
473214bb211dSIlya Dryomov 
473399d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
473499d16943SIlya Dryomov 	if (ret)
4735f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
473699d16943SIlya Dryomov }
473799d16943SIlya Dryomov 
473836be9a76SAlex Elder /*
4739f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
4740f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
474136be9a76SAlex Elder  */
474236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4743ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
4744ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
474536be9a76SAlex Elder 			     const char *method_name,
47464157976bSAlex Elder 			     const void *outbound,
474736be9a76SAlex Elder 			     size_t outbound_size,
47484157976bSAlex Elder 			     void *inbound,
4749e2a58ee5SAlex Elder 			     size_t inbound_size)
475036be9a76SAlex Elder {
4751ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4752ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
4753ecd4a68aSIlya Dryomov 	struct page *reply_page;
475436be9a76SAlex Elder 	int ret;
475536be9a76SAlex Elder 
475636be9a76SAlex Elder 	/*
47576010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
47586010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
47596010a451SAlex Elder 	 * also supply outbound data--parameters for the object
47606010a451SAlex Elder 	 * method.  Currently if this is present it will be a
47616010a451SAlex Elder 	 * snapshot id.
476236be9a76SAlex Elder 	 */
4763ecd4a68aSIlya Dryomov 	if (outbound) {
4764ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
4765ecd4a68aSIlya Dryomov 			return -E2BIG;
476636be9a76SAlex Elder 
4767ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
4768ecd4a68aSIlya Dryomov 		if (!req_page)
4769ecd4a68aSIlya Dryomov 			return -ENOMEM;
477036be9a76SAlex Elder 
4771ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
477204017e29SAlex Elder 	}
4773430c28c3SAlex Elder 
4774ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4775ecd4a68aSIlya Dryomov 	if (!reply_page) {
4776ecd4a68aSIlya Dryomov 		if (req_page)
4777ecd4a68aSIlya Dryomov 			__free_page(req_page);
4778ecd4a68aSIlya Dryomov 		return -ENOMEM;
4779ecd4a68aSIlya Dryomov 	}
478036be9a76SAlex Elder 
4781ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4782ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
478368ada915SIlya Dryomov 			     &reply_page, &inbound_size);
4784ecd4a68aSIlya Dryomov 	if (!ret) {
4785ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
4786ecd4a68aSIlya Dryomov 		ret = inbound_size;
4787ecd4a68aSIlya Dryomov 	}
478857385b51SAlex Elder 
4789ecd4a68aSIlya Dryomov 	if (req_page)
4790ecd4a68aSIlya Dryomov 		__free_page(req_page);
4791ecd4a68aSIlya Dryomov 	__free_page(reply_page);
479236be9a76SAlex Elder 	return ret;
479336be9a76SAlex Elder }
479436be9a76SAlex Elder 
47957ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4796bc1ecc65SIlya Dryomov {
47977ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
47987ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
4799bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
48004e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
4801bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4802bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
48036d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
48044e752f0aSJosh Durgin 	u64 mapping_size;
4805bc1ecc65SIlya Dryomov 	int result;
4806bc1ecc65SIlya Dryomov 
4807aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
4808aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
4809aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
4810aebf526bSChristoph Hellwig 		break;
48116484cbe9SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
48126484cbe9SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
48136484cbe9SIlya Dryomov 		break;
4814aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
4815aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
4816aebf526bSChristoph Hellwig 		break;
4817aebf526bSChristoph Hellwig 	case REQ_OP_READ:
4818aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
4819aebf526bSChristoph Hellwig 		break;
4820aebf526bSChristoph Hellwig 	default:
4821aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
48227ad18afaSChristoph Hellwig 		result = -EIO;
48237ad18afaSChristoph Hellwig 		goto err;
48247ad18afaSChristoph Hellwig 	}
48257ad18afaSChristoph Hellwig 
4826bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4827bc1ecc65SIlya Dryomov 
4828bc1ecc65SIlya Dryomov 	if (!length) {
4829bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4830bc1ecc65SIlya Dryomov 		result = 0;
4831bc1ecc65SIlya Dryomov 		goto err_rq;
4832bc1ecc65SIlya Dryomov 	}
4833bc1ecc65SIlya Dryomov 
4834f3c0e459SIlya Dryomov 	if (op_type != OBJ_OP_READ && rbd_is_snap(rbd_dev)) {
4835b91a7bdcSIlya Dryomov 		rbd_warn(rbd_dev, "%s on read-only snapshot",
4836b91a7bdcSIlya Dryomov 			 obj_op_name(op_type));
4837b91a7bdcSIlya Dryomov 		result = -EIO;
4838b91a7bdcSIlya Dryomov 		goto err;
4839b91a7bdcSIlya Dryomov 	}
4840bc1ecc65SIlya Dryomov 
4841bc1ecc65SIlya Dryomov 	/*
4842bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4843bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4844bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4845bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4846bc1ecc65SIlya Dryomov 	 */
4847bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4848bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4849f3c0e459SIlya Dryomov 		rbd_assert(rbd_is_snap(rbd_dev));
4850bc1ecc65SIlya Dryomov 		result = -ENXIO;
4851bc1ecc65SIlya Dryomov 		goto err_rq;
4852bc1ecc65SIlya Dryomov 	}
4853bc1ecc65SIlya Dryomov 
4854bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4855bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4856bc1ecc65SIlya Dryomov 			 length);
4857bc1ecc65SIlya Dryomov 		result = -EINVAL;
4858bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4859bc1ecc65SIlya Dryomov 	}
4860bc1ecc65SIlya Dryomov 
48617ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
48627ad18afaSChristoph Hellwig 
48634e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
48644e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
48656d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
48664e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
48674e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
48684e752f0aSJosh Durgin 	}
48694e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
48704e752f0aSJosh Durgin 
48714e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4872bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
48734e752f0aSJosh Durgin 			 length, mapping_size);
4874bc1ecc65SIlya Dryomov 		result = -EIO;
4875bc1ecc65SIlya Dryomov 		goto err_rq;
4876bc1ecc65SIlya Dryomov 	}
4877bc1ecc65SIlya Dryomov 
4878dfd9875fSIlya Dryomov 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4879bc1ecc65SIlya Dryomov 	if (!img_request) {
4880bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4881637cd060SIlya Dryomov 		goto err_rq;
4882bc1ecc65SIlya Dryomov 	}
4883bc1ecc65SIlya Dryomov 	img_request->rq = rq;
488470b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4885bc1ecc65SIlya Dryomov 
488621ed05a8SIlya Dryomov 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
488721ed05a8SIlya Dryomov 	     img_request, obj_op_name(op_type), offset, length);
488821ed05a8SIlya Dryomov 
48896484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
48905a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
489190e98c52SGuangliang Zhao 	else
48925a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
489390e98c52SGuangliang Zhao 					       rq->bio);
48940192ce2eSIlya Dryomov 	if (result)
4895bc1ecc65SIlya Dryomov 		goto err_img_request;
4896bc1ecc65SIlya Dryomov 
4897e1fddc8fSIlya Dryomov 	rbd_img_handle_request(img_request, 0);
4898bc1ecc65SIlya Dryomov 	return;
4899bc1ecc65SIlya Dryomov 
4900bc1ecc65SIlya Dryomov err_img_request:
4901bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4902bc1ecc65SIlya Dryomov err_rq:
4903bc1ecc65SIlya Dryomov 	if (result)
4904bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
49056d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
49064e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
49077ad18afaSChristoph Hellwig err:
49082a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
4909bc1ecc65SIlya Dryomov }
4910bc1ecc65SIlya Dryomov 
4911fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
49127ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4913bc1ecc65SIlya Dryomov {
49147ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
49157ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4916bc1ecc65SIlya Dryomov 
49177ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
4918fc17b653SChristoph Hellwig 	return BLK_STS_OK;
4919bf0d5f50SAlex Elder }
4920bf0d5f50SAlex Elder 
4921602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4922602adf40SYehuda Sadeh {
49235769ed0cSIlya Dryomov 	blk_cleanup_queue(rbd_dev->disk->queue);
49247ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
49255769ed0cSIlya Dryomov 	put_disk(rbd_dev->disk);
49265769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
4927602adf40SYehuda Sadeh }
4928602adf40SYehuda Sadeh 
4929788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4930fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4931fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4932fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4933788e2df3SAlex Elder 
4934788e2df3SAlex Elder {
4935fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4936fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4937fe5478e0SIlya Dryomov 	struct page **pages;
4938fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4939788e2df3SAlex Elder 	int ret;
4940788e2df3SAlex Elder 
4941fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4942fe5478e0SIlya Dryomov 	if (!req)
4943fe5478e0SIlya Dryomov 		return -ENOMEM;
4944788e2df3SAlex Elder 
4945fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4946fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4947fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4948788e2df3SAlex Elder 
4949fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4950fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4951fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4952fe5478e0SIlya Dryomov 		goto out_req;
4953fe5478e0SIlya Dryomov 	}
49541ceae7efSAlex Elder 
4955fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4956fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4957fe5478e0SIlya Dryomov 					 true);
4958788e2df3SAlex Elder 
495926f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
496026f887e0SIlya Dryomov 	if (ret)
496126f887e0SIlya Dryomov 		goto out_req;
496226f887e0SIlya Dryomov 
4963fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4964fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4965fe5478e0SIlya Dryomov 	if (ret >= 0)
4966fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4967fe5478e0SIlya Dryomov 
4968fe5478e0SIlya Dryomov out_req:
4969fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4970788e2df3SAlex Elder 	return ret;
4971788e2df3SAlex Elder }
4972788e2df3SAlex Elder 
4973602adf40SYehuda Sadeh /*
4974662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4975662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4976662518b1SAlex Elder  * information about the image.
49774156d998SAlex Elder  */
497899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
49794156d998SAlex Elder {
49804156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
49814156d998SAlex Elder 	u32 snap_count = 0;
49824156d998SAlex Elder 	u64 names_size = 0;
49834156d998SAlex Elder 	u32 want_count;
49844156d998SAlex Elder 	int ret;
49854156d998SAlex Elder 
49864156d998SAlex Elder 	/*
49874156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
49884156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
49894156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
49904156d998SAlex Elder 	 * the number of snapshots could change by the time we read
49914156d998SAlex Elder 	 * it in, in which case we re-read it.
49924156d998SAlex Elder 	 */
49934156d998SAlex Elder 	do {
49944156d998SAlex Elder 		size_t size;
49954156d998SAlex Elder 
49964156d998SAlex Elder 		kfree(ondisk);
49974156d998SAlex Elder 
49984156d998SAlex Elder 		size = sizeof (*ondisk);
49994156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
50004156d998SAlex Elder 		size += names_size;
50014156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
50024156d998SAlex Elder 		if (!ondisk)
5003662518b1SAlex Elder 			return -ENOMEM;
50044156d998SAlex Elder 
5005fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
5006fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
50074156d998SAlex Elder 		if (ret < 0)
5008662518b1SAlex Elder 			goto out;
5009c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
50104156d998SAlex Elder 			ret = -ENXIO;
501106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
501206ecc6cbSAlex Elder 				size, ret);
5013662518b1SAlex Elder 			goto out;
50144156d998SAlex Elder 		}
50154156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
50164156d998SAlex Elder 			ret = -ENXIO;
501706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
5018662518b1SAlex Elder 			goto out;
50194156d998SAlex Elder 		}
50204156d998SAlex Elder 
50214156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
50224156d998SAlex Elder 		want_count = snap_count;
50234156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
50244156d998SAlex Elder 	} while (snap_count != want_count);
50254156d998SAlex Elder 
5026662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
5027662518b1SAlex Elder out:
50284156d998SAlex Elder 	kfree(ondisk);
50294156d998SAlex Elder 
5030dfc5606dSYehuda Sadeh 	return ret;
5031602adf40SYehuda Sadeh }
5032602adf40SYehuda Sadeh 
503315228edeSAlex Elder /*
503415228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
503515228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
503615228edeSAlex Elder  */
503715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
503815228edeSAlex Elder {
503915228edeSAlex Elder 	u64 snap_id;
504015228edeSAlex Elder 
504115228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
504215228edeSAlex Elder 		return;
504315228edeSAlex Elder 
504415228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
504515228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
504615228edeSAlex Elder 		return;
504715228edeSAlex Elder 
504815228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
504915228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
505015228edeSAlex Elder }
505115228edeSAlex Elder 
50529875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
50539875201eSJosh Durgin {
50549875201eSJosh Durgin 	sector_t size;
50559875201eSJosh Durgin 
50569875201eSJosh Durgin 	/*
5057811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
5058811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
5059811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
50609875201eSJosh Durgin 	 */
5061811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
5062811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
50639875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
50649875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
50659875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
50669875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
50679875201eSJosh Durgin 	}
50689875201eSJosh Durgin }
50699875201eSJosh Durgin 
5070cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
50711fe5e993SAlex Elder {
5072e627db08SAlex Elder 	u64 mapping_size;
50731fe5e993SAlex Elder 	int ret;
50741fe5e993SAlex Elder 
5075cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
50763b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
5077a720ae09SIlya Dryomov 
5078a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
507952bb1f9bSIlya Dryomov 	if (ret)
508073e39e4dSIlya Dryomov 		goto out;
508115228edeSAlex Elder 
5082e8f59b59SIlya Dryomov 	/*
5083e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
5084e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
5085e8f59b59SIlya Dryomov 	 */
5086e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
5087e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5088e8f59b59SIlya Dryomov 		if (ret)
508973e39e4dSIlya Dryomov 			goto out;
5090e8f59b59SIlya Dryomov 	}
5091e8f59b59SIlya Dryomov 
5092f3c0e459SIlya Dryomov 	if (!rbd_is_snap(rbd_dev)) {
50935ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
50945ff1108cSIlya Dryomov 	} else {
50955ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
509615228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
50975ff1108cSIlya Dryomov 	}
50985ff1108cSIlya Dryomov 
509973e39e4dSIlya Dryomov out:
5100cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
510173e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
51029875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
51031fe5e993SAlex Elder 
510473e39e4dSIlya Dryomov 	return ret;
51051fe5e993SAlex Elder }
51061fe5e993SAlex Elder 
5107d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
5108d6296d39SChristoph Hellwig 		unsigned int hctx_idx, unsigned int numa_node)
51097ad18afaSChristoph Hellwig {
51107ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
51117ad18afaSChristoph Hellwig 
51127ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
51137ad18afaSChristoph Hellwig 	return 0;
51147ad18afaSChristoph Hellwig }
51157ad18afaSChristoph Hellwig 
/*
 * blk-mq operations for rbd request queues: requests are dispatched via
 * rbd_queue_rq() and each request is set up by rbd_init_request().
 */
static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};
51207ad18afaSChristoph Hellwig 
5121602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
5122602adf40SYehuda Sadeh {
5123602adf40SYehuda Sadeh 	struct gendisk *disk;
5124602adf40SYehuda Sadeh 	struct request_queue *q;
5125420efbdfSIlya Dryomov 	unsigned int objset_bytes =
5126420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
51277ad18afaSChristoph Hellwig 	int err;
5128602adf40SYehuda Sadeh 
5129602adf40SYehuda Sadeh 	/* create gendisk info */
51307e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
51317e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
51327e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
5133602adf40SYehuda Sadeh 	if (!disk)
51341fcdb8aaSAlex Elder 		return -ENOMEM;
5135602adf40SYehuda Sadeh 
5136f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
5137de71a297SAlex Elder 		 rbd_dev->dev_id);
5138602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
5139dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
51407e513d43SIlya Dryomov 	if (single_major)
51417e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
5142602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
5143602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
5144602adf40SYehuda Sadeh 
51457ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
51467ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
5147b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
51487ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
514956d18f62SMing Lei 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
51507ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
51517ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
51527ad18afaSChristoph Hellwig 
51537ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
51547ad18afaSChristoph Hellwig 	if (err)
5155602adf40SYehuda Sadeh 		goto out_disk;
5156029bcbd8SJosh Durgin 
51577ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
51587ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
51597ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
51607ad18afaSChristoph Hellwig 		goto out_tag_set;
51617ad18afaSChristoph Hellwig 	}
51627ad18afaSChristoph Hellwig 
51638b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5164d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5165593a9e7bSAlex Elder 
5166420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
51670d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
516821acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
516924f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
517016d80c54SIlya Dryomov 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
517116d80c54SIlya Dryomov 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5172029bcbd8SJosh Durgin 
5173d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
51748b904b5bSBart Van Assche 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
517516d80c54SIlya Dryomov 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5176420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5177420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5178d9360540SIlya Dryomov 	}
517990e98c52SGuangliang Zhao 
5180bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5181dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
5182bae818eeSRonny Hegewald 
51835769ed0cSIlya Dryomov 	/*
51845769ed0cSIlya Dryomov 	 * disk_release() expects a queue ref from add_disk() and will
51855769ed0cSIlya Dryomov 	 * put it.  Hold an extra ref until add_disk() is called.
51865769ed0cSIlya Dryomov 	 */
51875769ed0cSIlya Dryomov 	WARN_ON(!blk_get_queue(q));
5188602adf40SYehuda Sadeh 	disk->queue = q;
5189602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
5190602adf40SYehuda Sadeh 
5191602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
5192602adf40SYehuda Sadeh 
5193602adf40SYehuda Sadeh 	return 0;
51947ad18afaSChristoph Hellwig out_tag_set:
51957ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
5196602adf40SYehuda Sadeh out_disk:
5197602adf40SYehuda Sadeh 	put_disk(disk);
51987ad18afaSChristoph Hellwig 	return err;
5199602adf40SYehuda Sadeh }
5200602adf40SYehuda Sadeh 
5201dfc5606dSYehuda Sadeh /*
5202dfc5606dSYehuda Sadeh   sysfs
5203dfc5606dSYehuda Sadeh */
5204602adf40SYehuda Sadeh 
/* Map an embedded struct device back to the rbd_device that contains it. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
5209593a9e7bSAlex Elder 
5210dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
5211dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5212602adf40SYehuda Sadeh {
5213593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5214dfc5606dSYehuda Sadeh 
5215fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
5216fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
5217602adf40SYehuda Sadeh }
5218602adf40SYehuda Sadeh 
521934b13184SAlex Elder /*
522034b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
522134b13184SAlex Elder  * necessarily the base image.
522234b13184SAlex Elder  */
522334b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
522434b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
522534b13184SAlex Elder {
522634b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
522734b13184SAlex Elder 
522834b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
522934b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
523034b13184SAlex Elder }
523134b13184SAlex Elder 
5232dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
5233dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
5234602adf40SYehuda Sadeh {
5235593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5236dfc5606dSYehuda Sadeh 
5237fc71d833SAlex Elder 	if (rbd_dev->major)
5238dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
5239fc71d833SAlex Elder 
5240fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
5241dd82fff1SIlya Dryomov }
5242fc71d833SAlex Elder 
5243dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
5244dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
5245dd82fff1SIlya Dryomov {
5246dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5247dd82fff1SIlya Dryomov 
5248dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
5249dfc5606dSYehuda Sadeh }
5250dfc5606dSYehuda Sadeh 
5251005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
5252005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
5253005a07bfSIlya Dryomov {
5254005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5255005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
5256005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
5257005a07bfSIlya Dryomov 
5258005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5259005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
5260005a07bfSIlya Dryomov }
5261005a07bfSIlya Dryomov 
5262dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
5263dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
5264dfc5606dSYehuda Sadeh {
5265593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5266dfc5606dSYehuda Sadeh 
52671dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
5268033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
5269dfc5606dSYehuda Sadeh }
5270dfc5606dSYehuda Sadeh 
5271267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
5272267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
5273267fb90bSMike Christie {
5274267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5275267fb90bSMike Christie 
5276267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5277267fb90bSMike Christie }
5278267fb90bSMike Christie 
52790d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
52800d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
52810d6d1e9cSMike Christie {
52820d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
52830d6d1e9cSMike Christie 
52840d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5285dfc5606dSYehuda Sadeh }
5286dfc5606dSYehuda Sadeh 
5287dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
5288dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5289dfc5606dSYehuda Sadeh {
5290593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5291dfc5606dSYehuda Sadeh 
52920d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5293dfc5606dSYehuda Sadeh }
5294dfc5606dSYehuda Sadeh 
52959bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
52969bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
52979bb2f334SAlex Elder {
52989bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
52999bb2f334SAlex Elder 
53000d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
53010d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
53029bb2f334SAlex Elder }
53039bb2f334SAlex Elder 
5304b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
5305b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
5306b26c047bSIlya Dryomov {
5307b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5308b26c047bSIlya Dryomov 
5309b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5310b26c047bSIlya Dryomov }
5311b26c047bSIlya Dryomov 
5312dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
5313dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5314dfc5606dSYehuda Sadeh {
5315593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5316dfc5606dSYehuda Sadeh 
5317a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
53180d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5319a92ffdf8SAlex Elder 
5320a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
5321dfc5606dSYehuda Sadeh }
5322dfc5606dSYehuda Sadeh 
5323589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
5324589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
5325589d30e0SAlex Elder {
5326589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5327589d30e0SAlex Elder 
53280d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5329589d30e0SAlex Elder }
5330589d30e0SAlex Elder 
533134b13184SAlex Elder /*
533234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
533334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
533434b13184SAlex Elder  */
5335dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
5336dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
5337dfc5606dSYehuda Sadeh 			     char *buf)
5338dfc5606dSYehuda Sadeh {
5339593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5340dfc5606dSYehuda Sadeh 
53410d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5342dfc5606dSYehuda Sadeh }
5343dfc5606dSYehuda Sadeh 
534492a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
534592a58671SMike Christie 				struct device_attribute *attr, char *buf)
534692a58671SMike Christie {
534792a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
534892a58671SMike Christie 
534992a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
535092a58671SMike Christie }
535192a58671SMike Christie 
535286b00e0dSAlex Elder /*
5353ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
5354ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
5355ff96128fSIlya Dryomov  * image)".
535686b00e0dSAlex Elder  */
535786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
535886b00e0dSAlex Elder 			       struct device_attribute *attr,
535986b00e0dSAlex Elder 			       char *buf)
536086b00e0dSAlex Elder {
536186b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5362ff96128fSIlya Dryomov 	ssize_t count = 0;
536386b00e0dSAlex Elder 
5364ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
536586b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
536686b00e0dSAlex Elder 
5367ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5368ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
536986b00e0dSAlex Elder 
5370ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
5371ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
5372e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
5373ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
5374ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
5375ff96128fSIlya Dryomov 			    "overlap %llu\n",
5376ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
5377ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
5378e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
5379ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
5380ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
5381ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
5382ff96128fSIlya Dryomov 	}
538386b00e0dSAlex Elder 
538486b00e0dSAlex Elder 	return count;
538586b00e0dSAlex Elder }
538686b00e0dSAlex Elder 
5387dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
5388dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
5389dfc5606dSYehuda Sadeh 				 const char *buf,
5390dfc5606dSYehuda Sadeh 				 size_t size)
5391dfc5606dSYehuda Sadeh {
5392593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5393b813623aSAlex Elder 	int ret;
5394602adf40SYehuda Sadeh 
5395cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
5396e627db08SAlex Elder 	if (ret)
539752bb1f9bSIlya Dryomov 		return ret;
5398b813623aSAlex Elder 
539952bb1f9bSIlya Dryomov 	return size;
5400dfc5606dSYehuda Sadeh }
5401602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All are world-readable (0444) except
 * config_info, which is owner-read-only (0400), and refresh, which is
 * owner-write-only (0200) and has a store handler instead of a show
 * handler.
 */
static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5419dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the attributes exposed through rbd_attr_group. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
5440dfc5606dSYehuda Sadeh 
/* Unnamed attribute group wrapping rbd_attrs. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};
5444dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
5449dfc5606dSYehuda Sadeh 
/* Defined below; declared here so rbd_device_type can reference it. */
static void rbd_dev_release(struct device *dev);

/*
 * Device type assigned to every rbd_device's embedded struct device:
 * supplies the sysfs attribute groups and the release callback.
 */
static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};
5457dfc5606dSYehuda Sadeh 
54588b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
54598b8fb99cSAlex Elder {
54608b8fb99cSAlex Elder 	kref_get(&spec->kref);
54618b8fb99cSAlex Elder 
54628b8fb99cSAlex Elder 	return spec;
54638b8fb99cSAlex Elder }
54648b8fb99cSAlex Elder 
54658b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
54668b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
54678b8fb99cSAlex Elder {
54688b8fb99cSAlex Elder 	if (spec)
54698b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
54708b8fb99cSAlex Elder }
54718b8fb99cSAlex Elder 
54728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
54738b8fb99cSAlex Elder {
54748b8fb99cSAlex Elder 	struct rbd_spec *spec;
54758b8fb99cSAlex Elder 
54768b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
54778b8fb99cSAlex Elder 	if (!spec)
54788b8fb99cSAlex Elder 		return NULL;
547904077599SIlya Dryomov 
548004077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
548104077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
54828b8fb99cSAlex Elder 	kref_init(&spec->kref);
54838b8fb99cSAlex Elder 
54848b8fb99cSAlex Elder 	return spec;
54858b8fb99cSAlex Elder }
54868b8fb99cSAlex Elder 
54878b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
54888b8fb99cSAlex Elder {
54898b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
54908b8fb99cSAlex Elder 
54918b8fb99cSAlex Elder 	kfree(spec->pool_name);
5492b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
54938b8fb99cSAlex Elder 	kfree(spec->image_id);
54948b8fb99cSAlex Elder 	kfree(spec->image_name);
54958b8fb99cSAlex Elder 	kfree(spec->snap_name);
54968b8fb99cSAlex Elder 	kfree(spec);
54978b8fb99cSAlex Elder }
54988b8fb99cSAlex Elder 
/*
 * Free an rbd_device and everything it holds directly: the header
 * oid/oloc, config_info, its client and spec references, opts, and the
 * structure itself.  Warns if the watch is still registered or the
 * lock is not in the unlocked state.
 */
static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	/* The device is expected to be fully quiesced by now. */
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	/* Drop the references/ownership taken at creation time. */
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}
55131643dfa4SIlya Dryomov 
55141643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
55151643dfa4SIlya Dryomov {
55161643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
55171643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
55181643dfa4SIlya Dryomov 
55191643dfa4SIlya Dryomov 	if (need_put) {
55201643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
55211643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
55221643dfa4SIlya Dryomov 	}
55231643dfa4SIlya Dryomov 
55241643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
5525dd5ac32dSIlya Dryomov 
5526dd5ac32dSIlya Dryomov 	/*
5527dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
5528dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
5529dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
5530dd5ac32dSIlya Dryomov 	 */
5531dd5ac32dSIlya Dryomov 	if (need_put)
5532dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
5533dd5ac32dSIlya Dryomov }
5534dd5ac32dSIlya Dryomov 
/*
 * Allocate an rbd_device and initialise its locks, lists, work items
 * and embedded struct device.  @rbdc and @spec are stored in the new
 * device as-is (no extra reference is taken here).  ->opts, ->dev_id
 * and ->task_wq are left zeroed.
 *
 * Returns the new device, or NULL if the allocation fails.
 */
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	/* header object location: same pool (and namespace) as the spec */
	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		/* a namespace, when present, is expected to be non-empty */
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	/* watch state */
	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	/* lock state */
	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	/* embedded device: released through rbd_device_type.release */
	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}
55861643dfa4SIlya Dryomov 
5587dd5ac32dSIlya Dryomov /*
55881643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
5589dd5ac32dSIlya Dryomov  */
55901643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
55911643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
55921643dfa4SIlya Dryomov 					 struct rbd_options *opts)
55931643dfa4SIlya Dryomov {
55941643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
55951643dfa4SIlya Dryomov 
55961643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
55971643dfa4SIlya Dryomov 	if (!rbd_dev)
55981643dfa4SIlya Dryomov 		return NULL;
55991643dfa4SIlya Dryomov 
56001643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
56011643dfa4SIlya Dryomov 
56021643dfa4SIlya Dryomov 	/* get an id and fill in device name */
56031643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
56041643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
56051643dfa4SIlya Dryomov 					 GFP_KERNEL);
56061643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
56071643dfa4SIlya Dryomov 		goto fail_rbd_dev;
56081643dfa4SIlya Dryomov 
56091643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
56101643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
56111643dfa4SIlya Dryomov 						   rbd_dev->name);
56121643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
56131643dfa4SIlya Dryomov 		goto fail_dev_id;
56141643dfa4SIlya Dryomov 
56151643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
5616dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
5617dd5ac32dSIlya Dryomov 
56181643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5619c53d5893SAlex Elder 	return rbd_dev;
56201643dfa4SIlya Dryomov 
56211643dfa4SIlya Dryomov fail_dev_id:
56221643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
56231643dfa4SIlya Dryomov fail_rbd_dev:
56241643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
56251643dfa4SIlya Dryomov 	return NULL;
5626c53d5893SAlex Elder }
5627c53d5893SAlex Elder 
5628c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5629c53d5893SAlex Elder {
5630dd5ac32dSIlya Dryomov 	if (rbd_dev)
5631dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
5632c53d5893SAlex Elder }
5633c53d5893SAlex Elder 
5634dfc5606dSYehuda Sadeh /*
56359d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
56369d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
56379d475de5SAlex Elder  * image.
56389d475de5SAlex Elder  */
56399d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
56409d475de5SAlex Elder 				u8 *order, u64 *snap_size)
56419d475de5SAlex Elder {
56429d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
56439d475de5SAlex Elder 	int ret;
56449d475de5SAlex Elder 	struct {
56459d475de5SAlex Elder 		u8 order;
56469d475de5SAlex Elder 		__le64 size;
56479d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
56489d475de5SAlex Elder 
5649ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5650ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
56514157976bSAlex Elder 				  &snapid, sizeof(snapid),
5652e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
565336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
56549d475de5SAlex Elder 	if (ret < 0)
56559d475de5SAlex Elder 		return ret;
565657385b51SAlex Elder 	if (ret < sizeof (size_buf))
565757385b51SAlex Elder 		return -ERANGE;
56589d475de5SAlex Elder 
5659c3545579SJosh Durgin 	if (order) {
56609d475de5SAlex Elder 		*order = size_buf.order;
5661c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
5662c3545579SJosh Durgin 	}
56639d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
56649d475de5SAlex Elder 
5665c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5666c3545579SJosh Durgin 		(unsigned long long)snap_id,
56679d475de5SAlex Elder 		(unsigned long long)*snap_size);
56689d475de5SAlex Elder 
56699d475de5SAlex Elder 	return 0;
56709d475de5SAlex Elder }
56719d475de5SAlex Elder 
56729d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
56739d475de5SAlex Elder {
56749d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
56759d475de5SAlex Elder 					&rbd_dev->header.obj_order,
56769d475de5SAlex Elder 					&rbd_dev->header.image_size);
56779d475de5SAlex Elder }
56789d475de5SAlex Elder 
56791e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
56801e130199SAlex Elder {
56815435d206SDongsheng Yang 	size_t size;
56821e130199SAlex Elder 	void *reply_buf;
56831e130199SAlex Elder 	int ret;
56841e130199SAlex Elder 	void *p;
56851e130199SAlex Elder 
56865435d206SDongsheng Yang 	/* Response will be an encoded string, which includes a length */
56875435d206SDongsheng Yang 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
56885435d206SDongsheng Yang 	reply_buf = kzalloc(size, GFP_KERNEL);
56891e130199SAlex Elder 	if (!reply_buf)
56901e130199SAlex Elder 		return -ENOMEM;
56911e130199SAlex Elder 
5692ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5693ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
56945435d206SDongsheng Yang 				  NULL, 0, reply_buf, size);
569536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
56961e130199SAlex Elder 	if (ret < 0)
56971e130199SAlex Elder 		goto out;
56981e130199SAlex Elder 
56991e130199SAlex Elder 	p = reply_buf;
57001e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
570157385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
570257385b51SAlex Elder 	ret = 0;
57031e130199SAlex Elder 
57041e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
57051e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
57061e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
57071e130199SAlex Elder 	} else {
57081e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
57091e130199SAlex Elder 	}
57101e130199SAlex Elder out:
57111e130199SAlex Elder 	kfree(reply_buf);
57121e130199SAlex Elder 
57131e130199SAlex Elder 	return ret;
57141e130199SAlex Elder }
57151e130199SAlex Elder 
5716b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5717b1b5402aSAlex Elder 		u64 *snap_features)
5718b1b5402aSAlex Elder {
5719b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
5720b1b5402aSAlex Elder 	struct {
5721b1b5402aSAlex Elder 		__le64 features;
5722b1b5402aSAlex Elder 		__le64 incompat;
57234157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
5724d3767f0fSIlya Dryomov 	u64 unsup;
5725b1b5402aSAlex Elder 	int ret;
5726b1b5402aSAlex Elder 
5727ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5728ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
57294157976bSAlex Elder 				  &snapid, sizeof(snapid),
5730e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
573136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5732b1b5402aSAlex Elder 	if (ret < 0)
5733b1b5402aSAlex Elder 		return ret;
573457385b51SAlex Elder 	if (ret < sizeof (features_buf))
573557385b51SAlex Elder 		return -ERANGE;
5736d889140cSAlex Elder 
5737d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5738d3767f0fSIlya Dryomov 	if (unsup) {
5739d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5740d3767f0fSIlya Dryomov 			 unsup);
5741b8f5c6edSAlex Elder 		return -ENXIO;
5742d3767f0fSIlya Dryomov 	}
5743d889140cSAlex Elder 
5744b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
5745b1b5402aSAlex Elder 
5746b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5747b1b5402aSAlex Elder 		(unsigned long long)snap_id,
5748b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
5749b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5750b1b5402aSAlex Elder 
5751b1b5402aSAlex Elder 	return 0;
5752b1b5402aSAlex Elder }
5753b1b5402aSAlex Elder 
5754b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5755b1b5402aSAlex Elder {
5756b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5757b1b5402aSAlex Elder 						&rbd_dev->header.features);
5758b1b5402aSAlex Elder }
5759b1b5402aSAlex Elder 
576022e8bd51SIlya Dryomov /*
576122e8bd51SIlya Dryomov  * These are generic image flags, but since they are used only for
576222e8bd51SIlya Dryomov  * object map, store them in rbd_dev->object_map_flags.
576322e8bd51SIlya Dryomov  *
576422e8bd51SIlya Dryomov  * For the same reason, this function is called only on object map
576522e8bd51SIlya Dryomov  * (re)load and not on header refresh.
576622e8bd51SIlya Dryomov  */
576722e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
576822e8bd51SIlya Dryomov {
576922e8bd51SIlya Dryomov 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
577022e8bd51SIlya Dryomov 	__le64 flags;
577122e8bd51SIlya Dryomov 	int ret;
577222e8bd51SIlya Dryomov 
577322e8bd51SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
577422e8bd51SIlya Dryomov 				  &rbd_dev->header_oloc, "get_flags",
577522e8bd51SIlya Dryomov 				  &snapid, sizeof(snapid),
577622e8bd51SIlya Dryomov 				  &flags, sizeof(flags));
577722e8bd51SIlya Dryomov 	if (ret < 0)
577822e8bd51SIlya Dryomov 		return ret;
577922e8bd51SIlya Dryomov 	if (ret < sizeof(flags))
578022e8bd51SIlya Dryomov 		return -EBADMSG;
578122e8bd51SIlya Dryomov 
578222e8bd51SIlya Dryomov 	rbd_dev->object_map_flags = le64_to_cpu(flags);
578322e8bd51SIlya Dryomov 	return 0;
578422e8bd51SIlya Dryomov }
578522e8bd51SIlya Dryomov 
/*
 * Parent image description as filled in by get_parent_info().
 *
 * pool_ns and image_id point to allocated strings (or are NULL); the
 * owner of the struct is expected to kfree() them, as
 * rbd_dev_v2_parent_info() does after taking over the ones it keeps.
 */
struct parent_image_info {
	u64		pool_id;	/* CEPH_NOPOOL is treated as "no parent" */
	const char	*pool_ns;	/* set only by the "parent_get" path */
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;	/* forced to true on the legacy path */
	u64		overlap;	/* decoded only when has_overlap is set */
};
5795eb3b2d6bSIlya Dryomov 
5796eb3b2d6bSIlya Dryomov /*
5797eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
5798eb3b2d6bSIlya Dryomov  */
5799e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
5800e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
5801e92c0eafSIlya Dryomov {
5802e92c0eafSIlya Dryomov 	u8 struct_v;
5803e92c0eafSIlya Dryomov 	u32 struct_len;
5804e92c0eafSIlya Dryomov 	int ret;
5805e92c0eafSIlya Dryomov 
5806e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5807e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
5808e92c0eafSIlya Dryomov 	if (ret)
5809e92c0eafSIlya Dryomov 		return ret;
5810e92c0eafSIlya Dryomov 
5811e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5812e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5813e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
5814e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
5815e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
5816e92c0eafSIlya Dryomov 		return ret;
5817e92c0eafSIlya Dryomov 	}
5818e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5819e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5820e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5821e92c0eafSIlya Dryomov 		pii->image_id = NULL;
5822e92c0eafSIlya Dryomov 		return ret;
5823e92c0eafSIlya Dryomov 	}
5824e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5825e92c0eafSIlya Dryomov 	return 0;
5826e92c0eafSIlya Dryomov 
5827e92c0eafSIlya Dryomov e_inval:
5828e92c0eafSIlya Dryomov 	return -EINVAL;
5829e92c0eafSIlya Dryomov }
5830e92c0eafSIlya Dryomov 
5831e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
5832e92c0eafSIlya Dryomov 			     struct page *req_page,
5833e92c0eafSIlya Dryomov 			     struct page *reply_page,
5834e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
5835e92c0eafSIlya Dryomov {
5836e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5837e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5838e92c0eafSIlya Dryomov 	void *p, *end;
5839e92c0eafSIlya Dryomov 	int ret;
5840e92c0eafSIlya Dryomov 
5841e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5842e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
584368ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5844e92c0eafSIlya Dryomov 	if (ret)
5845e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
5846e92c0eafSIlya Dryomov 
5847e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5848e92c0eafSIlya Dryomov 	end = p + reply_len;
5849e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
5850e92c0eafSIlya Dryomov 	if (ret)
5851e92c0eafSIlya Dryomov 		return ret;
5852e92c0eafSIlya Dryomov 
5853e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5854e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
585568ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5856e92c0eafSIlya Dryomov 	if (ret)
5857e92c0eafSIlya Dryomov 		return ret;
5858e92c0eafSIlya Dryomov 
5859e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5860e92c0eafSIlya Dryomov 	end = p + reply_len;
5861e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5862e92c0eafSIlya Dryomov 	if (pii->has_overlap)
5863e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5864e92c0eafSIlya Dryomov 
5865e92c0eafSIlya Dryomov 	return 0;
5866e92c0eafSIlya Dryomov 
5867e92c0eafSIlya Dryomov e_inval:
5868e92c0eafSIlya Dryomov 	return -EINVAL;
5869e92c0eafSIlya Dryomov }
5870e92c0eafSIlya Dryomov 
5871e92c0eafSIlya Dryomov /*
5872e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
5873e92c0eafSIlya Dryomov  */
5874eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5875eb3b2d6bSIlya Dryomov 				    struct page *req_page,
5876eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
5877eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
5878eb3b2d6bSIlya Dryomov {
5879eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5880eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5881eb3b2d6bSIlya Dryomov 	void *p, *end;
5882eb3b2d6bSIlya Dryomov 	int ret;
5883eb3b2d6bSIlya Dryomov 
5884eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5885eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
588668ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5887eb3b2d6bSIlya Dryomov 	if (ret)
5888eb3b2d6bSIlya Dryomov 		return ret;
5889eb3b2d6bSIlya Dryomov 
5890eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
5891eb3b2d6bSIlya Dryomov 	end = p + reply_len;
5892eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5893eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5894eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5895eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5896eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
5897eb3b2d6bSIlya Dryomov 		return ret;
5898eb3b2d6bSIlya Dryomov 	}
5899eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5900e92c0eafSIlya Dryomov 	pii->has_overlap = true;
5901eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5902eb3b2d6bSIlya Dryomov 
5903eb3b2d6bSIlya Dryomov 	return 0;
5904eb3b2d6bSIlya Dryomov 
5905eb3b2d6bSIlya Dryomov e_inval:
5906eb3b2d6bSIlya Dryomov 	return -EINVAL;
5907eb3b2d6bSIlya Dryomov }
5908eb3b2d6bSIlya Dryomov 
5909eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev,
5910eb3b2d6bSIlya Dryomov 			   struct parent_image_info *pii)
5911eb3b2d6bSIlya Dryomov {
5912eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
5913eb3b2d6bSIlya Dryomov 	void *p;
5914eb3b2d6bSIlya Dryomov 	int ret;
5915eb3b2d6bSIlya Dryomov 
5916eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
5917eb3b2d6bSIlya Dryomov 	if (!req_page)
5918eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5919eb3b2d6bSIlya Dryomov 
5920eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
5921eb3b2d6bSIlya Dryomov 	if (!reply_page) {
5922eb3b2d6bSIlya Dryomov 		__free_page(req_page);
5923eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5924eb3b2d6bSIlya Dryomov 	}
5925eb3b2d6bSIlya Dryomov 
5926eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
5927eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5928e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5929e92c0eafSIlya Dryomov 	if (ret > 0)
5930e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5931e92c0eafSIlya Dryomov 					       pii);
5932eb3b2d6bSIlya Dryomov 
5933eb3b2d6bSIlya Dryomov 	__free_page(req_page);
5934eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
5935eb3b2d6bSIlya Dryomov 	return ret;
5936eb3b2d6bSIlya Dryomov }
5937eb3b2d6bSIlya Dryomov 
593886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
593986b00e0dSAlex Elder {
594086b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
5941eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
594286b00e0dSAlex Elder 	int ret;
594386b00e0dSAlex Elder 
594486b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
594586b00e0dSAlex Elder 	if (!parent_spec)
594686b00e0dSAlex Elder 		return -ENOMEM;
594786b00e0dSAlex Elder 
5948eb3b2d6bSIlya Dryomov 	ret = get_parent_info(rbd_dev, &pii);
5949eb3b2d6bSIlya Dryomov 	if (ret)
595086b00e0dSAlex Elder 		goto out_err;
595186b00e0dSAlex Elder 
5952e92c0eafSIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5953e92c0eafSIlya Dryomov 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5954e92c0eafSIlya Dryomov 	     pii.has_overlap, pii.overlap);
5955eb3b2d6bSIlya Dryomov 
5956e92c0eafSIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5957392a9dadSAlex Elder 		/*
5958392a9dadSAlex Elder 		 * Either the parent never existed, or we have
5959392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
5960392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
5961392a9dadSAlex Elder 		 * layered image disappears we immediately set the
5962392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
5963392a9dadSAlex Elder 		 * requests will be treated as if the image had no
5964392a9dadSAlex Elder 		 * parent.
5965e92c0eafSIlya Dryomov 		 *
5966e92c0eafSIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
5967e92c0eafSIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
5968e92c0eafSIlya Dryomov 		 * snapshot record.
5969392a9dadSAlex Elder 		 */
5970392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
5971392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
5972392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
5973392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
5974392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
5975392a9dadSAlex Elder 		}
5976392a9dadSAlex Elder 
597786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
5978392a9dadSAlex Elder 	}
597986b00e0dSAlex Elder 
59800903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
59810903e875SAlex Elder 
59820903e875SAlex Elder 	ret = -EIO;
5983eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
59849584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5985eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
598657385b51SAlex Elder 		goto out_err;
5987c0cd10dbSAlex Elder 	}
59880903e875SAlex Elder 
59893b5cf2a2SAlex Elder 	/*
59903b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
59913b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
59923b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
59933b5cf2a2SAlex Elder 	 */
59943b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
5995eb3b2d6bSIlya Dryomov 		parent_spec->pool_id = pii.pool_id;
5996e92c0eafSIlya Dryomov 		if (pii.pool_ns && *pii.pool_ns) {
5997e92c0eafSIlya Dryomov 			parent_spec->pool_ns = pii.pool_ns;
5998e92c0eafSIlya Dryomov 			pii.pool_ns = NULL;
5999e92c0eafSIlya Dryomov 		}
6000eb3b2d6bSIlya Dryomov 		parent_spec->image_id = pii.image_id;
6001eb3b2d6bSIlya Dryomov 		pii.image_id = NULL;
6002eb3b2d6bSIlya Dryomov 		parent_spec->snap_id = pii.snap_id;
6003b26c047bSIlya Dryomov 
600486b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
600586b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
60063b5cf2a2SAlex Elder 	}
60073b5cf2a2SAlex Elder 
60083b5cf2a2SAlex Elder 	/*
6009cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
6010cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
60113b5cf2a2SAlex Elder 	 */
6012eb3b2d6bSIlya Dryomov 	if (!pii.overlap) {
60133b5cf2a2SAlex Elder 		if (parent_spec) {
6014cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
6015cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
6016cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
6017cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
601870cf49cfSAlex Elder 		} else {
6019cf32bd9cSIlya Dryomov 			/* initial probe */
6020cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
60213b5cf2a2SAlex Elder 		}
602270cf49cfSAlex Elder 	}
6023eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
6024cf32bd9cSIlya Dryomov 
602586b00e0dSAlex Elder out:
602686b00e0dSAlex Elder 	ret = 0;
602786b00e0dSAlex Elder out_err:
6028e92c0eafSIlya Dryomov 	kfree(pii.pool_ns);
6029eb3b2d6bSIlya Dryomov 	kfree(pii.image_id);
603086b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
603186b00e0dSAlex Elder 	return ret;
603286b00e0dSAlex Elder }
603386b00e0dSAlex Elder 
6034cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
6035cc070d59SAlex Elder {
6036cc070d59SAlex Elder 	struct {
6037cc070d59SAlex Elder 		__le64 stripe_unit;
6038cc070d59SAlex Elder 		__le64 stripe_count;
6039cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
6040cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
6041cc070d59SAlex Elder 	void *p;
6042cc070d59SAlex Elder 	int ret;
6043cc070d59SAlex Elder 
6044ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6045ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
6046ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
6047cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6048cc070d59SAlex Elder 	if (ret < 0)
6049cc070d59SAlex Elder 		return ret;
6050cc070d59SAlex Elder 	if (ret < size)
6051cc070d59SAlex Elder 		return -ERANGE;
6052cc070d59SAlex Elder 
6053cc070d59SAlex Elder 	p = &striping_info_buf;
6054b1331852SIlya Dryomov 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
6055b1331852SIlya Dryomov 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
6056cc070d59SAlex Elder 	return 0;
6057cc070d59SAlex Elder }
6058cc070d59SAlex Elder 
60597e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
60607e97332eSIlya Dryomov {
60617e97332eSIlya Dryomov 	__le64 data_pool_id;
60627e97332eSIlya Dryomov 	int ret;
60637e97332eSIlya Dryomov 
60647e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
60657e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
60667e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
60677e97332eSIlya Dryomov 	if (ret < 0)
60687e97332eSIlya Dryomov 		return ret;
60697e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
60707e97332eSIlya Dryomov 		return -EBADMSG;
60717e97332eSIlya Dryomov 
60727e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
60737e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
60747e97332eSIlya Dryomov 	return 0;
60757e97332eSIlya Dryomov }
60767e97332eSIlya Dryomov 
60779e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
60789e15b77dSAlex Elder {
6079ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
60809e15b77dSAlex Elder 	size_t image_id_size;
60819e15b77dSAlex Elder 	char *image_id;
60829e15b77dSAlex Elder 	void *p;
60839e15b77dSAlex Elder 	void *end;
60849e15b77dSAlex Elder 	size_t size;
60859e15b77dSAlex Elder 	void *reply_buf = NULL;
60869e15b77dSAlex Elder 	size_t len = 0;
60879e15b77dSAlex Elder 	char *image_name = NULL;
60889e15b77dSAlex Elder 	int ret;
60899e15b77dSAlex Elder 
60909e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
60919e15b77dSAlex Elder 
609269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
609369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
60949e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
60959e15b77dSAlex Elder 	if (!image_id)
60969e15b77dSAlex Elder 		return NULL;
60979e15b77dSAlex Elder 
60989e15b77dSAlex Elder 	p = image_id;
60994157976bSAlex Elder 	end = image_id + image_id_size;
610069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
61019e15b77dSAlex Elder 
61029e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
61039e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
61049e15b77dSAlex Elder 	if (!reply_buf)
61059e15b77dSAlex Elder 		goto out;
61069e15b77dSAlex Elder 
6107ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
6108ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6109ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
6110e2a58ee5SAlex Elder 				  reply_buf, size);
61119e15b77dSAlex Elder 	if (ret < 0)
61129e15b77dSAlex Elder 		goto out;
61139e15b77dSAlex Elder 	p = reply_buf;
6114f40eb349SAlex Elder 	end = reply_buf + ret;
6115f40eb349SAlex Elder 
61169e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
61179e15b77dSAlex Elder 	if (IS_ERR(image_name))
61189e15b77dSAlex Elder 		image_name = NULL;
61199e15b77dSAlex Elder 	else
61209e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
61219e15b77dSAlex Elder out:
61229e15b77dSAlex Elder 	kfree(reply_buf);
61239e15b77dSAlex Elder 	kfree(image_id);
61249e15b77dSAlex Elder 
61259e15b77dSAlex Elder 	return image_name;
61269e15b77dSAlex Elder }
61279e15b77dSAlex Elder 
61282ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
61292ad3d716SAlex Elder {
61302ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
61312ad3d716SAlex Elder 	const char *snap_name;
61322ad3d716SAlex Elder 	u32 which = 0;
61332ad3d716SAlex Elder 
61342ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
61352ad3d716SAlex Elder 
61362ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
61372ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
61382ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
61392ad3d716SAlex Elder 			return snapc->snaps[which];
61402ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
61412ad3d716SAlex Elder 		which++;
61422ad3d716SAlex Elder 	}
61432ad3d716SAlex Elder 	return CEPH_NOSNAP;
61442ad3d716SAlex Elder }
61452ad3d716SAlex Elder 
61462ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
61472ad3d716SAlex Elder {
61482ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
61492ad3d716SAlex Elder 	u32 which;
61502ad3d716SAlex Elder 	bool found = false;
61512ad3d716SAlex Elder 	u64 snap_id;
61522ad3d716SAlex Elder 
61532ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
61542ad3d716SAlex Elder 		const char *snap_name;
61552ad3d716SAlex Elder 
61562ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
61572ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6158efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
6159efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
6160efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
6161efadc98aSJosh Durgin 				continue;
6162efadc98aSJosh Durgin 			else
61632ad3d716SAlex Elder 				break;
6164efadc98aSJosh Durgin 		}
61652ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
61662ad3d716SAlex Elder 		kfree(snap_name);
61672ad3d716SAlex Elder 	}
61682ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
61692ad3d716SAlex Elder }
61702ad3d716SAlex Elder 
61712ad3d716SAlex Elder /*
61722ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
61732ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
61742ad3d716SAlex Elder  */
61752ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
61762ad3d716SAlex Elder {
61772ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
61782ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
61792ad3d716SAlex Elder 
61802ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
61812ad3d716SAlex Elder }
61822ad3d716SAlex Elder 
61839e15b77dSAlex Elder /*
618404077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
61859e15b77dSAlex Elder  */
618604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
618704077599SIlya Dryomov {
618804077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
618904077599SIlya Dryomov 
619004077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
619104077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
619204077599SIlya Dryomov 	rbd_assert(spec->snap_name);
619304077599SIlya Dryomov 
619404077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
619504077599SIlya Dryomov 		u64 snap_id;
619604077599SIlya Dryomov 
619704077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
619804077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
619904077599SIlya Dryomov 			return -ENOENT;
620004077599SIlya Dryomov 
620104077599SIlya Dryomov 		spec->snap_id = snap_id;
620204077599SIlya Dryomov 	} else {
620304077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
620404077599SIlya Dryomov 	}
620504077599SIlya Dryomov 
620604077599SIlya Dryomov 	return 0;
620704077599SIlya Dryomov }
620804077599SIlya Dryomov 
620904077599SIlya Dryomov /*
621004077599SIlya Dryomov  * A parent image will have all ids but none of the names.
621104077599SIlya Dryomov  *
621204077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
621304077599SIlya Dryomov  * can't figure out the name for an image id.
621404077599SIlya Dryomov  */
621504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
62169e15b77dSAlex Elder {
62172e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
62182e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
62192e9f7f1cSAlex Elder 	const char *pool_name;
62202e9f7f1cSAlex Elder 	const char *image_name;
62212e9f7f1cSAlex Elder 	const char *snap_name;
62229e15b77dSAlex Elder 	int ret;
62239e15b77dSAlex Elder 
622404077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
622504077599SIlya Dryomov 	rbd_assert(spec->image_id);
622604077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
62279e15b77dSAlex Elder 
62282e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
62299e15b77dSAlex Elder 
62302e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
62312e9f7f1cSAlex Elder 	if (!pool_name) {
62322e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6233935dc89fSAlex Elder 		return -EIO;
6234935dc89fSAlex Elder 	}
62352e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
62362e9f7f1cSAlex Elder 	if (!pool_name)
62379e15b77dSAlex Elder 		return -ENOMEM;
62389e15b77dSAlex Elder 
62399e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
62409e15b77dSAlex Elder 
62412e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
62422e9f7f1cSAlex Elder 	if (!image_name)
624306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
62449e15b77dSAlex Elder 
624504077599SIlya Dryomov 	/* Fetch the snapshot name */
62469e15b77dSAlex Elder 
62472e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6248da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
6249da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
62509e15b77dSAlex Elder 		goto out_err;
62512e9f7f1cSAlex Elder 	}
62522e9f7f1cSAlex Elder 
62532e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
62542e9f7f1cSAlex Elder 	spec->image_name = image_name;
62552e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
62569e15b77dSAlex Elder 
62579e15b77dSAlex Elder 	return 0;
625804077599SIlya Dryomov 
62599e15b77dSAlex Elder out_err:
62602e9f7f1cSAlex Elder 	kfree(image_name);
62612e9f7f1cSAlex Elder 	kfree(pool_name);
62629e15b77dSAlex Elder 	return ret;
62639e15b77dSAlex Elder }
62649e15b77dSAlex Elder 
6265cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
626635d489f9SAlex Elder {
626735d489f9SAlex Elder 	size_t size;
626835d489f9SAlex Elder 	int ret;
626935d489f9SAlex Elder 	void *reply_buf;
627035d489f9SAlex Elder 	void *p;
627135d489f9SAlex Elder 	void *end;
627235d489f9SAlex Elder 	u64 seq;
627335d489f9SAlex Elder 	u32 snap_count;
627435d489f9SAlex Elder 	struct ceph_snap_context *snapc;
627535d489f9SAlex Elder 	u32 i;
627635d489f9SAlex Elder 
627735d489f9SAlex Elder 	/*
627835d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
627935d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
628035d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
628135d489f9SAlex Elder 	 * prepared to receive.
628235d489f9SAlex Elder 	 */
628335d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
628435d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
628535d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
628635d489f9SAlex Elder 	if (!reply_buf)
628735d489f9SAlex Elder 		return -ENOMEM;
628835d489f9SAlex Elder 
6289ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6290ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
6291ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
629236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
629335d489f9SAlex Elder 	if (ret < 0)
629435d489f9SAlex Elder 		goto out;
629535d489f9SAlex Elder 
629635d489f9SAlex Elder 	p = reply_buf;
629757385b51SAlex Elder 	end = reply_buf + ret;
629857385b51SAlex Elder 	ret = -ERANGE;
629935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
630035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
630135d489f9SAlex Elder 
630235d489f9SAlex Elder 	/*
630335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
630435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
630535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
630635d489f9SAlex Elder 	 * allocate is representable in a size_t.
630735d489f9SAlex Elder 	 */
630835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
630935d489f9SAlex Elder 				 / sizeof (u64)) {
631035d489f9SAlex Elder 		ret = -EINVAL;
631135d489f9SAlex Elder 		goto out;
631235d489f9SAlex Elder 	}
631335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
631435d489f9SAlex Elder 		goto out;
6315468521c1SAlex Elder 	ret = 0;
631635d489f9SAlex Elder 
6317812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
631835d489f9SAlex Elder 	if (!snapc) {
631935d489f9SAlex Elder 		ret = -ENOMEM;
632035d489f9SAlex Elder 		goto out;
632135d489f9SAlex Elder 	}
632235d489f9SAlex Elder 	snapc->seq = seq;
632335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
632435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
632535d489f9SAlex Elder 
632649ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
632735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
632835d489f9SAlex Elder 
632935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
633035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
633135d489f9SAlex Elder out:
633235d489f9SAlex Elder 	kfree(reply_buf);
633335d489f9SAlex Elder 
633457385b51SAlex Elder 	return ret;
633535d489f9SAlex Elder }
633635d489f9SAlex Elder 
633754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
633854cac61fSAlex Elder 					u64 snap_id)
6339b8b1e2dbSAlex Elder {
6340b8b1e2dbSAlex Elder 	size_t size;
6341b8b1e2dbSAlex Elder 	void *reply_buf;
634254cac61fSAlex Elder 	__le64 snapid;
6343b8b1e2dbSAlex Elder 	int ret;
6344b8b1e2dbSAlex Elder 	void *p;
6345b8b1e2dbSAlex Elder 	void *end;
6346b8b1e2dbSAlex Elder 	char *snap_name;
6347b8b1e2dbSAlex Elder 
6348b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6349b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
6350b8b1e2dbSAlex Elder 	if (!reply_buf)
6351b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
6352b8b1e2dbSAlex Elder 
635354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
6354ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6355ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
6356ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
635736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6358f40eb349SAlex Elder 	if (ret < 0) {
6359f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
6360b8b1e2dbSAlex Elder 		goto out;
6361f40eb349SAlex Elder 	}
6362b8b1e2dbSAlex Elder 
6363b8b1e2dbSAlex Elder 	p = reply_buf;
6364f40eb349SAlex Elder 	end = reply_buf + ret;
6365e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6366f40eb349SAlex Elder 	if (IS_ERR(snap_name))
6367b8b1e2dbSAlex Elder 		goto out;
6368f40eb349SAlex Elder 
6369b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
637054cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
6371b8b1e2dbSAlex Elder out:
6372b8b1e2dbSAlex Elder 	kfree(reply_buf);
6373b8b1e2dbSAlex Elder 
6374f40eb349SAlex Elder 	return snap_name;
6375b8b1e2dbSAlex Elder }
6376b8b1e2dbSAlex Elder 
63772df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6378117973fbSAlex Elder {
63792df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
6380117973fbSAlex Elder 	int ret;
6381117973fbSAlex Elder 
63821617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
63831617e40cSJosh Durgin 	if (ret)
6384cfbf6377SAlex Elder 		return ret;
63851617e40cSJosh Durgin 
63862df3fac7SAlex Elder 	if (first_time) {
63872df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
63882df3fac7SAlex Elder 		if (ret)
6389cfbf6377SAlex Elder 			return ret;
63902df3fac7SAlex Elder 	}
63912df3fac7SAlex Elder 
6392cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
6393d194cd1dSIlya Dryomov 	if (ret && first_time) {
6394d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
6395d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
6396d194cd1dSIlya Dryomov 	}
6397117973fbSAlex Elder 
6398117973fbSAlex Elder 	return ret;
6399117973fbSAlex Elder }
6400117973fbSAlex Elder 
6401a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6402a720ae09SIlya Dryomov {
6403a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6404a720ae09SIlya Dryomov 
6405a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
6406a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
6407a720ae09SIlya Dryomov 
6408a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
6409a720ae09SIlya Dryomov }
6410a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token found there, i.e. the
 * number of consecutive non-white-space characters.  The return value
 * is 0 when no token remains.
 *
 * *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C" and
	 * "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip to start of token */
	*buf = start;

	return strcspn(start, whitespace);	/* token length */
}
6429e28fff26SAlex Elder 
6430e28fff26SAlex Elder /*
6431ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
6432ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
6433ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6434ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
6435ea3352f4SAlex Elder  *
6436ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
6437ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
6438ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
6439ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
6440ea3352f4SAlex Elder  *
6441ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
6442ea3352f4SAlex Elder  * the end of the found token.
6443ea3352f4SAlex Elder  *
6444ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
6445ea3352f4SAlex Elder  */
6446ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
6447ea3352f4SAlex Elder {
6448ea3352f4SAlex Elder 	char *dup;
6449ea3352f4SAlex Elder 	size_t len;
6450ea3352f4SAlex Elder 
6451ea3352f4SAlex Elder 	len = next_token(buf);
64524caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6453ea3352f4SAlex Elder 	if (!dup)
6454ea3352f4SAlex Elder 		return NULL;
6455ea3352f4SAlex Elder 	*(dup + len) = '\0';
6456ea3352f4SAlex Elder 	*buf += len;
6457ea3352f4SAlex Elder 
6458ea3352f4SAlex Elder 	if (lenp)
6459ea3352f4SAlex Elder 		*lenp = len;
6460ea3352f4SAlex Elder 
6461ea3352f4SAlex Elder 	return dup;
6462ea3352f4SAlex Elder }
6463ea3352f4SAlex Elder 
6464ea3352f4SAlex Elder /*
6465859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
6466859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6467859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
6468859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
6469d22f76e7SAlex Elder  *
6470859c31dfSAlex Elder  * The information extracted from these options is recorded in
6471859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
6472859c31dfSAlex Elder  * structures:
6473859c31dfSAlex Elder  *  ceph_opts
6474859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
6475859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
6476859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
6477859c31dfSAlex Elder  *  rbd_opts
6478859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
6479859c31dfSAlex Elder  *	this function; caller must release with kfree().
6480859c31dfSAlex Elder  *  spec
6481859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
6482859c31dfSAlex Elder  *	initialized by this function based on parsed options.
6483859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
6484859c31dfSAlex Elder  *
6485859c31dfSAlex Elder  * The options passed take this form:
6486859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6487859c31dfSAlex Elder  * where:
6488859c31dfSAlex Elder  *  <mon_addrs>
6489859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
6490859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
6491859c31dfSAlex Elder  *      by a port number (separated by a colon).
6492859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6493859c31dfSAlex Elder  *  <options>
6494859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
6495859c31dfSAlex Elder  *  <pool_name>
6496859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
6497859c31dfSAlex Elder  *  <image_name>
6498859c31dfSAlex Elder  *      The name of the image in that pool to map.
6499859c31dfSAlex Elder  *  <snap_id>
6500859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
6501859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
6502859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
6503859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
6504a725f65eSAlex Elder  */
6505859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
6506dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
6507859c31dfSAlex Elder 				struct rbd_options **opts,
6508859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
6509a725f65eSAlex Elder {
6510e28fff26SAlex Elder 	size_t len;
6511859c31dfSAlex Elder 	char *options;
65120ddebc0cSAlex Elder 	const char *mon_addrs;
6513ecb4dc22SAlex Elder 	char *snap_name;
65140ddebc0cSAlex Elder 	size_t mon_addrs_size;
6515c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx pctx = { 0 };
6516859c31dfSAlex Elder 	struct ceph_options *copts;
6517dc79b113SAlex Elder 	int ret;
6518e28fff26SAlex Elder 
6519e28fff26SAlex Elder 	/* The first four tokens are required */
6520e28fff26SAlex Elder 
65217ef3214aSAlex Elder 	len = next_token(&buf);
65224fb5d671SAlex Elder 	if (!len) {
65234fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
65244fb5d671SAlex Elder 		return -EINVAL;
65254fb5d671SAlex Elder 	}
65260ddebc0cSAlex Elder 	mon_addrs = buf;
6527f28e565aSAlex Elder 	mon_addrs_size = len + 1;
65287ef3214aSAlex Elder 	buf += len;
6529a725f65eSAlex Elder 
6530dc79b113SAlex Elder 	ret = -EINVAL;
6531f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
6532f28e565aSAlex Elder 	if (!options)
6533dc79b113SAlex Elder 		return -ENOMEM;
65344fb5d671SAlex Elder 	if (!*options) {
65354fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
65364fb5d671SAlex Elder 		goto out_err;
65374fb5d671SAlex Elder 	}
6538a725f65eSAlex Elder 
6539c300156bSIlya Dryomov 	pctx.spec = rbd_spec_alloc();
6540c300156bSIlya Dryomov 	if (!pctx.spec)
6541f28e565aSAlex Elder 		goto out_mem;
6542859c31dfSAlex Elder 
6543c300156bSIlya Dryomov 	pctx.spec->pool_name = dup_token(&buf, NULL);
6544c300156bSIlya Dryomov 	if (!pctx.spec->pool_name)
6545859c31dfSAlex Elder 		goto out_mem;
6546c300156bSIlya Dryomov 	if (!*pctx.spec->pool_name) {
65474fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
65484fb5d671SAlex Elder 		goto out_err;
65494fb5d671SAlex Elder 	}
6550e28fff26SAlex Elder 
6551c300156bSIlya Dryomov 	pctx.spec->image_name = dup_token(&buf, NULL);
6552c300156bSIlya Dryomov 	if (!pctx.spec->image_name)
6553f28e565aSAlex Elder 		goto out_mem;
6554c300156bSIlya Dryomov 	if (!*pctx.spec->image_name) {
65554fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
65564fb5d671SAlex Elder 		goto out_err;
65574fb5d671SAlex Elder 	}
6558e28fff26SAlex Elder 
6559f28e565aSAlex Elder 	/*
6560f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
6561f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
6562f28e565aSAlex Elder 	 */
65633feeb894SAlex Elder 	len = next_token(&buf);
6564820a5f3eSAlex Elder 	if (!len) {
65653feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
65663feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6567f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6568dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
6569f28e565aSAlex Elder 		goto out_err;
6570849b4260SAlex Elder 	}
6571ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6572ecb4dc22SAlex Elder 	if (!snap_name)
6573f28e565aSAlex Elder 		goto out_mem;
6574ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
6575c300156bSIlya Dryomov 	pctx.spec->snap_name = snap_name;
6576e5c35534SAlex Elder 
65770ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
6578e28fff26SAlex Elder 
6579c300156bSIlya Dryomov 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6580c300156bSIlya Dryomov 	if (!pctx.opts)
65814e9afebaSAlex Elder 		goto out_mem;
65824e9afebaSAlex Elder 
6583c300156bSIlya Dryomov 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6584c300156bSIlya Dryomov 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
65850c93e1b7SIlya Dryomov 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6586c300156bSIlya Dryomov 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6587c300156bSIlya Dryomov 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6588c300156bSIlya Dryomov 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6589c300156bSIlya Dryomov 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6590d22f76e7SAlex Elder 
6591859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
65920ddebc0cSAlex Elder 				   mon_addrs + mon_addrs_size - 1,
6593c300156bSIlya Dryomov 				   parse_rbd_opts_token, &pctx);
6594859c31dfSAlex Elder 	if (IS_ERR(copts)) {
6595859c31dfSAlex Elder 		ret = PTR_ERR(copts);
6596dc79b113SAlex Elder 		goto out_err;
6597dc79b113SAlex Elder 	}
6598859c31dfSAlex Elder 	kfree(options);
6599859c31dfSAlex Elder 
6600859c31dfSAlex Elder 	*ceph_opts = copts;
6601c300156bSIlya Dryomov 	*opts = pctx.opts;
6602c300156bSIlya Dryomov 	*rbd_spec = pctx.spec;
66030ddebc0cSAlex Elder 
6604dc79b113SAlex Elder 	return 0;
6605f28e565aSAlex Elder out_mem:
6606dc79b113SAlex Elder 	ret = -ENOMEM;
6607d22f76e7SAlex Elder out_err:
6608c300156bSIlya Dryomov 	kfree(pctx.opts);
6609c300156bSIlya Dryomov 	rbd_spec_put(pctx.spec);
6610f28e565aSAlex Elder 	kfree(options);
6611d22f76e7SAlex Elder 
6612dc79b113SAlex Elder 	return ret;
6613a725f65eSAlex Elder }
6614a725f65eSAlex Elder 
6615e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6616e010dd0aSIlya Dryomov {
6617e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6618e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6619e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
6620e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
6621e010dd0aSIlya Dryomov }
6622e010dd0aSIlya Dryomov 
6623637cd060SIlya Dryomov /*
6624637cd060SIlya Dryomov  * If the wait is interrupted, an error is returned even if the lock
6625637cd060SIlya Dryomov  * was successfully acquired.  rbd_dev_image_unlock() will release it
6626637cd060SIlya Dryomov  * if needed.
6627637cd060SIlya Dryomov  */
6628e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6629e010dd0aSIlya Dryomov {
6630637cd060SIlya Dryomov 	long ret;
66312f18d466SIlya Dryomov 
6632e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6633637cd060SIlya Dryomov 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6634637cd060SIlya Dryomov 			return 0;
6635637cd060SIlya Dryomov 
6636e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6637e010dd0aSIlya Dryomov 		return -EINVAL;
6638e010dd0aSIlya Dryomov 	}
6639e010dd0aSIlya Dryomov 
6640f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev))
6641637cd060SIlya Dryomov 		return 0;
6642637cd060SIlya Dryomov 
6643637cd060SIlya Dryomov 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6644637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6645637cd060SIlya Dryomov 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6646637cd060SIlya Dryomov 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
664725e6be21SDongsheng Yang 	if (ret > 0) {
6648637cd060SIlya Dryomov 		ret = rbd_dev->acquire_err;
664925e6be21SDongsheng Yang 	} else {
665025e6be21SDongsheng Yang 		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
665125e6be21SDongsheng Yang 		if (!ret)
6652637cd060SIlya Dryomov 			ret = -ETIMEDOUT;
665325e6be21SDongsheng Yang 	}
6654637cd060SIlya Dryomov 
66552f18d466SIlya Dryomov 	if (ret) {
6656637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6657637cd060SIlya Dryomov 		return ret;
6658e010dd0aSIlya Dryomov 	}
6659e010dd0aSIlya Dryomov 
6660637cd060SIlya Dryomov 	/*
6661637cd060SIlya Dryomov 	 * The lock may have been released by now, unless automatic lock
6662637cd060SIlya Dryomov 	 * transitions are disabled.
6663637cd060SIlya Dryomov 	 */
6664637cd060SIlya Dryomov 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6665e010dd0aSIlya Dryomov 	return 0;
6666e010dd0aSIlya Dryomov }
6667e010dd0aSIlya Dryomov 
666830ba1f02SIlya Dryomov /*
6669589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
6670589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
6671589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
6672589d30e0SAlex Elder  *
6673589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
6674589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
6675589d30e0SAlex Elder  * with the supplied name.
6676589d30e0SAlex Elder  *
6677589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
6678589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
6679589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
6680589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
6681589d30e0SAlex Elder  */
6682589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6683589d30e0SAlex Elder {
6684589d30e0SAlex Elder 	int ret;
6685589d30e0SAlex Elder 	size_t size;
6686ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
6687589d30e0SAlex Elder 	void *response;
6688c0fba368SAlex Elder 	char *image_id;
66892f82ee54SAlex Elder 
6690589d30e0SAlex Elder 	/*
66912c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
66922c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
6693c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
6694c0fba368SAlex Elder 	 * do still need to set the image format though.
66952c0d0a10SAlex Elder 	 */
6696c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
6697c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6698c0fba368SAlex Elder 
66992c0d0a10SAlex Elder 		return 0;
6700c0fba368SAlex Elder 	}
67012c0d0a10SAlex Elder 
67022c0d0a10SAlex Elder 	/*
6703589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
6704589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
6705589d30e0SAlex Elder 	 */
6706ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6707ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
6708ecd4a68aSIlya Dryomov 	if (ret)
6709ecd4a68aSIlya Dryomov 		return ret;
6710ecd4a68aSIlya Dryomov 
6711ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
6712589d30e0SAlex Elder 
6713589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
6714589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6715589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
6716589d30e0SAlex Elder 	if (!response) {
6717589d30e0SAlex Elder 		ret = -ENOMEM;
6718589d30e0SAlex Elder 		goto out;
6719589d30e0SAlex Elder 	}
6720589d30e0SAlex Elder 
6721c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
6722c0fba368SAlex Elder 
6723ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6724ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
67255435d206SDongsheng Yang 				  response, size);
672636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6727c0fba368SAlex Elder 	if (ret == -ENOENT) {
6728c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
6729c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
6730c0fba368SAlex Elder 		if (!ret)
6731c0fba368SAlex Elder 			rbd_dev->image_format = 1;
67327dd440c9SIlya Dryomov 	} else if (ret >= 0) {
6733c0fba368SAlex Elder 		void *p = response;
6734589d30e0SAlex Elder 
6735c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
6736979ed480SAlex Elder 						NULL, GFP_NOIO);
6737461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
6738c0fba368SAlex Elder 		if (!ret)
6739c0fba368SAlex Elder 			rbd_dev->image_format = 2;
6740c0fba368SAlex Elder 	}
6741c0fba368SAlex Elder 
6742c0fba368SAlex Elder 	if (!ret) {
6743c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
6744c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
6745589d30e0SAlex Elder 	}
6746589d30e0SAlex Elder out:
6747589d30e0SAlex Elder 	kfree(response);
6748ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
6749589d30e0SAlex Elder 	return ret;
6750589d30e0SAlex Elder }
6751589d30e0SAlex Elder 
67523abef3b3SAlex Elder /*
67533abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
67543abef3b3SAlex Elder  * call.
67553abef3b3SAlex Elder  */
67566fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
67576fd48b3bSAlex Elder {
67586fd48b3bSAlex Elder 	struct rbd_image_header	*header;
67596fd48b3bSAlex Elder 
6760a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
676122e8bd51SIlya Dryomov 	rbd_object_map_free(rbd_dev);
6762da5ef6beSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
67636fd48b3bSAlex Elder 
67646fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
67656fd48b3bSAlex Elder 
67666fd48b3bSAlex Elder 	header = &rbd_dev->header;
6767812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
67686fd48b3bSAlex Elder 	kfree(header->snap_sizes);
67696fd48b3bSAlex Elder 	kfree(header->snap_names);
67706fd48b3bSAlex Elder 	kfree(header->object_prefix);
67716fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
67726fd48b3bSAlex Elder }
67736fd48b3bSAlex Elder 
67742df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6775a30b71b9SAlex Elder {
6776a30b71b9SAlex Elder 	int ret;
6777a30b71b9SAlex Elder 
67781e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
677957385b51SAlex Elder 	if (ret)
67801e130199SAlex Elder 		goto out_err;
6781b1b5402aSAlex Elder 
67822df3fac7SAlex Elder 	/*
67832df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
67842df3fac7SAlex Elder 	 * features are assumed to never change.
67852df3fac7SAlex Elder 	 */
6786b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
678757385b51SAlex Elder 	if (ret)
6788b1b5402aSAlex Elder 		goto out_err;
678935d489f9SAlex Elder 
6790cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
6791cc070d59SAlex Elder 
6792cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6793cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
6794cc070d59SAlex Elder 		if (ret < 0)
6795cc070d59SAlex Elder 			goto out_err;
6796cc070d59SAlex Elder 	}
6797a30b71b9SAlex Elder 
67987e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
67997e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
68007e97332eSIlya Dryomov 		if (ret)
68017e97332eSIlya Dryomov 			goto out_err;
68027e97332eSIlya Dryomov 	}
68037e97332eSIlya Dryomov 
6804263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
680535152979SAlex Elder 	return 0;
6806263423f8SIlya Dryomov 
68079d475de5SAlex Elder out_err:
6808642a2537SAlex Elder 	rbd_dev->header.features = 0;
68091e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
68101e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
68119d475de5SAlex Elder 	return ret;
6812a30b71b9SAlex Elder }
6813a30b71b9SAlex Elder 
68146d69bb53SIlya Dryomov /*
68156d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
68166d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
68176d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
68186d69bb53SIlya Dryomov  */
68196d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
682083a06263SAlex Elder {
68212f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
6822124afba2SAlex Elder 	int ret;
6823124afba2SAlex Elder 
6824124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
6825124afba2SAlex Elder 		return 0;
6826124afba2SAlex Elder 
68276d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
68286d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
68296d69bb53SIlya Dryomov 		ret = -EINVAL;
68306d69bb53SIlya Dryomov 		goto out_err;
68316d69bb53SIlya Dryomov 	}
68326d69bb53SIlya Dryomov 
68331643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
68341f2c6651SIlya Dryomov 	if (!parent) {
6835124afba2SAlex Elder 		ret = -ENOMEM;
6836124afba2SAlex Elder 		goto out_err;
68371f2c6651SIlya Dryomov 	}
68381f2c6651SIlya Dryomov 
68391f2c6651SIlya Dryomov 	/*
68401f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
68411f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
68421f2c6651SIlya Dryomov 	 */
68431f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
68441f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
6845124afba2SAlex Elder 
68466d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
6847124afba2SAlex Elder 	if (ret < 0)
6848124afba2SAlex Elder 		goto out_err;
68491f2c6651SIlya Dryomov 
6850124afba2SAlex Elder 	rbd_dev->parent = parent;
6851a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
6852124afba2SAlex Elder 	return 0;
6853124afba2SAlex Elder 
68541f2c6651SIlya Dryomov out_err:
68551f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
68561f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
6857124afba2SAlex Elder 	return ret;
6858124afba2SAlex Elder }
6859124afba2SAlex Elder 
68605769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
68615769ed0cSIlya Dryomov {
68625769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
68635769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
68645769ed0cSIlya Dryomov 	if (!single_major)
68655769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
68665769ed0cSIlya Dryomov }
68675769ed0cSIlya Dryomov 
6868811c6688SIlya Dryomov /*
6869811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6870811c6688SIlya Dryomov  * upon return.
6871811c6688SIlya Dryomov  */
6872200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6873124afba2SAlex Elder {
687483a06263SAlex Elder 	int ret;
687583a06263SAlex Elder 
68769b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
687783a06263SAlex Elder 
68789b60e70bSIlya Dryomov 	if (!single_major) {
687983a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
688083a06263SAlex Elder 		if (ret < 0)
68811643dfa4SIlya Dryomov 			goto err_out_unlock;
68829b60e70bSIlya Dryomov 
688383a06263SAlex Elder 		rbd_dev->major = ret;
6884dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
68859b60e70bSIlya Dryomov 	} else {
68869b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
68879b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
68889b60e70bSIlya Dryomov 	}
688983a06263SAlex Elder 
689083a06263SAlex Elder 	/* Set up the blkdev mapping. */
689183a06263SAlex Elder 
689283a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
689383a06263SAlex Elder 	if (ret)
689483a06263SAlex Elder 		goto err_out_blkdev;
689583a06263SAlex Elder 
6896f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
68979568c93eSIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
6898f35a4deeSAlex Elder 
68995769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6900f35a4deeSAlex Elder 	if (ret)
6901da5ef6beSIlya Dryomov 		goto err_out_disk;
690283a06263SAlex Elder 
6903129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6904811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
69055769ed0cSIlya Dryomov 	return 0;
69062f82ee54SAlex Elder 
690783a06263SAlex Elder err_out_disk:
690883a06263SAlex Elder 	rbd_free_disk(rbd_dev);
690983a06263SAlex Elder err_out_blkdev:
69109b60e70bSIlya Dryomov 	if (!single_major)
691183a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6912811c6688SIlya Dryomov err_out_unlock:
6913811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
691483a06263SAlex Elder 	return ret;
691583a06263SAlex Elder }
691683a06263SAlex Elder 
6917332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6918332bb12dSAlex Elder {
6919332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6920c41d13a3SIlya Dryomov 	int ret;
6921332bb12dSAlex Elder 
6922332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6923332bb12dSAlex Elder 
6924332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6925332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6926c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6927332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6928332bb12dSAlex Elder 	else
6929c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6930332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6931c41d13a3SIlya Dryomov 
6932c41d13a3SIlya Dryomov 	return ret;
6933332bb12dSAlex Elder }
6934332bb12dSAlex Elder 
6935200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6936200a6a8bSAlex Elder {
69376fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6938fd22aef8SIlya Dryomov 	if (rbd_dev->opts)
6939fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
69406fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
69416fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
69426fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
6943200a6a8bSAlex Elder }
6944200a6a8bSAlex Elder 
6945a30b71b9SAlex Elder /*
6946a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
69471f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
69481f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
69491f3ef788SAlex Elder  * object to get detailed information about the rbd image.
6950a30b71b9SAlex Elder  */
69516d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6952a30b71b9SAlex Elder {
6953a30b71b9SAlex Elder 	int ret;
6954a30b71b9SAlex Elder 
6955a30b71b9SAlex Elder 	/*
69563abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
69573abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
69583abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
69593abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6960a30b71b9SAlex Elder 	 */
6961a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6962a30b71b9SAlex Elder 	if (ret)
6963c0fba368SAlex Elder 		return ret;
6964c0fba368SAlex Elder 
6965332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6966332bb12dSAlex Elder 	if (ret)
6967332bb12dSAlex Elder 		goto err_out_format;
6968332bb12dSAlex Elder 
69696d69bb53SIlya Dryomov 	if (!depth) {
697099d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
69711fe48023SIlya Dryomov 		if (ret) {
69721fe48023SIlya Dryomov 			if (ret == -ENOENT)
6973b26c047bSIlya Dryomov 				pr_info("image %s/%s%s%s does not exist\n",
69741fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
6975b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ?: "",
6976b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ? "/" : "",
69771fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6978c41d13a3SIlya Dryomov 			goto err_out_format;
69791f3ef788SAlex Elder 		}
69801fe48023SIlya Dryomov 	}
6981b644de2bSAlex Elder 
6982a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
69835655c4d9SAlex Elder 	if (ret)
6984b644de2bSAlex Elder 		goto err_out_watch;
6985a30b71b9SAlex Elder 
698604077599SIlya Dryomov 	/*
698704077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
698804077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
698904077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
699004077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
699104077599SIlya Dryomov 	 */
69926d69bb53SIlya Dryomov 	if (!depth)
699304077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
699404077599SIlya Dryomov 	else
699504077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
69961fe48023SIlya Dryomov 	if (ret) {
69971fe48023SIlya Dryomov 		if (ret == -ENOENT)
6998b26c047bSIlya Dryomov 			pr_info("snap %s/%s%s%s@%s does not exist\n",
69991fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
7000b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ?: "",
7001b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ? "/" : "",
70021fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
70031fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
700433dca39fSAlex Elder 		goto err_out_probe;
70051fe48023SIlya Dryomov 	}
70069bb81c9bSAlex Elder 
7007da5ef6beSIlya Dryomov 	ret = rbd_dev_mapping_set(rbd_dev);
7008da5ef6beSIlya Dryomov 	if (ret)
7009da5ef6beSIlya Dryomov 		goto err_out_probe;
7010da5ef6beSIlya Dryomov 
7011f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev) &&
701222e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
701322e8bd51SIlya Dryomov 		ret = rbd_object_map_load(rbd_dev);
701422e8bd51SIlya Dryomov 		if (ret)
701522e8bd51SIlya Dryomov 			goto err_out_probe;
701622e8bd51SIlya Dryomov 	}
701722e8bd51SIlya Dryomov 
7018e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7019e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
7020e8f59b59SIlya Dryomov 		if (ret)
7021e8f59b59SIlya Dryomov 			goto err_out_probe;
7022e8f59b59SIlya Dryomov 	}
7023e8f59b59SIlya Dryomov 
70246d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
702530d60ba2SAlex Elder 	if (ret)
702630d60ba2SAlex Elder 		goto err_out_probe;
702783a06263SAlex Elder 
702830d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
7029c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
703030d60ba2SAlex Elder 	return 0;
7031e8f59b59SIlya Dryomov 
70326fd48b3bSAlex Elder err_out_probe:
70336fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
7034b644de2bSAlex Elder err_out_watch:
70356d69bb53SIlya Dryomov 	if (!depth)
703699d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
7037332bb12dSAlex Elder err_out_format:
7038332bb12dSAlex Elder 	rbd_dev->image_format = 0;
70395655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
70405655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
70415655c4d9SAlex Elder 	return ret;
704283a06263SAlex Elder }
704383a06263SAlex Elder 
70449b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
704559c2be1eSYehuda Sadeh 			  const char *buf,
704659c2be1eSYehuda Sadeh 			  size_t count)
7047602adf40SYehuda Sadeh {
7048cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
7049dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
70504e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
7051859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
70529d3997fdSAlex Elder 	struct rbd_client *rbdc;
7053b51c83c2SIlya Dryomov 	int rc;
7054602adf40SYehuda Sadeh 
7055602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
7056602adf40SYehuda Sadeh 		return -ENODEV;
7057602adf40SYehuda Sadeh 
7058a725f65eSAlex Elder 	/* parse add command */
7059859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7060dc79b113SAlex Elder 	if (rc < 0)
7061dd5ac32dSIlya Dryomov 		goto out;
7062a725f65eSAlex Elder 
70639d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
70649d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
70659d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
70660ddebc0cSAlex Elder 		goto err_out_args;
70679d3997fdSAlex Elder 	}
7068602adf40SYehuda Sadeh 
7069602adf40SYehuda Sadeh 	/* pick the pool */
7070dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
70711fe48023SIlya Dryomov 	if (rc < 0) {
70721fe48023SIlya Dryomov 		if (rc == -ENOENT)
70731fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
7074602adf40SYehuda Sadeh 		goto err_out_client;
70751fe48023SIlya Dryomov 	}
7076859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
7077859c31dfSAlex Elder 
7078d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7079b51c83c2SIlya Dryomov 	if (!rbd_dev) {
7080b51c83c2SIlya Dryomov 		rc = -ENOMEM;
7081bd4ba655SAlex Elder 		goto err_out_client;
7082b51c83c2SIlya Dryomov 	}
7083c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
7084c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
7085d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
7086602adf40SYehuda Sadeh 
70870d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
70880d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
70890d6d1e9cSMike Christie 		rc = -ENOMEM;
70900d6d1e9cSMike Christie 		goto err_out_rbd_dev;
70910d6d1e9cSMike Christie 	}
70920d6d1e9cSMike Christie 
7093811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
70946d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
70950d6d1e9cSMike Christie 	if (rc < 0) {
70960d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
7097c53d5893SAlex Elder 		goto err_out_rbd_dev;
70980d6d1e9cSMike Christie 	}
709905fd6f6fSAlex Elder 
71007ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
7101f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev))
71029568c93eSIlya Dryomov 		rbd_dev->opts->read_only = true;
71037ce4eef7SAlex Elder 
71040c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
71050c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
71060c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
71070c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
71080c93e1b7SIlya Dryomov 	}
71090c93e1b7SIlya Dryomov 
7110b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
7111fd22aef8SIlya Dryomov 	if (rc)
71128b679ec5SIlya Dryomov 		goto err_out_image_probe;
71133abef3b3SAlex Elder 
7114e010dd0aSIlya Dryomov 	rc = rbd_add_acquire_lock(rbd_dev);
7115e010dd0aSIlya Dryomov 	if (rc)
7116637cd060SIlya Dryomov 		goto err_out_image_lock;
7117b536f69aSAlex Elder 
71185769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
71195769ed0cSIlya Dryomov 
71205769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
71215769ed0cSIlya Dryomov 	if (rc)
7122e010dd0aSIlya Dryomov 		goto err_out_image_lock;
71235769ed0cSIlya Dryomov 
71245769ed0cSIlya Dryomov 	add_disk(rbd_dev->disk);
71255769ed0cSIlya Dryomov 	/* see rbd_init_disk() */
71265769ed0cSIlya Dryomov 	blk_put_queue(rbd_dev->disk->queue);
71275769ed0cSIlya Dryomov 
71285769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
71295769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
71305769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
71315769ed0cSIlya Dryomov 
71325769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
71335769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
71345769ed0cSIlya Dryomov 		rbd_dev->header.features);
7135dd5ac32dSIlya Dryomov 	rc = count;
7136dd5ac32dSIlya Dryomov out:
7137dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
7138dd5ac32dSIlya Dryomov 	return rc;
7139b536f69aSAlex Elder 
7140e010dd0aSIlya Dryomov err_out_image_lock:
7141e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
71425769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
71438b679ec5SIlya Dryomov err_out_image_probe:
71448b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
7145c53d5893SAlex Elder err_out_rbd_dev:
7146c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
7147bd4ba655SAlex Elder err_out_client:
71489d3997fdSAlex Elder 	rbd_put_client(rbdc);
71490ddebc0cSAlex Elder err_out_args:
7150859c31dfSAlex Elder 	rbd_spec_put(spec);
7151d147543dSIlya Dryomov 	kfree(rbd_opts);
7152dd5ac32dSIlya Dryomov 	goto out;
7153602adf40SYehuda Sadeh }
7154602adf40SYehuda Sadeh 
71557e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
71569b60e70bSIlya Dryomov {
71579b60e70bSIlya Dryomov 	if (single_major)
71589b60e70bSIlya Dryomov 		return -EINVAL;
71599b60e70bSIlya Dryomov 
71609b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
71619b60e70bSIlya Dryomov }
71629b60e70bSIlya Dryomov 
71637e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
71649b60e70bSIlya Dryomov 				      size_t count)
71659b60e70bSIlya Dryomov {
71669b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
71679b60e70bSIlya Dryomov }
71689b60e70bSIlya Dryomov 
716905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
717005a46afdSAlex Elder {
7171ad945fc1SAlex Elder 	while (rbd_dev->parent) {
717205a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
717305a46afdSAlex Elder 		struct rbd_device *second = first->parent;
717405a46afdSAlex Elder 		struct rbd_device *third;
717505a46afdSAlex Elder 
717605a46afdSAlex Elder 		/*
717705a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
717805a46afdSAlex Elder 		 * remove it.
717905a46afdSAlex Elder 		 */
718005a46afdSAlex Elder 		while (second && (third = second->parent)) {
718105a46afdSAlex Elder 			first = second;
718205a46afdSAlex Elder 			second = third;
718305a46afdSAlex Elder 		}
7184ad945fc1SAlex Elder 		rbd_assert(second);
71858ad42cd0SAlex Elder 		rbd_dev_image_release(second);
71868b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
7187ad945fc1SAlex Elder 		first->parent = NULL;
7188ad945fc1SAlex Elder 		first->parent_overlap = 0;
7189ad945fc1SAlex Elder 
7190ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
719105a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
719205a46afdSAlex Elder 		first->parent_spec = NULL;
719305a46afdSAlex Elder 	}
719405a46afdSAlex Elder }
719505a46afdSAlex Elder 
71969b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
7197602adf40SYehuda Sadeh 			     const char *buf,
7198602adf40SYehuda Sadeh 			     size_t count)
7199602adf40SYehuda Sadeh {
7200602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
7201751cc0e3SAlex Elder 	struct list_head *tmp;
7202751cc0e3SAlex Elder 	int dev_id;
72030276dca6SMike Christie 	char opt_buf[6];
72040276dca6SMike Christie 	bool force = false;
72050d8189e1SAlex Elder 	int ret;
7206602adf40SYehuda Sadeh 
72070276dca6SMike Christie 	dev_id = -1;
72080276dca6SMike Christie 	opt_buf[0] = '\0';
72090276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
72100276dca6SMike Christie 	if (dev_id < 0) {
72110276dca6SMike Christie 		pr_err("dev_id out of range\n");
7212602adf40SYehuda Sadeh 		return -EINVAL;
72130276dca6SMike Christie 	}
72140276dca6SMike Christie 	if (opt_buf[0] != '\0') {
72150276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
72160276dca6SMike Christie 			force = true;
72170276dca6SMike Christie 		} else {
72180276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
72190276dca6SMike Christie 			return -EINVAL;
72200276dca6SMike Christie 		}
72210276dca6SMike Christie 	}
7222602adf40SYehuda Sadeh 
7223602adf40SYehuda Sadeh 	ret = -ENOENT;
7224751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
7225751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
7226751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7227751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
7228751cc0e3SAlex Elder 			ret = 0;
7229751cc0e3SAlex Elder 			break;
7230602adf40SYehuda Sadeh 		}
7231751cc0e3SAlex Elder 	}
7232751cc0e3SAlex Elder 	if (!ret) {
7233a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
72340276dca6SMike Christie 		if (rbd_dev->open_count && !force)
723542382b70SAlex Elder 			ret = -EBUSY;
723685f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
723785f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
723885f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
7239a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
7240751cc0e3SAlex Elder 	}
7241751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
724285f5a4d6SIlya Dryomov 	if (ret)
72431ba0f1e7SAlex Elder 		return ret;
7244751cc0e3SAlex Elder 
72450276dca6SMike Christie 	if (force) {
72460276dca6SMike Christie 		/*
72470276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
72480276dca6SMike Christie 		 * IO to complete/fail.
72490276dca6SMike Christie 		 */
72500276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
72510276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
72520276dca6SMike Christie 	}
72530276dca6SMike Christie 
72545769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
72555769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
72565769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
72575769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
72585769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
7259fca27065SIlya Dryomov 
7260e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
7261dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
72628ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
72638b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
72641ba0f1e7SAlex Elder 	return count;
7265602adf40SYehuda Sadeh }
7266602adf40SYehuda Sadeh 
72677e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
72689b60e70bSIlya Dryomov {
72699b60e70bSIlya Dryomov 	if (single_major)
72709b60e70bSIlya Dryomov 		return -EINVAL;
72719b60e70bSIlya Dryomov 
72729b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
72739b60e70bSIlya Dryomov }
72749b60e70bSIlya Dryomov 
72757e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
72769b60e70bSIlya Dryomov 					 size_t count)
72779b60e70bSIlya Dryomov {
72789b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
72799b60e70bSIlya Dryomov }
72809b60e70bSIlya Dryomov 
7281602adf40SYehuda Sadeh /*
7282602adf40SYehuda Sadeh  * create control files in sysfs
7283dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
7284602adf40SYehuda Sadeh  */
72857d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
7286602adf40SYehuda Sadeh {
7287dfc5606dSYehuda Sadeh 	int ret;
7288602adf40SYehuda Sadeh 
7289fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
7290dfc5606dSYehuda Sadeh 	if (ret < 0)
7291dfc5606dSYehuda Sadeh 		return ret;
7292602adf40SYehuda Sadeh 
7293fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
7294fed4c143SAlex Elder 	if (ret < 0)
7295fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
7296602adf40SYehuda Sadeh 
7297602adf40SYehuda Sadeh 	return ret;
7298602adf40SYehuda Sadeh }
7299602adf40SYehuda Sadeh 
73007d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
7301602adf40SYehuda Sadeh {
7302dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
7303fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
7304602adf40SYehuda Sadeh }
7305602adf40SYehuda Sadeh 
73067d8dc534SChengguang Xu static int __init rbd_slab_init(void)
73071c2a9dfeSAlex Elder {
73081c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
730903d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7310868311b1SAlex Elder 	if (!rbd_img_request_cache)
7311868311b1SAlex Elder 		return -ENOMEM;
7312868311b1SAlex Elder 
7313868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
731403d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
731578c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
731678c2a44aSAlex Elder 		goto out_err;
731778c2a44aSAlex Elder 
73181c2a9dfeSAlex Elder 	return 0;
73191c2a9dfeSAlex Elder 
73206c696d85SIlya Dryomov out_err:
7321868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
7322868311b1SAlex Elder 	rbd_img_request_cache = NULL;
73231c2a9dfeSAlex Elder 	return -ENOMEM;
73241c2a9dfeSAlex Elder }
73251c2a9dfeSAlex Elder 
73261c2a9dfeSAlex Elder static void rbd_slab_exit(void)
73271c2a9dfeSAlex Elder {
7328868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
7329868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
7330868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
7331868311b1SAlex Elder 
73321c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
73331c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
73341c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
73351c2a9dfeSAlex Elder }
73361c2a9dfeSAlex Elder 
7337cc344fa1SAlex Elder static int __init rbd_init(void)
7338602adf40SYehuda Sadeh {
7339602adf40SYehuda Sadeh 	int rc;
7340602adf40SYehuda Sadeh 
73411e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
73421e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
73431e32d34cSAlex Elder 		return -EINVAL;
73441e32d34cSAlex Elder 	}
7345e1b4d96dSIlya Dryomov 
73461c2a9dfeSAlex Elder 	rc = rbd_slab_init();
7347602adf40SYehuda Sadeh 	if (rc)
7348602adf40SYehuda Sadeh 		return rc;
7349e1b4d96dSIlya Dryomov 
7350f5ee37bdSIlya Dryomov 	/*
7351f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
7352f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
7353f5ee37bdSIlya Dryomov 	 */
7354f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7355f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
7356f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
7357f5ee37bdSIlya Dryomov 		goto err_out_slab;
7358f5ee37bdSIlya Dryomov 	}
7359f5ee37bdSIlya Dryomov 
73609b60e70bSIlya Dryomov 	if (single_major) {
73619b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
73629b60e70bSIlya Dryomov 		if (rbd_major < 0) {
73639b60e70bSIlya Dryomov 			rc = rbd_major;
7364f5ee37bdSIlya Dryomov 			goto err_out_wq;
73659b60e70bSIlya Dryomov 		}
73669b60e70bSIlya Dryomov 	}
73679b60e70bSIlya Dryomov 
73681c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
73691c2a9dfeSAlex Elder 	if (rc)
73709b60e70bSIlya Dryomov 		goto err_out_blkdev;
73711c2a9dfeSAlex Elder 
73729b60e70bSIlya Dryomov 	if (single_major)
73739b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
73749b60e70bSIlya Dryomov 	else
7375e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
73769b60e70bSIlya Dryomov 
7377e1b4d96dSIlya Dryomov 	return 0;
7378e1b4d96dSIlya Dryomov 
73799b60e70bSIlya Dryomov err_out_blkdev:
73809b60e70bSIlya Dryomov 	if (single_major)
73819b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7382f5ee37bdSIlya Dryomov err_out_wq:
7383f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
7384e1b4d96dSIlya Dryomov err_out_slab:
7385e1b4d96dSIlya Dryomov 	rbd_slab_exit();
73861c2a9dfeSAlex Elder 	return rc;
7387602adf40SYehuda Sadeh }
7388602adf40SYehuda Sadeh 
7389cc344fa1SAlex Elder static void __exit rbd_exit(void)
7390602adf40SYehuda Sadeh {
7391ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
7392602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
73939b60e70bSIlya Dryomov 	if (single_major)
73949b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7395f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
73961c2a9dfeSAlex Elder 	rbd_slab_exit();
7397602adf40SYehuda Sadeh }
7398602adf40SYehuda Sadeh 
7399602adf40SYehuda Sadeh module_init(rbd_init);
7400602adf40SYehuda Sadeh module_exit(rbd_exit);
7401602adf40SYehuda Sadeh 
7402d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7403602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7404602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7405602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
7406602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7407602adf40SYehuda Sadeh 
740890da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7409602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
7410