xref: /openbmc/linux/drivers/block/rbd.c (revision 801474ea)
1e2a58ee5SAlex Elder 
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30602adf40SYehuda Sadeh 
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>

#include "rbd_types.h"
51602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
atomic_inc_return_safe(atomic_t * v)60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
atomic_dec_return_safe(atomic_t * v)74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* dev_id <-> minor conversion shifts by this many bits */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* NAME_MAX less the prefix length (sizeof counts the trailing NUL) */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
/* bit 6 is not defined here */
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh  */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
148849b4260SAlex Elder 	char *object_prefix;
149602adf40SYehuda Sadeh 	__u8 obj_order;
150f35a4deeSAlex Elder 	u64 stripe_unit;
151f35a4deeSAlex Elder 	u64 stripe_count;
1527e97332eSIlya Dryomov 	s64 data_pool_id;
153f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
154602adf40SYehuda Sadeh 
155f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder 	u64 image_size;
157f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
159f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh 
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder  * An rbd image specification.
1640d7dbfceSAlex Elder  *
1650d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
170c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
172c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder  * is shared between the parent and child).
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder  *
184c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder  * could be a null pointer).
1860d7dbfceSAlex Elder  */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder 	u64		pool_id;
189ecb4dc22SAlex Elder 	const char	*pool_name;
190b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1910d7dbfceSAlex Elder 
192ecb4dc22SAlex Elder 	const char	*image_id;
193ecb4dc22SAlex Elder 	const char	*image_name;
1940d7dbfceSAlex Elder 
1950d7dbfceSAlex Elder 	u64		snap_id;
196ecb4dc22SAlex Elder 	const char	*snap_name;
1970d7dbfceSAlex Elder 
1980d7dbfceSAlex Elder 	struct kref	kref;
1990d7dbfceSAlex Elder };
2000d7dbfceSAlex Elder 
201602adf40SYehuda Sadeh /*
202f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
203602adf40SYehuda Sadeh  */
204602adf40SYehuda Sadeh struct rbd_client {
205602adf40SYehuda Sadeh 	struct ceph_client	*client;
206602adf40SYehuda Sadeh 	struct kref		kref;
207602adf40SYehuda Sadeh 	struct list_head	node;
208602adf40SYehuda Sadeh };
209602adf40SYehuda Sadeh 
/* Aggregated outcome of a set of outstanding sub-operations. */
struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};
2140192ce2eSIlya Dryomov 
215bf0d5f50SAlex Elder struct rbd_img_request;
216bf0d5f50SAlex Elder 
/* How the data for an object request is supplied; values start at 1. */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};
223bf0d5f50SAlex Elder 
/* Operation carried by an image request; values start at 1. */
enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};
2306d2940c8SGuangliang Zhao 
/* Bits for rbd_obj_request.flags */
#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
2360ad5d953SIlya Dryomov 
/* States stored in rbd_obj_request.read_state (for reads). */
enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};
242a9b67e69SIlya Dryomov 
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 *
 * NOTE(review): the state names used in the diagram above (_GUARD,
 * _READ_FROM_PARENT, _COPYUP_EMPTY_SNAPC, _COPYUP_OPS, _FLAT) do not
 * appear in the enum below; the diagram looks stale relative to the
 * current states -- confirm and refresh.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};
276793333a3SIlya Dryomov 
/* States stored in rbd_obj_request.copyup_state. */
enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
285926f9b3fSAlex Elder 
286bf0d5f50SAlex Elder struct rbd_obj_request {
28743df3d35SIlya Dryomov 	struct ceph_object_extent ex;
2880ad5d953SIlya Dryomov 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
289c5b5ef6cSAlex Elder 	union {
290a9b67e69SIlya Dryomov 		enum rbd_obj_read_state	 read_state;	/* for reads */
2913da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2923da691bfSIlya Dryomov 	};
293bf0d5f50SAlex Elder 
294bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
29586bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
29686bd7998SIlya Dryomov 	u32			num_img_extents;
297bf0d5f50SAlex Elder 
298788e2df3SAlex Elder 	union {
2995359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
300788e2df3SAlex Elder 		struct {
3017e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
3027e07efb1SIlya Dryomov 			u32			bvec_count;
303afb97888SIlya Dryomov 			u32			bvec_idx;
304788e2df3SAlex Elder 		};
305788e2df3SAlex Elder 	};
306793333a3SIlya Dryomov 
307793333a3SIlya Dryomov 	enum rbd_obj_copyup_state copyup_state;
3087e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
3097e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
310bf0d5f50SAlex Elder 
311bcbab1dbSIlya Dryomov 	struct list_head	osd_reqs;	/* w/ r_private_item */
312bf0d5f50SAlex Elder 
31385b5e6d1SIlya Dryomov 	struct mutex		state_mutex;
314793333a3SIlya Dryomov 	struct pending_result	pending;
315bf0d5f50SAlex Elder 	struct kref		kref;
316bf0d5f50SAlex Elder };
317bf0d5f50SAlex Elder 
/* Bit numbers for rbd_img_request.flags */
enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
3220c425248SAlex Elder 
/* States stored in rbd_img_request.state; values start at 1. */
enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};
3290192ce2eSIlya Dryomov 
330bf0d5f50SAlex Elder struct rbd_img_request {
331bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
3329bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
333ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
3340c425248SAlex Elder 	unsigned long		flags;
3350192ce2eSIlya Dryomov 	enum rbd_img_state	state;
336bf0d5f50SAlex Elder 	union {
337bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3389849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3399849e986SAlex Elder 	};
3409849e986SAlex Elder 	struct rbd_obj_request	*obj_request;	/* obj req initiator */
341bf0d5f50SAlex Elder 
342e1fddc8fSIlya Dryomov 	struct list_head	lock_item;
34343df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
344bf0d5f50SAlex Elder 
3450192ce2eSIlya Dryomov 	struct mutex		state_mutex;
3460192ce2eSIlya Dryomov 	struct pending_result	pending;
3470192ce2eSIlya Dryomov 	struct work_struct	work;
3480192ce2eSIlya Dryomov 	int			work_result;
349bf0d5f50SAlex Elder };
350bf0d5f50SAlex Elder 
/*
 * Walk the object requests linked on an image request's object_extents
 * list.  The _safe variant takes an extra cursor, n, and expands to
 * list_for_each_entry_safe().
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
355bf0d5f50SAlex Elder 
/* Values for rbd_device.watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
36199d16943SIlya Dryomov 
/* Values for rbd_device.lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
367ed95b21aSIlya Dryomov 
368ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
369ed95b21aSIlya Dryomov struct rbd_client_id {
370ed95b21aSIlya Dryomov 	u64 gid;
371ed95b21aSIlya Dryomov 	u64 handle;
372ed95b21aSIlya Dryomov };
373ed95b21aSIlya Dryomov 
374f84344f3SAlex Elder struct rbd_mapping {
37599c1f08fSAlex Elder 	u64                     size;
376f84344f3SAlex Elder };
377f84344f3SAlex Elder 
378602adf40SYehuda Sadeh /*
379602adf40SYehuda Sadeh  * a single device
380602adf40SYehuda Sadeh  */
381602adf40SYehuda Sadeh struct rbd_device {
382de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
383602adf40SYehuda Sadeh 
384602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
385dd82fff1SIlya Dryomov 	int			minor;
386602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
387602adf40SYehuda Sadeh 
388a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
389602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
390602adf40SYehuda Sadeh 
391602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
392602adf40SYehuda Sadeh 
393b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
394602adf40SYehuda Sadeh 
395602adf40SYehuda Sadeh 	struct rbd_image_header	header;
396b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3970d7dbfceSAlex Elder 	struct rbd_spec		*spec;
398d147543dSIlya Dryomov 	struct rbd_options	*opts;
3990d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
400602adf40SYehuda Sadeh 
401c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
402922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
403971f839aSAlex Elder 
4041643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
4050903e875SAlex Elder 
40699d16943SIlya Dryomov 	struct mutex		watch_mutex;
40799d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
408922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
40999d16943SIlya Dryomov 	u64			watch_cookie;
41099d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
41159c2be1eSYehuda Sadeh 
412ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
413ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
414cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
415ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
416ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
417ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
418ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
419ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
420e1fddc8fSIlya Dryomov 	spinlock_t		lock_lists_lock;
421637cd060SIlya Dryomov 	struct list_head	acquiring_list;
422e1fddc8fSIlya Dryomov 	struct list_head	running_list;
423637cd060SIlya Dryomov 	struct completion	acquire_wait;
424637cd060SIlya Dryomov 	int			acquire_err;
425e1fddc8fSIlya Dryomov 	struct completion	releasing_wait;
426ed95b21aSIlya Dryomov 
42722e8bd51SIlya Dryomov 	spinlock_t		object_map_lock;
42822e8bd51SIlya Dryomov 	u8			*object_map;
42922e8bd51SIlya Dryomov 	u64			object_map_size;	/* in objects */
43022e8bd51SIlya Dryomov 	u64			object_map_flags;
431602adf40SYehuda Sadeh 
4321643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
433602adf40SYehuda Sadeh 
43486b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
43586b00e0dSAlex Elder 	u64			parent_overlap;
436a2acd00eSAlex Elder 	atomic_t		parent_ref;
4372f82ee54SAlex Elder 	struct rbd_device	*parent;
43886b00e0dSAlex Elder 
4397ad18afaSChristoph Hellwig 	/* Block layer tags. */
4407ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4417ad18afaSChristoph Hellwig 
442c666601aSJosh Durgin 	/* protects updating the header */
443c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
444f84344f3SAlex Elder 
445f84344f3SAlex Elder 	struct rbd_mapping	mapping;
446602adf40SYehuda Sadeh 
447602adf40SYehuda Sadeh 	struct list_head	node;
448dfc5606dSYehuda Sadeh 
449dfc5606dSYehuda Sadeh 	/* sysfs related */
450dfc5606dSYehuda Sadeh 	struct device		dev;
451b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
452dfc5606dSYehuda Sadeh };
453dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,  /* -o ro or snapshot */
};
4646d292906SAlex Elder 
465cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
466e124a82fSAlex Elder 
467602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
468e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
469e124a82fSAlex Elder 
470602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
471432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
472602adf40SYehuda Sadeh 
47378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
47478c2a44aSAlex Elder 
4751c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
476868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4771c2a9dfeSAlex Elder 
4789b60e70bSIlya Dryomov static int rbd_major;
479f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
480f8a22fc2SIlya Dryomov 
481f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
482f5ee37bdSIlya Dryomov 
48389a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
48489a59c1cSIlya Dryomov 	.nref = REFCOUNT_INIT(1),
48589a59c1cSIlya Dryomov };
48689a59c1cSIlya Dryomov 
4879b60e70bSIlya Dryomov /*
4883cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4899b60e70bSIlya Dryomov  */
4903cfa3b16SIlya Dryomov static bool single_major = true;
4915657a819SJoe Perches module_param(single_major, bool, 0444);
4923cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4939b60e70bSIlya Dryomov 
49475cff725SGreg Kroah-Hartman static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count);
49575cff725SGreg Kroah-Hartman static ssize_t remove_store(const struct bus_type *bus, const char *buf,
496f0f8cef5SAlex Elder 			    size_t count);
49775cff725SGreg Kroah-Hartman static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
498f0f8cef5SAlex Elder 				      size_t count);
49975cff725SGreg Kroah-Hartman static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
5009b60e70bSIlya Dryomov 					 size_t count);
5016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
502f0f8cef5SAlex Elder 
rbd_dev_id_to_minor(int dev_id)5039b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
5049b60e70bSIlya Dryomov {
5057e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
5069b60e70bSIlya Dryomov }
5079b60e70bSIlya Dryomov 
minor_to_rbd_dev_id(int minor)5089b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
5099b60e70bSIlya Dryomov {
5107e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
5119b60e70bSIlya Dryomov }
5129b60e70bSIlya Dryomov 
rbd_is_ro(struct rbd_device * rbd_dev)51339258aa2SIlya Dryomov static bool rbd_is_ro(struct rbd_device *rbd_dev)
51439258aa2SIlya Dryomov {
51539258aa2SIlya Dryomov 	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
51639258aa2SIlya Dryomov }
51739258aa2SIlya Dryomov 
rbd_is_snap(struct rbd_device * rbd_dev)518f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev)
519f3c0e459SIlya Dryomov {
520f3c0e459SIlya Dryomov 	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
521f3c0e459SIlya Dryomov }
522f3c0e459SIlya Dryomov 
__rbd_is_lock_owner(struct rbd_device * rbd_dev)523ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
524ed95b21aSIlya Dryomov {
525637cd060SIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
526637cd060SIlya Dryomov 
527ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
528ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
529ed95b21aSIlya Dryomov }
530ed95b21aSIlya Dryomov 
rbd_is_lock_owner(struct rbd_device * rbd_dev)531ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
532ed95b21aSIlya Dryomov {
533ed95b21aSIlya Dryomov 	bool is_lock_owner;
534ed95b21aSIlya Dryomov 
535ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
536ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
537ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
538ed95b21aSIlya Dryomov 	return is_lock_owner;
539ed95b21aSIlya Dryomov }
540ed95b21aSIlya Dryomov 
supported_features_show(const struct bus_type * bus,char * buf)54175cff725SGreg Kroah-Hartman static ssize_t supported_features_show(const struct bus_type *bus, char *buf)
5428767b293SIlya Dryomov {
5438767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5448767b293SIlya Dryomov }
5458767b293SIlya Dryomov 
5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add);
5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove);
5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major);
5497e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major);
5507e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features);
551b15a21ddSGreg Kroah-Hartman 
552b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
553b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
554b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5559b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5569b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5578767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
558b15a21ddSGreg Kroah-Hartman 	NULL,
559f0f8cef5SAlex Elder };
56092c76dc0SIlya Dryomov 
rbd_bus_is_visible(struct kobject * kobj,struct attribute * attr,int index)56192c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
56292c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
56392c76dc0SIlya Dryomov {
5649b60e70bSIlya Dryomov 	if (!single_major &&
5659b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5669b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5679b60e70bSIlya Dryomov 		return 0;
5689b60e70bSIlya Dryomov 
56992c76dc0SIlya Dryomov 	return attr->mode;
57092c76dc0SIlya Dryomov }
57192c76dc0SIlya Dryomov 
57292c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
57392c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
57492c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
57592c76dc0SIlya Dryomov };
57692c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
577f0f8cef5SAlex Elder 
578f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
579f0f8cef5SAlex Elder 	.name		= "rbd",
580b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
581f0f8cef5SAlex Elder };
582f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally empty: that device
 * is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
586f0f8cef5SAlex Elder 
587f0f8cef5SAlex Elder static struct device rbd_root_dev = {
588f0f8cef5SAlex Elder 	.init_name =    "rbd",
589f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
590f0f8cef5SAlex Elder };
591f0f8cef5SAlex Elder 
59206ecc6cbSAlex Elder static __printf(2, 3)
rbd_warn(struct rbd_device * rbd_dev,const char * fmt,...)59306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
59406ecc6cbSAlex Elder {
59506ecc6cbSAlex Elder 	struct va_format vaf;
59606ecc6cbSAlex Elder 	va_list args;
59706ecc6cbSAlex Elder 
59806ecc6cbSAlex Elder 	va_start(args, fmt);
59906ecc6cbSAlex Elder 	vaf.fmt = fmt;
60006ecc6cbSAlex Elder 	vaf.va = &args;
60106ecc6cbSAlex Elder 
60206ecc6cbSAlex Elder 	if (!rbd_dev)
60306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
60406ecc6cbSAlex Elder 	else if (rbd_dev->disk)
60506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
60606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
60706ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
60806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
60906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
61006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
61106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
61206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
61306ecc6cbSAlex Elder 	else	/* punt */
61406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
61506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
61606ecc6cbSAlex Elder 	va_end(args);
61706ecc6cbSAlex Elder }
61806ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - debugging assertion.
 *
 * With RBD_DEBUG defined, a false @expr logs the enclosing function,
 * the line number and the text of the expression at KERN_ERR level
 * and then calls BUG().  Without RBD_DEBUG the macro expands to a
 * no-op and @expr is NOT evaluated, so @expr must not have side
 * effects that the surrounding code relies on.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: "if (x) rbd_assert(y); else ..." pairs the
 * else with the outer if rather than with the macro's internal if.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
631dfc5606dSYehuda Sadeh 
63205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
6338b3e1a56SAlex Elder 
634cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
635510a7330SIlya Dryomov static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
636510a7330SIlya Dryomov 				     struct rbd_image_header *header);
63754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
63854cac61fSAlex Elder 					u64 snap_id);
6392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
6402ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
64122e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
64259c2be1eSYehuda Sadeh 
64354ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
6440192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
6450192ce2eSIlya Dryomov 
6460192ce2eSIlya Dryomov /*
6470192ce2eSIlya Dryomov  * Return true if nothing else is pending.
6480192ce2eSIlya Dryomov  */
pending_result_dec(struct pending_result * pending,int * result)6490192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result)
6500192ce2eSIlya Dryomov {
6510192ce2eSIlya Dryomov 	rbd_assert(pending->num_pending > 0);
6520192ce2eSIlya Dryomov 
6530192ce2eSIlya Dryomov 	if (*result && !pending->result)
6540192ce2eSIlya Dryomov 		pending->result = *result;
6550192ce2eSIlya Dryomov 	if (--pending->num_pending)
6560192ce2eSIlya Dryomov 		return false;
6570192ce2eSIlya Dryomov 
6580192ce2eSIlya Dryomov 	*result = pending->result;
6590192ce2eSIlya Dryomov 	return true;
6600192ce2eSIlya Dryomov }
661602adf40SYehuda Sadeh 
/*
 * block_device_operations ->open handler.
 *
 * Under rbd_dev->lock, refuses the open with -ENOENT if the device is
 * flagged RBD_DEV_FLAG_REMOVING; otherwise bumps open_count.  On
 * success a reference on the rbd device is taken, which rbd_release()
 * drops again.
 */
static int rbd_open(struct gendisk *disk, blk_mode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	bool removing;

	spin_lock_irq(&rbd_dev->lock);
	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	if (!removing)
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);

	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}
680602adf40SYehuda Sadeh 
rbd_release(struct gendisk * disk)681ae220766SChristoph Hellwig static void rbd_release(struct gendisk *disk)
682dfc5606dSYehuda Sadeh {
683dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
684b82d167bSAlex Elder 	unsigned long open_count_before;
685b82d167bSAlex Elder 
686a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
687b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
688a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
689b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
690dfc5606dSYehuda Sadeh 
691c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
692dfc5606dSYehuda Sadeh }
693dfc5606dSYehuda Sadeh 
694602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
695602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
696602adf40SYehuda Sadeh 	.open			= rbd_open,
697dfc5606dSYehuda Sadeh 	.release		= rbd_release,
698602adf40SYehuda Sadeh };
699602adf40SYehuda Sadeh 
700602adf40SYehuda Sadeh /*
7017262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
702cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
703602adf40SYehuda Sadeh  */
rbd_client_create(struct ceph_options * ceph_opts)704f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
705602adf40SYehuda Sadeh {
706602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
707602adf40SYehuda Sadeh 	int ret = -ENOMEM;
708602adf40SYehuda Sadeh 
70937206ee5SAlex Elder 	dout("%s:\n", __func__);
710602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
711602adf40SYehuda Sadeh 	if (!rbdc)
712602adf40SYehuda Sadeh 		goto out_opt;
713602adf40SYehuda Sadeh 
714602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
715602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
716602adf40SYehuda Sadeh 
71774da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
718602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
71908f75463SAlex Elder 		goto out_rbdc;
72043ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
721602adf40SYehuda Sadeh 
722602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
723602adf40SYehuda Sadeh 	if (ret < 0)
72408f75463SAlex Elder 		goto out_client;
725602adf40SYehuda Sadeh 
726432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
727602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
728432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
729602adf40SYehuda Sadeh 
73037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
731bc534d86SAlex Elder 
732602adf40SYehuda Sadeh 	return rbdc;
73308f75463SAlex Elder out_client:
734602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
73508f75463SAlex Elder out_rbdc:
736602adf40SYehuda Sadeh 	kfree(rbdc);
737602adf40SYehuda Sadeh out_opt:
73843ae4701SAlex Elder 	if (ceph_opts)
73943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
74037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
74137206ee5SAlex Elder 
74228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
743602adf40SYehuda Sadeh }
744602adf40SYehuda Sadeh 
__rbd_get_client(struct rbd_client * rbdc)7452f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7462f82ee54SAlex Elder {
7472f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7482f82ee54SAlex Elder 
7492f82ee54SAlex Elder 	return rbdc;
7502f82ee54SAlex Elder }
7512f82ee54SAlex Elder 
752602adf40SYehuda Sadeh /*
7531f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7541f7ba331SAlex Elder  * found, bump its reference count.
755602adf40SYehuda Sadeh  */
rbd_client_find(struct ceph_options * ceph_opts)7561f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
757602adf40SYehuda Sadeh {
7583302ffd4SJakob Koschel 	struct rbd_client *rbdc = NULL, *iter;
759602adf40SYehuda Sadeh 
76043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
761602adf40SYehuda Sadeh 		return NULL;
762602adf40SYehuda Sadeh 
7631f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7643302ffd4SJakob Koschel 	list_for_each_entry(iter, &rbd_client_list, node) {
7653302ffd4SJakob Koschel 		if (!ceph_compare_options(ceph_opts, iter->client)) {
7663302ffd4SJakob Koschel 			__rbd_get_client(iter);
7672f82ee54SAlex Elder 
7683302ffd4SJakob Koschel 			rbdc = iter;
7691f7ba331SAlex Elder 			break;
7701f7ba331SAlex Elder 		}
7711f7ba331SAlex Elder 	}
7721f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7731f7ba331SAlex Elder 
7743302ffd4SJakob Koschel 	return rbdc;
775602adf40SYehuda Sadeh }
776602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options.
 *
 * The tokens are grouped by argument kind; the marker comments
 * close each group.
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */

	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */

	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};
79459c2be1eSYehuda Sadeh 
/* Values accepted by the "compression_hint" map option. */
enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};
800dc1dad8eSIlya Dryomov 
801dc1dad8eSIlya Dryomov static const struct constant_table rbd_param_compression_hint[] = {
802dc1dad8eSIlya Dryomov 	{"none",		Opt_compression_hint_none},
803dc1dad8eSIlya Dryomov 	{"compressible",	Opt_compression_hint_compressible},
804dc1dad8eSIlya Dryomov 	{"incompressible",	Opt_compression_hint_incompressible},
805dc1dad8eSIlya Dryomov 	{}
806dc1dad8eSIlya Dryomov };
807dc1dad8eSIlya Dryomov 
808d7167b14SAl Viro static const struct fs_parameter_spec rbd_parameters[] = {
80982995cc6SDavid Howells 	fsparam_u32	("alloc_size",			Opt_alloc_size),
810dc1dad8eSIlya Dryomov 	fsparam_enum	("compression_hint",		Opt_compression_hint,
811dc1dad8eSIlya Dryomov 			 rbd_param_compression_hint),
81282995cc6SDavid Howells 	fsparam_flag	("exclusive",			Opt_exclusive),
81382995cc6SDavid Howells 	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
81482995cc6SDavid Howells 	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
81582995cc6SDavid Howells 	fsparam_flag	("notrim",			Opt_notrim),
81682995cc6SDavid Howells 	fsparam_string	("_pool_ns",			Opt_pool_ns),
81782995cc6SDavid Howells 	fsparam_u32	("queue_depth",			Opt_queue_depth),
81882995cc6SDavid Howells 	fsparam_flag	("read_only",			Opt_read_only),
81982995cc6SDavid Howells 	fsparam_flag	("read_write",			Opt_read_write),
82082995cc6SDavid Howells 	fsparam_flag	("ro",				Opt_read_only),
82182995cc6SDavid Howells 	fsparam_flag	("rw",				Opt_read_write),
82282995cc6SDavid Howells 	{}
82382995cc6SDavid Howells };
82482995cc6SDavid Howells 
82598571b5aSAlex Elder struct rbd_options {
826b5584180SIlya Dryomov 	int	queue_depth;
8270c93e1b7SIlya Dryomov 	int	alloc_size;
82834f55d0bSDongsheng Yang 	unsigned long	lock_timeout;
82998571b5aSAlex Elder 	bool	read_only;
83080de1912SIlya Dryomov 	bool	lock_on_read;
831e010dd0aSIlya Dryomov 	bool	exclusive;
832d9360540SIlya Dryomov 	bool	trim;
833dc1dad8eSIlya Dryomov 
834dc1dad8eSIlya Dryomov 	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
83598571b5aSAlex Elder };
83698571b5aSAlex Elder 
/* Default values for the fields of struct rbd_options. */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_DEFAULT_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
84498571b5aSAlex Elder 
/*
 * Bundle of the three objects handled together during option parsing
 * (presumably the destinations that parsed values are written to;
 * the parsing code is outside this part of the file).
 */
struct rbd_parse_opts_ctx {
	struct rbd_spec *spec;
	struct ceph_options *copts;
	struct rbd_options *opts;
};
850c300156bSIlya Dryomov 
obj_op_name(enum obj_operation_type op_type)8516d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8526d2940c8SGuangliang Zhao {
8536d2940c8SGuangliang Zhao 	switch (op_type) {
8546d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8556d2940c8SGuangliang Zhao 		return "read";
8566d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8576d2940c8SGuangliang Zhao 		return "write";
85890e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
85990e98c52SGuangliang Zhao 		return "discard";
8606484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
8616484cbe9SIlya Dryomov 		return "zeroout";
8626d2940c8SGuangliang Zhao 	default:
8636d2940c8SGuangliang Zhao 		return "???";
8646d2940c8SGuangliang Zhao 	}
8656d2940c8SGuangliang Zhao }
8666d2940c8SGuangliang Zhao 
86759c2be1eSYehuda Sadeh /*
868602adf40SYehuda Sadeh  * Destroy ceph client
869d23a4b3fSAlex Elder  *
870432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
871602adf40SYehuda Sadeh  */
rbd_client_release(struct kref * kref)872602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
873602adf40SYehuda Sadeh {
874602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
875602adf40SYehuda Sadeh 
87637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
877cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
878602adf40SYehuda Sadeh 	list_del(&rbdc->node);
879cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
880602adf40SYehuda Sadeh 
881602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
882602adf40SYehuda Sadeh 	kfree(rbdc);
883602adf40SYehuda Sadeh }
884602adf40SYehuda Sadeh 
885602adf40SYehuda Sadeh /*
886602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
887602adf40SYehuda Sadeh  * it.
888602adf40SYehuda Sadeh  */
rbd_put_client(struct rbd_client * rbdc)8899d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
890602adf40SYehuda Sadeh {
891c53d5893SAlex Elder 	if (rbdc)
8929d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
893602adf40SYehuda Sadeh }
894602adf40SYehuda Sadeh 
8955feb0d8dSIlya Dryomov /*
8965feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
8975feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
8985feb0d8dSIlya Dryomov  * function.
8995feb0d8dSIlya Dryomov  */
rbd_get_client(struct ceph_options * ceph_opts)9005feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9015feb0d8dSIlya Dryomov {
9025feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
903dd435855SIlya Dryomov 	int ret;
9045feb0d8dSIlya Dryomov 
905a32e4143SIlya Dryomov 	mutex_lock(&client_mutex);
9065feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
907dd435855SIlya Dryomov 	if (rbdc) {
9085feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
909dd435855SIlya Dryomov 
910dd435855SIlya Dryomov 		/*
911dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
912dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
913dd435855SIlya Dryomov 		 */
9149d4a227fSIlya Dryomov 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
9159d4a227fSIlya Dryomov 					rbdc->client->options->mount_timeout);
916dd435855SIlya Dryomov 		if (ret) {
917dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
918dd435855SIlya Dryomov 			rbd_put_client(rbdc);
919dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
920dd435855SIlya Dryomov 		}
921dd435855SIlya Dryomov 	} else {
9225feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
923dd435855SIlya Dryomov 	}
9245feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
9255feb0d8dSIlya Dryomov 
9265feb0d8dSIlya Dryomov 	return rbdc;
9275feb0d8dSIlya Dryomov }
9285feb0d8dSIlya Dryomov 
rbd_image_format_valid(u32 image_format)929a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
930a30b71b9SAlex Elder {
931a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
932a30b71b9SAlex Elder }
933a30b71b9SAlex Elder 
rbd_dev_ondisk_valid(struct rbd_image_header_ondisk * ondisk)9348e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9358e94af8eSAlex Elder {
936103a150fSAlex Elder 	size_t size;
937103a150fSAlex Elder 	u32 snap_count;
938103a150fSAlex Elder 
939103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
940103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
941103a150fSAlex Elder 		return false;
942103a150fSAlex Elder 
943db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
944db2388b6SAlex Elder 
945db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
946db2388b6SAlex Elder 		return false;
947db2388b6SAlex Elder 
948db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
949db2388b6SAlex Elder 
950db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
951db2388b6SAlex Elder 		return false;
952db2388b6SAlex Elder 
953103a150fSAlex Elder 	/*
954103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
955103a150fSAlex Elder 	 * that limits the number of snapshots.
956103a150fSAlex Elder 	 */
957103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
958103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
959103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
960103a150fSAlex Elder 		return false;
961103a150fSAlex Elder 
962103a150fSAlex Elder 	/*
963103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
964103a150fSAlex Elder 	 * header must also be representable in a size_t.
965103a150fSAlex Elder 	 */
966103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
967103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
968103a150fSAlex Elder 		return false;
969103a150fSAlex Elder 
970103a150fSAlex Elder 	return true;
9718e94af8eSAlex Elder }
9728e94af8eSAlex Elder 
973602adf40SYehuda Sadeh /*
9745bc3fb17SIlya Dryomov  * returns the size of an object in the image
9755bc3fb17SIlya Dryomov  */
rbd_obj_bytes(struct rbd_image_header * header)9765bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9775bc3fb17SIlya Dryomov {
9785bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
9795bc3fb17SIlya Dryomov }
9805bc3fb17SIlya Dryomov 
rbd_init_layout(struct rbd_device * rbd_dev)981263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
982263423f8SIlya Dryomov {
983263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
984263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
985263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
986263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
987263423f8SIlya Dryomov 	}
988263423f8SIlya Dryomov 
989263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
990263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
991263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
9927e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
9937e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
994263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
995263423f8SIlya Dryomov }
996263423f8SIlya Dryomov 
rbd_image_header_cleanup(struct rbd_image_header * header)997510a7330SIlya Dryomov static void rbd_image_header_cleanup(struct rbd_image_header *header)
998510a7330SIlya Dryomov {
999510a7330SIlya Dryomov 	kfree(header->object_prefix);
1000510a7330SIlya Dryomov 	ceph_put_snap_context(header->snapc);
1001510a7330SIlya Dryomov 	kfree(header->snap_sizes);
1002510a7330SIlya Dryomov 	kfree(header->snap_names);
1003510a7330SIlya Dryomov 
1004510a7330SIlya Dryomov 	memset(header, 0, sizeof(*header));
1005510a7330SIlya Dryomov }
1006510a7330SIlya Dryomov 
10075bc3fb17SIlya Dryomov /*
1008bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1009bb23e37aSAlex Elder  * on-disk header.
1010602adf40SYehuda Sadeh  */
rbd_header_from_disk(struct rbd_image_header * header,struct rbd_image_header_ondisk * ondisk,bool first_time)1011510a7330SIlya Dryomov static int rbd_header_from_disk(struct rbd_image_header *header,
1012510a7330SIlya Dryomov 				struct rbd_image_header_ondisk *ondisk,
1013510a7330SIlya Dryomov 				bool first_time)
1014602adf40SYehuda Sadeh {
1015bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1016bb23e37aSAlex Elder 	char *object_prefix = NULL;
1017bb23e37aSAlex Elder 	char *snap_names = NULL;
1018bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1019ccece235SAlex Elder 	u32 snap_count;
1020bb23e37aSAlex Elder 	int ret = -ENOMEM;
1021621901d6SAlex Elder 	u32 i;
1022602adf40SYehuda Sadeh 
1023bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1024103a150fSAlex Elder 
1025bb23e37aSAlex Elder 	if (first_time) {
1026848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1027848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1028848d796cSIlya Dryomov 					 GFP_KERNEL);
1029bb23e37aSAlex Elder 		if (!object_prefix)
1030602adf40SYehuda Sadeh 			return -ENOMEM;
1031bb23e37aSAlex Elder 	}
103200f1f36fSAlex Elder 
1033bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1034d2bb24e5SAlex Elder 
1035602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1036bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1037bb23e37aSAlex Elder 	if (!snapc)
1038bb23e37aSAlex Elder 		goto out_err;
1039bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1040602adf40SYehuda Sadeh 	if (snap_count) {
1041bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1042f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1043f785cc1dSAlex Elder 
1044bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1045621901d6SAlex Elder 
1046f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1047bb23e37aSAlex Elder 			goto out_2big;
1048bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1049bb23e37aSAlex Elder 		if (!snap_names)
1050602adf40SYehuda Sadeh 			goto out_err;
1051bb23e37aSAlex Elder 
1052bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
105388a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
105488a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
105588a25a5fSMarkus Elfring 					   GFP_KERNEL);
1056bb23e37aSAlex Elder 		if (!snap_sizes)
1057bb23e37aSAlex Elder 			goto out_err;
1058bb23e37aSAlex Elder 
1059f785cc1dSAlex Elder 		/*
1060bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1061bb23e37aSAlex Elder 		 * and size.
1062bb23e37aSAlex Elder 		 *
106399a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1064bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1065f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1066f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1067f785cc1dSAlex Elder 		 */
1068bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1069bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1070bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1071bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1072bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1073bb23e37aSAlex Elder 		}
1074602adf40SYehuda Sadeh 	}
1075849b4260SAlex Elder 
1076bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1077bb23e37aSAlex Elder 
1078bb23e37aSAlex Elder 	if (first_time) {
1079bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1080602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1081bb23e37aSAlex Elder 	}
10826a52325fSAlex Elder 
1083bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1084621901d6SAlex Elder 
1085f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1086bb23e37aSAlex Elder 	header->snapc = snapc;
1087bb23e37aSAlex Elder 	header->snap_names = snap_names;
1088bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1089468521c1SAlex Elder 
1090602adf40SYehuda Sadeh 	return 0;
1091bb23e37aSAlex Elder out_2big:
1092bb23e37aSAlex Elder 	ret = -EIO;
10936a52325fSAlex Elder out_err:
1094bb23e37aSAlex Elder 	kfree(snap_sizes);
1095bb23e37aSAlex Elder 	kfree(snap_names);
1096bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1097bb23e37aSAlex Elder 	kfree(object_prefix);
1098ccece235SAlex Elder 
1099bb23e37aSAlex Elder 	return ret;
1100602adf40SYehuda Sadeh }
1101602adf40SYehuda Sadeh 
/*
 * Return a newly allocated copy of the name of the snapshot at
 * position @which in the header's packed list of NUL-terminated
 * snapshot names, or NULL if the copy cannot be allocated.  The
 * caller owns the returned string.
 */
static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *name = rbd_dev->header.snap_names;
	u32 skipped;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Step over the names that precede the one we want */
	for (skipped = 0; skipped < which; skipped++)
		name += strlen(name) + 1;

	return kstrdup(name, GFP_KERNEL);
}
11169682fc6dSAlex Elder 
111730d1cff8SAlex Elder /*
111830d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
111930d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
112030d1cff8SAlex Elder  */
snapid_compare_reverse(const void * s1,const void * s2)112130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
112230d1cff8SAlex Elder {
112330d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
112430d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
112530d1cff8SAlex Elder 
112630d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
112730d1cff8SAlex Elder 		return 1;
112830d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
112930d1cff8SAlex Elder }
113030d1cff8SAlex Elder 
113130d1cff8SAlex Elder /*
113230d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
113330d1cff8SAlex Elder  * present.
113430d1cff8SAlex Elder  *
113530d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
113630d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
113730d1cff8SAlex Elder  *
113830d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
113930d1cff8SAlex Elder  * reverse order, highest snapshot id first.
114030d1cff8SAlex Elder  */
rbd_dev_snap_index(struct rbd_device * rbd_dev,u64 snap_id)11419682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11429682fc6dSAlex Elder {
11439682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
114430d1cff8SAlex Elder 	u64 *found;
11459682fc6dSAlex Elder 
114630d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
114730d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11489682fc6dSAlex Elder 
114930d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11509682fc6dSAlex Elder }
11519682fc6dSAlex Elder 
/*
 * Look up the name of snapshot @snap_id.
 *
 * Returns a newly allocated copy of the name, ERR_PTR(-ENOENT) if
 * the id is not in the snapshot context, or ERR_PTR(-ENOMEM) if the
 * copy could not be allocated.
 */
static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	const char *name;
	u32 idx;

	idx = rbd_dev_snap_index(rbd_dev, snap_id);
	if (idx == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	name = _rbd_dev_v1_snap_name(rbd_dev, idx);
	if (!name)
		return ERR_PTR(-ENOMEM);

	return name;
}
116554cac61fSAlex Elder 
/*
 * Return the name for @snap_id.
 *
 * CEPH_NOSNAP yields the constant RBD_SNAP_HEAD_NAME.  Any other id
 * is resolved by the helper for the device's image format (1 or 2).
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	return rbd_dev->image_format == 1 ?
		rbd_dev_v1_snap_name(rbd_dev, snap_id) :
		rbd_dev_v2_snap_name(rbd_dev, snap_id);
}
11779e15b77dSAlex Elder 
/*
 * Store the size of the image as of @snap_id in *snap_size.
 *
 * CEPH_NOSNAP yields the current image size from the header.  For a
 * format 1 image the size comes from the header's snap_sizes array;
 * otherwise it is obtained through _rbd_dev_v2_snap_size().
 *
 * Returns 0 on success, -ENOENT if a format 1 snapshot id is
 * unknown, or the error returned by _rbd_dev_v2_snap_size().
 * *snap_size is only written on success.
 */
static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	u64 size = 0;
	u32 idx;
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
		return 0;
	}

	if (rbd_dev->image_format == 1) {
		idx = rbd_dev_snap_index(rbd_dev, snap_id);
		if (idx == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[idx];
		return 0;
	}

	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
	if (ret)
		return ret;

	*snap_size = size;
	return 0;
}
12042ad3d716SAlex Elder 
rbd_dev_mapping_set(struct rbd_device * rbd_dev)1205d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1206602adf40SYehuda Sadeh {
12078f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12082ad3d716SAlex Elder 	u64 size = 0;
12092ad3d716SAlex Elder 	int ret;
12108b0241f8SAlex Elder 
12112ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12122ad3d716SAlex Elder 	if (ret)
12132ad3d716SAlex Elder 		return ret;
12142ad3d716SAlex Elder 
12152ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12168b0241f8SAlex Elder 	return 0;
1217602adf40SYehuda Sadeh }
1218602adf40SYehuda Sadeh 
rbd_dev_mapping_clear(struct rbd_device * rbd_dev)1219d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1220d1cf5788SAlex Elder {
1221d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1222200a6a8bSAlex Elder }
1223200a6a8bSAlex Elder 
/*
 * Zero @bytes bytes of bio data starting @off bytes past @bio_pos.
 * The walk is done on a private copy of the iterator, so @bio_pos
 * itself is not advanced.
 */
static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter cursor = *bio_pos;

	ceph_bio_iter_advance(&cursor, off);
	/* 'bv' is supplied by the stepping macro for each segment */
	ceph_bio_iter_advance_step(&cursor, bytes, ({
		memzero_bvec(&bv);
	}));
}
1233b9434c5bSAlex Elder 
/*
 * Zero @bytes bytes of bio_vec data starting @off bytes past
 * @bvec_pos.  The walk is done on a private copy of the iterator, so
 * @bvec_pos itself is not advanced.
 */
static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter cursor = *bvec_pos;

	ceph_bvec_iter_advance(&cursor, off);
	/* 'bv' is supplied by the stepping macro for each segment */
	ceph_bvec_iter_advance_step(&cursor, bytes, ({
		memzero_bvec(&bv);
	}));
}
1243602adf40SYehuda Sadeh 
1244f7760dadSAlex Elder /*
12453da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1246afb97888SIlya Dryomov  * (private) bio_vec array.
1247f7760dadSAlex Elder  *
12483da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1249f7760dadSAlex Elder  */
rbd_obj_zero_range(struct rbd_obj_request * obj_req,u32 off,u32 bytes)12503da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
12513da691bfSIlya Dryomov 			       u32 bytes)
1252f7760dadSAlex Elder {
125354ab3b24SIlya Dryomov 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
125454ab3b24SIlya Dryomov 
1255ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
12563da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
12573da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
12583da691bfSIlya Dryomov 		break;
12593da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1260afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
12613da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
12623da691bfSIlya Dryomov 		break;
12633da691bfSIlya Dryomov 	default:
126416809372SArnd Bergmann 		BUG();
1265f5400b7aSAlex Elder 	}
1266bf0d5f50SAlex Elder }
1267bf0d5f50SAlex Elder 
1268bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
rbd_obj_request_put(struct rbd_obj_request * obj_request)1269bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1270bf0d5f50SAlex Elder {
1271bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
127237206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
12732c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1274bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1275bf0d5f50SAlex Elder }
1276bf0d5f50SAlex Elder 
rbd_img_obj_request_add(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1277bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1278bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1279bf0d5f50SAlex Elder {
128025dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
128125dcf954SAlex Elder 
1282b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1283bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
128415961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1285bf0d5f50SAlex Elder }
1286bf0d5f50SAlex Elder 
rbd_img_obj_request_del(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1287bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1288bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1289bf0d5f50SAlex Elder {
129015961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
129143df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1292bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1293bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1294bf0d5f50SAlex Elder }
1295bf0d5f50SAlex Elder 
rbd_osd_submit(struct ceph_osd_request * osd_req)1296a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1297bf0d5f50SAlex Elder {
1298a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1299980917fcSIlya Dryomov 
1300a086a1b8SIlya Dryomov 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1301a086a1b8SIlya Dryomov 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1302a086a1b8SIlya Dryomov 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1303a8af0d68SJeff Layton 	ceph_osdc_start_request(osd_req->r_osdc, osd_req);
1304bf0d5f50SAlex Elder }
1305bf0d5f50SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
img_request_layered_set(struct rbd_img_request * img_request)1311d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1312d0b2e944SAlex Elder {
1313d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1314d0b2e944SAlex Elder }
1315d0b2e944SAlex Elder 
img_request_layered_test(struct rbd_img_request * img_request)1316d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1317d0b2e944SAlex Elder {
1318d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1319d0b2e944SAlex Elder }
1320d0b2e944SAlex Elder 
rbd_obj_is_entire(struct rbd_obj_request * obj_req)13213da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
13223b434a2aSJosh Durgin {
13233da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
13243da691bfSIlya Dryomov 
132543df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
132643df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
13273b434a2aSJosh Durgin }
13283b434a2aSJosh Durgin 
rbd_obj_is_tail(struct rbd_obj_request * obj_req)13293da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
13306e2a4505SAlex Elder {
13313da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1332b9434c5bSAlex Elder 
133343df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
13343da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
13356e2a4505SAlex Elder }
13366e2a4505SAlex Elder 
133713488d53SIlya Dryomov /*
133813488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
133913488d53SIlya Dryomov  */
rbd_obj_set_copyup_enabled(struct rbd_obj_request * obj_req)134009fe05c5SIlya Dryomov static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
134113488d53SIlya Dryomov {
1342870611e4SIlya Dryomov 	rbd_assert(obj_req->img_request->snapc);
1343870611e4SIlya Dryomov 
134409fe05c5SIlya Dryomov 	if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
134509fe05c5SIlya Dryomov 		dout("%s %p objno %llu discard\n", __func__, obj_req,
134609fe05c5SIlya Dryomov 		     obj_req->ex.oe_objno);
134709fe05c5SIlya Dryomov 		return;
134809fe05c5SIlya Dryomov 	}
134913488d53SIlya Dryomov 
135009fe05c5SIlya Dryomov 	if (!obj_req->num_img_extents) {
135109fe05c5SIlya Dryomov 		dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
135209fe05c5SIlya Dryomov 		     obj_req->ex.oe_objno);
135309fe05c5SIlya Dryomov 		return;
135409fe05c5SIlya Dryomov 	}
135509fe05c5SIlya Dryomov 
135609fe05c5SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) &&
135709fe05c5SIlya Dryomov 	    !obj_req->img_request->snapc->num_snaps) {
135809fe05c5SIlya Dryomov 		dout("%s %p objno %llu entire\n", __func__, obj_req,
135909fe05c5SIlya Dryomov 		     obj_req->ex.oe_objno);
136009fe05c5SIlya Dryomov 		return;
136109fe05c5SIlya Dryomov 	}
136209fe05c5SIlya Dryomov 
136309fe05c5SIlya Dryomov 	obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
136413488d53SIlya Dryomov }
136513488d53SIlya Dryomov 
rbd_obj_img_extents_bytes(struct rbd_obj_request * obj_req)136686bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1367bf0d5f50SAlex Elder {
136886bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
136986bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1370bf0d5f50SAlex Elder }
1371bf0d5f50SAlex Elder 
rbd_img_is_write(struct rbd_img_request * img_req)13723da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
13730dcc685eSIlya Dryomov {
13749bb0248dSIlya Dryomov 	switch (img_req->op_type) {
13753da691bfSIlya Dryomov 	case OBJ_OP_READ:
13763da691bfSIlya Dryomov 		return false;
13773da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
13783da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
13796484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
13803da691bfSIlya Dryomov 		return true;
13813da691bfSIlya Dryomov 	default:
1382c6244b3bSArnd Bergmann 		BUG();
13830dcc685eSIlya Dryomov 	}
13840dcc685eSIlya Dryomov }
13850dcc685eSIlya Dryomov 
rbd_osd_req_callback(struct ceph_osd_request * osd_req)138685e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1387bf0d5f50SAlex Elder {
13883da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
138954ab3b24SIlya Dryomov 	int result;
1390bf0d5f50SAlex Elder 
13913da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
13923da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
1393bf0d5f50SAlex Elder 
1394c47f9371SAlex Elder 	/*
13953da691bfSIlya Dryomov 	 * Writes aren't allowed to return a data payload.  In some
13963da691bfSIlya Dryomov 	 * guarded write cases (e.g. stat + zero on an empty object)
13973da691bfSIlya Dryomov 	 * a stat response makes it through, but we don't care.
1398c47f9371SAlex Elder 	 */
139954ab3b24SIlya Dryomov 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
140054ab3b24SIlya Dryomov 		result = 0;
140154ab3b24SIlya Dryomov 	else
140254ab3b24SIlya Dryomov 		result = osd_req->r_result;
14030ccd5926SIlya Dryomov 
140454ab3b24SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
1405bf0d5f50SAlex Elder }
1406bf0d5f50SAlex Elder 
rbd_osd_format_read(struct ceph_osd_request * osd_req)1407bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1408430c28c3SAlex Elder {
1409bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
141022d2cfdfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
141122d2cfdfSIlya Dryomov 	struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1412430c28c3SAlex Elder 
141322d2cfdfSIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
14147c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
14159d4df01fSAlex Elder }
14169d4df01fSAlex Elder 
rbd_osd_format_write(struct ceph_osd_request * osd_req)1417bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
14189d4df01fSAlex Elder {
1419bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_request = osd_req->r_priv;
14209d4df01fSAlex Elder 
1421a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1422fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
142343df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1424430c28c3SAlex Elder }
1425430c28c3SAlex Elder 
1426bc81207eSIlya Dryomov static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,struct ceph_snap_context * snapc,int num_ops)1427bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1428bcbab1dbSIlya Dryomov 			  struct ceph_snap_context *snapc, int num_ops)
1429bc81207eSIlya Dryomov {
1430e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1431bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1432bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1433a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1434a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1435bcbab1dbSIlya Dryomov 	int ret;
1436bc81207eSIlya Dryomov 
1437e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1438bc81207eSIlya Dryomov 	if (!req)
1439bcbab1dbSIlya Dryomov 		return ERR_PTR(-ENOMEM);
1440bc81207eSIlya Dryomov 
1441bcbab1dbSIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1442bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1443a162b308SIlya Dryomov 	req->r_priv = obj_req;
1444bc81207eSIlya Dryomov 
1445b26c047bSIlya Dryomov 	/*
1446b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1447b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1448b26c047bSIlya Dryomov 	 */
1449b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1450bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1451b26c047bSIlya Dryomov 
1452bcbab1dbSIlya Dryomov 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1453bcbab1dbSIlya Dryomov 			       rbd_dev->header.object_prefix,
1454bcbab1dbSIlya Dryomov 			       obj_req->ex.oe_objno);
1455bcbab1dbSIlya Dryomov 	if (ret)
1456bcbab1dbSIlya Dryomov 		return ERR_PTR(ret);
1457bc81207eSIlya Dryomov 
1458bc81207eSIlya Dryomov 	return req;
1459bc81207eSIlya Dryomov }
1460bc81207eSIlya Dryomov 
1461e28eded5SIlya Dryomov static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,int num_ops)1462bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1463e28eded5SIlya Dryomov {
1464870611e4SIlya Dryomov 	rbd_assert(obj_req->img_request->snapc);
1465bcbab1dbSIlya Dryomov 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1466e28eded5SIlya Dryomov 					 num_ops);
1467e28eded5SIlya Dryomov }
1468e28eded5SIlya Dryomov 
rbd_obj_request_create(void)1469ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1470bf0d5f50SAlex Elder {
1471bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1472bf0d5f50SAlex Elder 
14735a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
14746c696d85SIlya Dryomov 	if (!obj_request)
1475f907ad55SAlex Elder 		return NULL;
1476f907ad55SAlex Elder 
147743df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1478bcbab1dbSIlya Dryomov 	INIT_LIST_HEAD(&obj_request->osd_reqs);
147985b5e6d1SIlya Dryomov 	mutex_init(&obj_request->state_mutex);
1480bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1481bf0d5f50SAlex Elder 
148267e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1483bf0d5f50SAlex Elder 	return obj_request;
1484bf0d5f50SAlex Elder }
1485bf0d5f50SAlex Elder 
rbd_obj_request_destroy(struct kref * kref)1486bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1487bf0d5f50SAlex Elder {
1488bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1489bcbab1dbSIlya Dryomov 	struct ceph_osd_request *osd_req;
14907e07efb1SIlya Dryomov 	u32 i;
1491bf0d5f50SAlex Elder 
1492bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1493bf0d5f50SAlex Elder 
149437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
149537206ee5SAlex Elder 
1496bcbab1dbSIlya Dryomov 	while (!list_empty(&obj_request->osd_reqs)) {
1497bcbab1dbSIlya Dryomov 		osd_req = list_first_entry(&obj_request->osd_reqs,
1498bcbab1dbSIlya Dryomov 				    struct ceph_osd_request, r_private_item);
1499bcbab1dbSIlya Dryomov 		list_del_init(&osd_req->r_private_item);
1500bcbab1dbSIlya Dryomov 		ceph_osdc_put_request(osd_req);
1501bcbab1dbSIlya Dryomov 	}
1502bf0d5f50SAlex Elder 
1503ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
15049969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1505bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
15067e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
15075359a17dSIlya Dryomov 		break;		/* Nothing to do */
1508afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1509afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1510bf0d5f50SAlex Elder 		break;
15117e07efb1SIlya Dryomov 	default:
151216809372SArnd Bergmann 		BUG();
1513bf0d5f50SAlex Elder 	}
1514bf0d5f50SAlex Elder 
151586bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
15167e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
15177e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
15187e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
15197e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
15207e07efb1SIlya Dryomov 		}
15217e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1522bf0d5f50SAlex Elder 	}
1523bf0d5f50SAlex Elder 
1524868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1525bf0d5f50SAlex Elder }
1526bf0d5f50SAlex Elder 
1527fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1528fb65d228SAlex Elder 
1529fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
rbd_dev_unparent(struct rbd_device * rbd_dev)1530fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1531fb65d228SAlex Elder {
1532fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1533fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1534fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1535fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1536fb65d228SAlex Elder }
1537fb65d228SAlex Elder 
1538bf0d5f50SAlex Elder /*
1539a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1540a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1541a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1542a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1543a2acd00eSAlex Elder  */
rbd_dev_parent_put(struct rbd_device * rbd_dev)1544a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1545a2acd00eSAlex Elder {
1546a2acd00eSAlex Elder 	int counter;
1547a2acd00eSAlex Elder 
1548a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1549a2acd00eSAlex Elder 		return;
1550a2acd00eSAlex Elder 
1551a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1552a2acd00eSAlex Elder 	if (counter > 0)
1553a2acd00eSAlex Elder 		return;
1554a2acd00eSAlex Elder 
1555a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1556a2acd00eSAlex Elder 
1557a2acd00eSAlex Elder 	if (!counter)
1558a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1559a2acd00eSAlex Elder 	else
15609584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1561a2acd00eSAlex Elder }
1562a2acd00eSAlex Elder 
1563a2acd00eSAlex Elder /*
1564a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1565a2acd00eSAlex Elder  * parent.
1566a2acd00eSAlex Elder  *
1567a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1568a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1569a2acd00eSAlex Elder  * false otherwise.
1570a2acd00eSAlex Elder  */
rbd_dev_parent_get(struct rbd_device * rbd_dev)1571a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1572a2acd00eSAlex Elder {
1573ae43e9d0SIlya Dryomov 	int counter = 0;
1574a2acd00eSAlex Elder 
1575a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1576a2acd00eSAlex Elder 		return false;
1577a2acd00eSAlex Elder 
1578ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1579a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1580a2acd00eSAlex Elder 
1581a2acd00eSAlex Elder 	if (counter < 0)
15829584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1583a2acd00eSAlex Elder 
1584ae43e9d0SIlya Dryomov 	return counter > 0;
1585a2acd00eSAlex Elder }
1586a2acd00eSAlex Elder 
rbd_img_request_init(struct rbd_img_request * img_request,struct rbd_device * rbd_dev,enum obj_operation_type op_type)158759e542c8SIlya Dryomov static void rbd_img_request_init(struct rbd_img_request *img_request,
1588cc344fa1SAlex Elder 				 struct rbd_device *rbd_dev,
1589a52cc685SIlya Dryomov 				 enum obj_operation_type op_type)
1590bf0d5f50SAlex Elder {
159159e542c8SIlya Dryomov 	memset(img_request, 0, sizeof(*img_request));
1592bf0d5f50SAlex Elder 
1593bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
15949bb0248dSIlya Dryomov 	img_request->op_type = op_type;
1595a0c5895bSIlya Dryomov 
1596e1fddc8fSIlya Dryomov 	INIT_LIST_HEAD(&img_request->lock_item);
159743df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
15980192ce2eSIlya Dryomov 	mutex_init(&img_request->state_mutex);
1599bf0d5f50SAlex Elder }
1600bf0d5f50SAlex Elder 
1601870611e4SIlya Dryomov /*
1602870611e4SIlya Dryomov  * Only snap_id is captured here, for reads.  For writes, snapshot
1603870611e4SIlya Dryomov  * context is captured in rbd_img_object_requests() after exclusive
1604870611e4SIlya Dryomov  * lock is ensured to be held.
1605870611e4SIlya Dryomov  */
rbd_img_capture_header(struct rbd_img_request * img_req)1606a52cc685SIlya Dryomov static void rbd_img_capture_header(struct rbd_img_request *img_req)
1607a52cc685SIlya Dryomov {
1608a52cc685SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
1609a52cc685SIlya Dryomov 
1610a52cc685SIlya Dryomov 	lockdep_assert_held(&rbd_dev->header_rwsem);
1611a52cc685SIlya Dryomov 
1612870611e4SIlya Dryomov 	if (!rbd_img_is_write(img_req))
1613a52cc685SIlya Dryomov 		img_req->snap_id = rbd_dev->spec->snap_id;
1614a52cc685SIlya Dryomov 
1615a52cc685SIlya Dryomov 	if (rbd_dev_parent_get(rbd_dev))
1616a52cc685SIlya Dryomov 		img_request_layered_set(img_req);
1617a52cc685SIlya Dryomov }
1618a52cc685SIlya Dryomov 
rbd_img_request_destroy(struct rbd_img_request * img_request)1619679a97d2SHannes Reinecke static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1620bf0d5f50SAlex Elder {
1621bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1622bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1623bf0d5f50SAlex Elder 
162437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
162537206ee5SAlex Elder 
1626e1fddc8fSIlya Dryomov 	WARN_ON(!list_empty(&img_request->lock_item));
1627bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1628bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1629bf0d5f50SAlex Elder 
163078b42a87SIlya Dryomov 	if (img_request_layered_test(img_request))
1631a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1632a2acd00eSAlex Elder 
16339bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1634812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1635bf0d5f50SAlex Elder 
163659e542c8SIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
16371c2a9dfeSAlex Elder 		kmem_cache_free(rbd_img_request_cache, img_request);
1638bf0d5f50SAlex Elder }
1639bf0d5f50SAlex Elder 
/* Object map packing: each object's state takes BITS_PER_OBJ bits. */
#define BITS_PER_OBJ	2
/* Number of object states stored in one byte of the map. */
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
/* Mask covering a single (unshifted) object state. */
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
164322e8bd51SIlya Dryomov 
/*
 * Translate object number @objno into a position in the object map:
 * @index receives the byte index and @shift the bit shift of the
 * object's state within that byte.  The first object in a byte gets
 * the largest shift, the last one gets shift 0.
 *
 * @objno must be below rbd_dev->object_map_size.
 */
static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 slot;	/* position of the object within its byte */

	rbd_assert(objno < rbd_dev->object_map_size);

	*index = div_u64_rem(objno, OBJS_PER_BYTE, &slot);
	*shift = (OBJS_PER_BYTE - 1 - slot) * BITS_PER_OBJ;
}
165322e8bd51SIlya Dryomov 
__rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)165422e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
165522e8bd51SIlya Dryomov {
165622e8bd51SIlya Dryomov 	u64 index;
165722e8bd51SIlya Dryomov 	u8 shift;
165822e8bd51SIlya Dryomov 
165922e8bd51SIlya Dryomov 	lockdep_assert_held(&rbd_dev->object_map_lock);
166022e8bd51SIlya Dryomov 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
166122e8bd51SIlya Dryomov 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
166222e8bd51SIlya Dryomov }
166322e8bd51SIlya Dryomov 
/*
 * Store state @val for @objno in the object map, leaving the states of
 * the other objects sharing the same byte untouched.  @val must fit in
 * OBJ_MASK.  The caller must hold rbd_dev->object_map_lock.
 */
static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
	u64 byte_idx;
	u8 bit_shift;
	u8 *slot;
	u8 packed;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	rbd_assert(!(val & ~OBJ_MASK));

	__rbd_object_map_index(rbd_dev, objno, &byte_idx, &bit_shift);
	slot = &rbd_dev->object_map[byte_idx];

	/* clear the old state, merge in the new one, write back once */
	packed = *slot;
	packed &= ~(OBJ_MASK << bit_shift);
	packed |= val << bit_shift;
	*slot = packed;
}
167722e8bd51SIlya Dryomov 
rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)167822e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
167922e8bd51SIlya Dryomov {
168022e8bd51SIlya Dryomov 	u8 state;
168122e8bd51SIlya Dryomov 
168222e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
168322e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
168422e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
168522e8bd51SIlya Dryomov 	return state;
168622e8bd51SIlya Dryomov }
168722e8bd51SIlya Dryomov 
use_object_map(struct rbd_device * rbd_dev)168822e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev)
168922e8bd51SIlya Dryomov {
16903fe69921SIlya Dryomov 	/*
16913fe69921SIlya Dryomov 	 * An image mapped read-only can't use the object map -- it isn't
16923fe69921SIlya Dryomov 	 * loaded because the header lock isn't acquired.  Someone else can
16933fe69921SIlya Dryomov 	 * write to the image and update the object map behind our back.
16943fe69921SIlya Dryomov 	 *
16953fe69921SIlya Dryomov 	 * A snapshot can't be written to, so using the object map is always
16963fe69921SIlya Dryomov 	 * safe.
16973fe69921SIlya Dryomov 	 */
16983fe69921SIlya Dryomov 	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
16993fe69921SIlya Dryomov 		return false;
17003fe69921SIlya Dryomov 
170122e8bd51SIlya Dryomov 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
170222e8bd51SIlya Dryomov 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
170322e8bd51SIlya Dryomov }
170422e8bd51SIlya Dryomov 
rbd_object_map_may_exist(struct rbd_device * rbd_dev,u64 objno)170522e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
170622e8bd51SIlya Dryomov {
170722e8bd51SIlya Dryomov 	u8 state;
170822e8bd51SIlya Dryomov 
170922e8bd51SIlya Dryomov 	/* fall back to default logic if object map is disabled or invalid */
171022e8bd51SIlya Dryomov 	if (!use_object_map(rbd_dev))
171122e8bd51SIlya Dryomov 		return true;
171222e8bd51SIlya Dryomov 
171322e8bd51SIlya Dryomov 	state = rbd_object_map_get(rbd_dev, objno);
171422e8bd51SIlya Dryomov 	return state != OBJECT_NONEXISTENT;
171522e8bd51SIlya Dryomov }
171622e8bd51SIlya Dryomov 
/*
 * Format the name of the object map object into @oid.  The name is
 * RBD_OBJECT_MAP_PREFIX followed by the image id; for any @snap_id
 * other than CEPH_NOSNAP, "." and the snapshot id as 16 hex digits are
 * appended.
 */
static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
				struct ceph_object_id *oid)
{
	if (snap_id != CEPH_NOSNAP) {
		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id, snap_id);
		return;
	}

	ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
			rbd_dev->spec->image_id);
}
172722e8bd51SIlya Dryomov 
rbd_object_map_lock(struct rbd_device * rbd_dev)172822e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev)
172922e8bd51SIlya Dryomov {
173022e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
173122e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
173222e8bd51SIlya Dryomov 	u8 lock_type;
173322e8bd51SIlya Dryomov 	char *lock_tag;
173422e8bd51SIlya Dryomov 	struct ceph_locker *lockers;
173522e8bd51SIlya Dryomov 	u32 num_lockers;
173622e8bd51SIlya Dryomov 	bool broke_lock = false;
173722e8bd51SIlya Dryomov 	int ret;
173822e8bd51SIlya Dryomov 
173922e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
174022e8bd51SIlya Dryomov 
174122e8bd51SIlya Dryomov again:
174222e8bd51SIlya Dryomov 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
174322e8bd51SIlya Dryomov 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
174422e8bd51SIlya Dryomov 	if (ret != -EBUSY || broke_lock) {
174522e8bd51SIlya Dryomov 		if (ret == -EEXIST)
174622e8bd51SIlya Dryomov 			ret = 0; /* already locked by myself */
174722e8bd51SIlya Dryomov 		if (ret)
174822e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
174922e8bd51SIlya Dryomov 		return ret;
175022e8bd51SIlya Dryomov 	}
175122e8bd51SIlya Dryomov 
175222e8bd51SIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
175322e8bd51SIlya Dryomov 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
175422e8bd51SIlya Dryomov 				 &lockers, &num_lockers);
175522e8bd51SIlya Dryomov 	if (ret) {
175622e8bd51SIlya Dryomov 		if (ret == -ENOENT)
175722e8bd51SIlya Dryomov 			goto again;
175822e8bd51SIlya Dryomov 
175922e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
176022e8bd51SIlya Dryomov 		return ret;
176122e8bd51SIlya Dryomov 	}
176222e8bd51SIlya Dryomov 
176322e8bd51SIlya Dryomov 	kfree(lock_tag);
176422e8bd51SIlya Dryomov 	if (num_lockers == 0)
176522e8bd51SIlya Dryomov 		goto again;
176622e8bd51SIlya Dryomov 
176722e8bd51SIlya Dryomov 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
176822e8bd51SIlya Dryomov 		 ENTITY_NAME(lockers[0].id.name));
176922e8bd51SIlya Dryomov 
177022e8bd51SIlya Dryomov 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
177122e8bd51SIlya Dryomov 				  RBD_LOCK_NAME, lockers[0].id.cookie,
177222e8bd51SIlya Dryomov 				  &lockers[0].id.name);
177322e8bd51SIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
177422e8bd51SIlya Dryomov 	if (ret) {
177522e8bd51SIlya Dryomov 		if (ret == -ENOENT)
177622e8bd51SIlya Dryomov 			goto again;
177722e8bd51SIlya Dryomov 
177822e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
177922e8bd51SIlya Dryomov 		return ret;
178022e8bd51SIlya Dryomov 	}
178122e8bd51SIlya Dryomov 
178222e8bd51SIlya Dryomov 	broke_lock = true;
178322e8bd51SIlya Dryomov 	goto again;
178422e8bd51SIlya Dryomov }
178522e8bd51SIlya Dryomov 
rbd_object_map_unlock(struct rbd_device * rbd_dev)178622e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
178722e8bd51SIlya Dryomov {
178822e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
178922e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
179022e8bd51SIlya Dryomov 	int ret;
179122e8bd51SIlya Dryomov 
179222e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
179322e8bd51SIlya Dryomov 
179422e8bd51SIlya Dryomov 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
179522e8bd51SIlya Dryomov 			      "");
179622e8bd51SIlya Dryomov 	if (ret && ret != -ENOENT)
179722e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
179822e8bd51SIlya Dryomov }
179922e8bd51SIlya Dryomov 
/*
 * Decode the header that precedes the object map data.
 *
 * @p:               in/out decode cursor; on success it is advanced to
 *                   the first byte after the header
 * @end:             end of the buffer that may be decoded
 * @object_map_size: receives the 64-bit size field from the header
 *
 * The header starts with a 32-bit length, followed by a versioned
 * "BitVector header" structure whose first field is the size.
 *
 * Returns 0 on success, -EINVAL if the buffer is too short (including
 * a header length that does not fit in the buffer), or the error from
 * ceph_start_decoding().
 */
static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
	u8 struct_v;
	u32 struct_len;
	u32 header_len;
	void *header_end;
	int ret;

	ceph_decode_32_safe(p, end, header_len, e_inval);

	/*
	 * header_len comes from the wire.  Without this check a bogus
	 * value would move *p past @end on return, and the caller would
	 * continue decoding from outside the buffer.
	 */
	if (header_len > (size_t)(end - *p))
		goto e_inval;
	header_end = *p + header_len;

	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
				  &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, *object_map_size, e_inval);

	/* skip whatever else the header contains */
	*p = header_end;
	return 0;

e_inval:
	return -EINVAL;
}
182422e8bd51SIlya Dryomov 
__rbd_object_map_load(struct rbd_device * rbd_dev)182522e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev)
182622e8bd51SIlya Dryomov {
182722e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
182822e8bd51SIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
182922e8bd51SIlya Dryomov 	struct page **pages;
183022e8bd51SIlya Dryomov 	void *p, *end;
183122e8bd51SIlya Dryomov 	size_t reply_len;
183222e8bd51SIlya Dryomov 	u64 num_objects;
183322e8bd51SIlya Dryomov 	u64 object_map_bytes;
183422e8bd51SIlya Dryomov 	u64 object_map_size;
183522e8bd51SIlya Dryomov 	int num_pages;
183622e8bd51SIlya Dryomov 	int ret;
183722e8bd51SIlya Dryomov 
183822e8bd51SIlya Dryomov 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
183922e8bd51SIlya Dryomov 
184022e8bd51SIlya Dryomov 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
184122e8bd51SIlya Dryomov 					   rbd_dev->mapping.size);
184222e8bd51SIlya Dryomov 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
184322e8bd51SIlya Dryomov 					    BITS_PER_BYTE);
184422e8bd51SIlya Dryomov 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
184522e8bd51SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
184622e8bd51SIlya Dryomov 	if (IS_ERR(pages))
184722e8bd51SIlya Dryomov 		return PTR_ERR(pages);
184822e8bd51SIlya Dryomov 
184922e8bd51SIlya Dryomov 	reply_len = num_pages * PAGE_SIZE;
185022e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
185122e8bd51SIlya Dryomov 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
185222e8bd51SIlya Dryomov 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
185322e8bd51SIlya Dryomov 			     NULL, 0, pages, &reply_len);
185422e8bd51SIlya Dryomov 	if (ret)
185522e8bd51SIlya Dryomov 		goto out;
185622e8bd51SIlya Dryomov 
185722e8bd51SIlya Dryomov 	p = page_address(pages[0]);
185822e8bd51SIlya Dryomov 	end = p + min(reply_len, (size_t)PAGE_SIZE);
185922e8bd51SIlya Dryomov 	ret = decode_object_map_header(&p, end, &object_map_size);
186022e8bd51SIlya Dryomov 	if (ret)
186122e8bd51SIlya Dryomov 		goto out;
186222e8bd51SIlya Dryomov 
186322e8bd51SIlya Dryomov 	if (object_map_size != num_objects) {
186422e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
186522e8bd51SIlya Dryomov 			 object_map_size, num_objects);
186622e8bd51SIlya Dryomov 		ret = -EINVAL;
186722e8bd51SIlya Dryomov 		goto out;
186822e8bd51SIlya Dryomov 	}
186922e8bd51SIlya Dryomov 
187022e8bd51SIlya Dryomov 	if (offset_in_page(p) + object_map_bytes > reply_len) {
187122e8bd51SIlya Dryomov 		ret = -EINVAL;
187222e8bd51SIlya Dryomov 		goto out;
187322e8bd51SIlya Dryomov 	}
187422e8bd51SIlya Dryomov 
187522e8bd51SIlya Dryomov 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
187622e8bd51SIlya Dryomov 	if (!rbd_dev->object_map) {
187722e8bd51SIlya Dryomov 		ret = -ENOMEM;
187822e8bd51SIlya Dryomov 		goto out;
187922e8bd51SIlya Dryomov 	}
188022e8bd51SIlya Dryomov 
188122e8bd51SIlya Dryomov 	rbd_dev->object_map_size = object_map_size;
188222e8bd51SIlya Dryomov 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
188322e8bd51SIlya Dryomov 				   offset_in_page(p), object_map_bytes);
188422e8bd51SIlya Dryomov 
188522e8bd51SIlya Dryomov out:
188622e8bd51SIlya Dryomov 	ceph_release_page_vector(pages, num_pages);
188722e8bd51SIlya Dryomov 	return ret;
188822e8bd51SIlya Dryomov }
188922e8bd51SIlya Dryomov 
rbd_object_map_free(struct rbd_device * rbd_dev)189022e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev)
189122e8bd51SIlya Dryomov {
189222e8bd51SIlya Dryomov 	kvfree(rbd_dev->object_map);
189322e8bd51SIlya Dryomov 	rbd_dev->object_map = NULL;
189422e8bd51SIlya Dryomov 	rbd_dev->object_map_size = 0;
189522e8bd51SIlya Dryomov }
189622e8bd51SIlya Dryomov 
rbd_object_map_load(struct rbd_device * rbd_dev)189722e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev)
189822e8bd51SIlya Dryomov {
189922e8bd51SIlya Dryomov 	int ret;
190022e8bd51SIlya Dryomov 
190122e8bd51SIlya Dryomov 	ret = __rbd_object_map_load(rbd_dev);
190222e8bd51SIlya Dryomov 	if (ret)
190322e8bd51SIlya Dryomov 		return ret;
190422e8bd51SIlya Dryomov 
190522e8bd51SIlya Dryomov 	ret = rbd_dev_v2_get_flags(rbd_dev);
190622e8bd51SIlya Dryomov 	if (ret) {
190722e8bd51SIlya Dryomov 		rbd_object_map_free(rbd_dev);
190822e8bd51SIlya Dryomov 		return ret;
190922e8bd51SIlya Dryomov 	}
191022e8bd51SIlya Dryomov 
191122e8bd51SIlya Dryomov 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
191222e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "object map is invalid");
191322e8bd51SIlya Dryomov 
191422e8bd51SIlya Dryomov 	return 0;
191522e8bd51SIlya Dryomov }
191622e8bd51SIlya Dryomov 
/*
 * Lock the object map and load it.  If loading fails the lock taken
 * here is released again before the error is returned.
 */
static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int err;

	err = rbd_object_map_lock(rbd_dev);
	if (err)
		return err;

	err = rbd_object_map_load(rbd_dev);
	if (err)
		rbd_object_map_unlock(rbd_dev);

	return err;
}
193322e8bd51SIlya Dryomov 
/*
 * Counterpart of rbd_object_map_open(): drop the in-memory map first,
 * then release the object map lock.
 */
static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}
193922e8bd51SIlya Dryomov 
194022e8bd51SIlya Dryomov /*
194122e8bd51SIlya Dryomov  * This function needs snap_id (or more precisely just something to
194222e8bd51SIlya Dryomov  * distinguish between HEAD and snapshot object maps), new_state and
194322e8bd51SIlya Dryomov  * current_state that were passed to rbd_object_map_update().
194422e8bd51SIlya Dryomov  *
194522e8bd51SIlya Dryomov  * To avoid allocating and stashing a context we piggyback on the OSD
194622e8bd51SIlya Dryomov  * request.  A HEAD update has two ops (assert_locked).  For new_state
194722e8bd51SIlya Dryomov  * and current_state we decode our own object_map_update op, encoded in
194822e8bd51SIlya Dryomov  * rbd_cls_object_map_update().
194922e8bd51SIlya Dryomov  */
rbd_object_map_update_finish(struct rbd_obj_request * obj_req,struct ceph_osd_request * osd_req)195022e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
195122e8bd51SIlya Dryomov 					struct ceph_osd_request *osd_req)
195222e8bd51SIlya Dryomov {
195322e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
195422e8bd51SIlya Dryomov 	struct ceph_osd_data *osd_data;
195522e8bd51SIlya Dryomov 	u64 objno;
19563f649ab7SKees Cook 	u8 state, new_state, current_state;
195722e8bd51SIlya Dryomov 	bool has_current_state;
195822e8bd51SIlya Dryomov 	void *p;
195922e8bd51SIlya Dryomov 
196022e8bd51SIlya Dryomov 	if (osd_req->r_result)
196122e8bd51SIlya Dryomov 		return osd_req->r_result;
196222e8bd51SIlya Dryomov 
196322e8bd51SIlya Dryomov 	/*
196422e8bd51SIlya Dryomov 	 * Nothing to do for a snapshot object map.
196522e8bd51SIlya Dryomov 	 */
196622e8bd51SIlya Dryomov 	if (osd_req->r_num_ops == 1)
196722e8bd51SIlya Dryomov 		return 0;
196822e8bd51SIlya Dryomov 
196922e8bd51SIlya Dryomov 	/*
197022e8bd51SIlya Dryomov 	 * Update in-memory HEAD object map.
197122e8bd51SIlya Dryomov 	 */
197222e8bd51SIlya Dryomov 	rbd_assert(osd_req->r_num_ops == 2);
197322e8bd51SIlya Dryomov 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
197422e8bd51SIlya Dryomov 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
197522e8bd51SIlya Dryomov 
197622e8bd51SIlya Dryomov 	p = page_address(osd_data->pages[0]);
197722e8bd51SIlya Dryomov 	objno = ceph_decode_64(&p);
197822e8bd51SIlya Dryomov 	rbd_assert(objno == obj_req->ex.oe_objno);
197922e8bd51SIlya Dryomov 	rbd_assert(ceph_decode_64(&p) == objno + 1);
198022e8bd51SIlya Dryomov 	new_state = ceph_decode_8(&p);
198122e8bd51SIlya Dryomov 	has_current_state = ceph_decode_8(&p);
198222e8bd51SIlya Dryomov 	if (has_current_state)
198322e8bd51SIlya Dryomov 		current_state = ceph_decode_8(&p);
198422e8bd51SIlya Dryomov 
198522e8bd51SIlya Dryomov 	spin_lock(&rbd_dev->object_map_lock);
198622e8bd51SIlya Dryomov 	state = __rbd_object_map_get(rbd_dev, objno);
198722e8bd51SIlya Dryomov 	if (!has_current_state || current_state == state ||
198822e8bd51SIlya Dryomov 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
198922e8bd51SIlya Dryomov 		__rbd_object_map_set(rbd_dev, objno, new_state);
199022e8bd51SIlya Dryomov 	spin_unlock(&rbd_dev->object_map_lock);
199122e8bd51SIlya Dryomov 
199222e8bd51SIlya Dryomov 	return 0;
199322e8bd51SIlya Dryomov }
199422e8bd51SIlya Dryomov 
rbd_object_map_callback(struct ceph_osd_request * osd_req)199522e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
199622e8bd51SIlya Dryomov {
199722e8bd51SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
199822e8bd51SIlya Dryomov 	int result;
199922e8bd51SIlya Dryomov 
200022e8bd51SIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
200122e8bd51SIlya Dryomov 	     osd_req->r_result, obj_req);
200222e8bd51SIlya Dryomov 
200322e8bd51SIlya Dryomov 	result = rbd_object_map_update_finish(obj_req, osd_req);
200422e8bd51SIlya Dryomov 	rbd_obj_handle_request(obj_req, result);
200522e8bd51SIlya Dryomov }
200622e8bd51SIlya Dryomov 
update_needed(struct rbd_device * rbd_dev,u64 objno,u8 new_state)200722e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
200822e8bd51SIlya Dryomov {
200922e8bd51SIlya Dryomov 	u8 state = rbd_object_map_get(rbd_dev, objno);
201022e8bd51SIlya Dryomov 
201122e8bd51SIlya Dryomov 	if (state == new_state ||
201222e8bd51SIlya Dryomov 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
201322e8bd51SIlya Dryomov 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
201422e8bd51SIlya Dryomov 		return false;
201522e8bd51SIlya Dryomov 
201622e8bd51SIlya Dryomov 	return true;
201722e8bd51SIlya Dryomov }
201822e8bd51SIlya Dryomov 
/*
 * Set up op @which of @req as an "rbd" "object_map_update" class call
 * covering the single object @objno.
 *
 * The request payload is encoded into a freshly allocated page as:
 *     u64 start object number (objno)
 *     u64 end object number   (objno + 1)
 *     u8  new_state
 *     u8  has_current_state
 *     u8  current_state       (only if has_current_state)
 *
 * @current_state may be NULL.  Returns 0 or a negative error code.
 */
static int rbd_cls_object_map_update(struct ceph_osd_request *req,
				     int which, u64 objno, u8 new_state,
				     const u8 *current_state)
{
	struct page **pages;
	void *cursor, *base;
	int err;

	err = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
	if (err)
		return err;

	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	base = page_address(pages[0]);
	cursor = base;

	ceph_encode_64(&cursor, objno);
	ceph_encode_64(&cursor, objno + 1);
	ceph_encode_8(&cursor, new_state);
	ceph_encode_8(&cursor, current_state ? 1 : 0);
	if (current_state)
		ceph_encode_8(&cursor, *current_state);

	osd_req_op_cls_request_data_pages(req, which, pages, cursor - base, 0,
					  false, true);
	return 0;
}
205022e8bd51SIlya Dryomov 
205122e8bd51SIlya Dryomov /*
205222e8bd51SIlya Dryomov  * Return:
205322e8bd51SIlya Dryomov  *   0 - object map update sent
205422e8bd51SIlya Dryomov  *   1 - object map update isn't needed
205522e8bd51SIlya Dryomov  *  <0 - error
205622e8bd51SIlya Dryomov  */
rbd_object_map_update(struct rbd_obj_request * obj_req,u64 snap_id,u8 new_state,const u8 * current_state)205722e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
205822e8bd51SIlya Dryomov 				 u8 new_state, const u8 *current_state)
205922e8bd51SIlya Dryomov {
206022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
206122e8bd51SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
206222e8bd51SIlya Dryomov 	struct ceph_osd_request *req;
206322e8bd51SIlya Dryomov 	int num_ops = 1;
206422e8bd51SIlya Dryomov 	int which = 0;
206522e8bd51SIlya Dryomov 	int ret;
206622e8bd51SIlya Dryomov 
206722e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
206822e8bd51SIlya Dryomov 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
206922e8bd51SIlya Dryomov 			return 1;
207022e8bd51SIlya Dryomov 
207122e8bd51SIlya Dryomov 		num_ops++; /* assert_locked */
207222e8bd51SIlya Dryomov 	}
207322e8bd51SIlya Dryomov 
207422e8bd51SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
207522e8bd51SIlya Dryomov 	if (!req)
207622e8bd51SIlya Dryomov 		return -ENOMEM;
207722e8bd51SIlya Dryomov 
207822e8bd51SIlya Dryomov 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
207922e8bd51SIlya Dryomov 	req->r_callback = rbd_object_map_callback;
208022e8bd51SIlya Dryomov 	req->r_priv = obj_req;
208122e8bd51SIlya Dryomov 
208222e8bd51SIlya Dryomov 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
208322e8bd51SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
208422e8bd51SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_WRITE;
208522e8bd51SIlya Dryomov 	ktime_get_real_ts64(&req->r_mtime);
208622e8bd51SIlya Dryomov 
208722e8bd51SIlya Dryomov 	if (snap_id == CEPH_NOSNAP) {
208822e8bd51SIlya Dryomov 		/*
208922e8bd51SIlya Dryomov 		 * Protect against possible race conditions during lock
209022e8bd51SIlya Dryomov 		 * ownership transitions.
209122e8bd51SIlya Dryomov 		 */
209222e8bd51SIlya Dryomov 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
209322e8bd51SIlya Dryomov 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
209422e8bd51SIlya Dryomov 		if (ret)
209522e8bd51SIlya Dryomov 			return ret;
209622e8bd51SIlya Dryomov 	}
209722e8bd51SIlya Dryomov 
209822e8bd51SIlya Dryomov 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
209922e8bd51SIlya Dryomov 					new_state, current_state);
210022e8bd51SIlya Dryomov 	if (ret)
210122e8bd51SIlya Dryomov 		return ret;
210222e8bd51SIlya Dryomov 
210322e8bd51SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
210422e8bd51SIlya Dryomov 	if (ret)
210522e8bd51SIlya Dryomov 		return ret;
210622e8bd51SIlya Dryomov 
2107a8af0d68SJeff Layton 	ceph_osdc_start_request(osdc, req);
210822e8bd51SIlya Dryomov 	return 0;
210922e8bd51SIlya Dryomov }
211022e8bd51SIlya Dryomov 
/*
 * Clip an array of image extents to [0, overlap).
 *
 * Trailing extents that start at or beyond @overlap are dropped by
 * lowering *num_img_extents; the last remaining extent is shortened if
 * it reaches past @overlap.  Only the tail of the array is examined.
 */
static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	struct ceph_file_extent *last;
	u32 n = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (n > 0 && img_extents[n - 1].fe_off >= overlap)
		n--;

	*num_img_extents = n;
	if (!n)
		return;

	/* trim final overlapping extent */
	last = &img_extents[n - 1];
	if (last->fe_off + last->fe_len > overlap)
		last->fe_len = overlap - last->fe_off;
}
213086bd7998SIlya Dryomov 
213186bd7998SIlya Dryomov /*
213286bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
213386bd7998SIlya Dryomov  * or the entire object in the parent image.
213486bd7998SIlya Dryomov  */
rbd_obj_calc_img_extents(struct rbd_obj_request * obj_req,bool entire)213586bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
213686bd7998SIlya Dryomov 				    bool entire)
2137e93f3152SAlex Elder {
213886bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2139c5b5ef6cSAlex Elder 	int ret;
2140c5b5ef6cSAlex Elder 
214186bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
214286bd7998SIlya Dryomov 		return 0;
214386bd7998SIlya Dryomov 
214486bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
214586bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
214686bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
214786bd7998SIlya Dryomov 							obj_req->ex.oe_len,
214886bd7998SIlya Dryomov 				  &obj_req->img_extents,
214986bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
215086bd7998SIlya Dryomov 	if (ret)
215186bd7998SIlya Dryomov 		return ret;
215286bd7998SIlya Dryomov 
215386bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
215486bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
215586bd7998SIlya Dryomov 	return 0;
215686bd7998SIlya Dryomov }
215786bd7998SIlya Dryomov 
rbd_osd_setup_data(struct ceph_osd_request * osd_req,int which)2158bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
21593da691bfSIlya Dryomov {
2160bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2161bcbab1dbSIlya Dryomov 
2162ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
21633da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
2164bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, which,
21653da691bfSIlya Dryomov 					       &obj_req->bio_pos,
216643df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
21673da691bfSIlya Dryomov 		break;
21683da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
2169afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
21703da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
217143df3d35SIlya Dryomov 							obj_req->ex.oe_len);
2172afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2173bcbab1dbSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
21743da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
21753da691bfSIlya Dryomov 		break;
21763da691bfSIlya Dryomov 	default:
217716809372SArnd Bergmann 		BUG();
21783da691bfSIlya Dryomov 	}
21793da691bfSIlya Dryomov }
21803da691bfSIlya Dryomov 
rbd_osd_setup_stat(struct ceph_osd_request * osd_req,int which)2181bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
21823da691bfSIlya Dryomov {
21833da691bfSIlya Dryomov 	struct page **pages;
21843da691bfSIlya Dryomov 
2185c5b5ef6cSAlex Elder 	/*
2186c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2187c5b5ef6cSAlex Elder 	 *     le64 length;
2188c5b5ef6cSAlex Elder 	 *     struct {
2189c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2190c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2191c5b5ef6cSAlex Elder 	 *     } mtime;
2192c5b5ef6cSAlex Elder 	 */
21933da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
21943da691bfSIlya Dryomov 	if (IS_ERR(pages))
21953da691bfSIlya Dryomov 		return PTR_ERR(pages);
21963da691bfSIlya Dryomov 
2197bcbab1dbSIlya Dryomov 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2198bcbab1dbSIlya Dryomov 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
21993da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
22003da691bfSIlya Dryomov 				     0, false, true);
22013da691bfSIlya Dryomov 	return 0;
2202710214e3SIlya Dryomov }
2203c5b5ef6cSAlex Elder 
/*
 * Set up op @which of @osd_req as an "rbd" "copyup" class call whose
 * request data is the first @bytes bytes of the object request's
 * copyup bio_vecs.  Returns 0 or the error from osd_req_op_cls_init().
 */
static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int err = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");

	if (err)
		return err;

	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}
221813488d53SIlya Dryomov 
rbd_obj_init_read(struct rbd_obj_request * obj_req)2219ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
22203da691bfSIlya Dryomov {
2221ea9b743cSIlya Dryomov 	obj_req->read_state = RBD_OBJ_READ_START;
2222ea9b743cSIlya Dryomov 	return 0;
2223ea9b743cSIlya Dryomov }
2224ea9b743cSIlya Dryomov 
__rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2225bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2226bcbab1dbSIlya Dryomov 				      int which)
22273da691bfSIlya Dryomov {
2228bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
22293da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
22303da691bfSIlya Dryomov 	u16 opcode;
2231c5b5ef6cSAlex Elder 
22328b5bec5cSIlya Dryomov 	if (!use_object_map(rbd_dev) ||
22338b5bec5cSIlya Dryomov 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2234bcbab1dbSIlya Dryomov 		osd_req_op_alloc_hint_init(osd_req, which++,
22353da691bfSIlya Dryomov 					   rbd_dev->layout.object_size,
2236d3798accSIlya Dryomov 					   rbd_dev->layout.object_size,
2237dc1dad8eSIlya Dryomov 					   rbd_dev->opts->alloc_hint_flags);
22388b5bec5cSIlya Dryomov 	}
2239c5b5ef6cSAlex Elder 
22403da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
22413da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
22423da691bfSIlya Dryomov 	else
22433da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
2244c5b5ef6cSAlex Elder 
2245bcbab1dbSIlya Dryomov 	osd_req_op_extent_init(osd_req, which, opcode,
224643df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2247bcbab1dbSIlya Dryomov 	rbd_osd_setup_data(osd_req, which);
22483da691bfSIlya Dryomov }
22493da691bfSIlya Dryomov 
rbd_obj_init_write(struct rbd_obj_request * obj_req)2250ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
22513da691bfSIlya Dryomov {
22523da691bfSIlya Dryomov 	int ret;
22533da691bfSIlya Dryomov 
225486bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
225586bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
225686bd7998SIlya Dryomov 	if (ret)
225786bd7998SIlya Dryomov 		return ret;
225886bd7998SIlya Dryomov 
225985b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
22603da691bfSIlya Dryomov 	return 0;
226170d045f6SIlya Dryomov }
226270d045f6SIlya Dryomov 
truncate_or_zero_opcode(struct rbd_obj_request * obj_req)22636484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
22646484cbe9SIlya Dryomov {
22656484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
22666484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
22676484cbe9SIlya Dryomov }
22686484cbe9SIlya Dryomov 
__rbd_osd_setup_discard_ops(struct ceph_osd_request * osd_req,int which)226927bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
227027bbd911SIlya Dryomov 					int which)
227127bbd911SIlya Dryomov {
227227bbd911SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
227327bbd911SIlya Dryomov 
227427bbd911SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
227527bbd911SIlya Dryomov 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
227627bbd911SIlya Dryomov 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
227727bbd911SIlya Dryomov 	} else {
227827bbd911SIlya Dryomov 		osd_req_op_extent_init(osd_req, which,
227927bbd911SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
228027bbd911SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
228127bbd911SIlya Dryomov 				       0, 0);
228227bbd911SIlya Dryomov 	}
228327bbd911SIlya Dryomov }
228427bbd911SIlya Dryomov 
rbd_obj_init_discard(struct rbd_obj_request * obj_req)2285ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
22866484cbe9SIlya Dryomov {
22870c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
228827bbd911SIlya Dryomov 	u64 off, next_off;
22896484cbe9SIlya Dryomov 	int ret;
22906484cbe9SIlya Dryomov 
22910c93e1b7SIlya Dryomov 	/*
22920c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
22930c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
22940c93e1b7SIlya Dryomov 	 *
22950c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
22960c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
22970c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
22980c93e1b7SIlya Dryomov 	 */
22990c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
23000c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
230127bbd911SIlya Dryomov 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
230227bbd911SIlya Dryomov 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
230327bbd911SIlya Dryomov 				      rbd_dev->opts->alloc_size);
23040c93e1b7SIlya Dryomov 		if (off >= next_off)
23050c93e1b7SIlya Dryomov 			return 1;
230627bbd911SIlya Dryomov 
230727bbd911SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
230827bbd911SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
230927bbd911SIlya Dryomov 		     off, next_off - off);
231027bbd911SIlya Dryomov 		obj_req->ex.oe_off = off;
231127bbd911SIlya Dryomov 		obj_req->ex.oe_len = next_off - off;
23120c93e1b7SIlya Dryomov 	}
23130c93e1b7SIlya Dryomov 
23146484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
23156484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
23166484cbe9SIlya Dryomov 	if (ret)
23176484cbe9SIlya Dryomov 		return ret;
23186484cbe9SIlya Dryomov 
231922e8bd51SIlya Dryomov 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23200ad5d953SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
23210ad5d953SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23226484cbe9SIlya Dryomov 
232385b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
23246484cbe9SIlya Dryomov 	return 0;
23256484cbe9SIlya Dryomov }
23266484cbe9SIlya Dryomov 
__rbd_osd_setup_zeroout_ops(struct ceph_osd_request * osd_req,int which)2327bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2328bcbab1dbSIlya Dryomov 					int which)
232913488d53SIlya Dryomov {
2330bcbab1dbSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
23313da691bfSIlya Dryomov 	u16 opcode;
2332058aa991SIlya Dryomov 
23333da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
233486bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
23350ad5d953SIlya Dryomov 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2336bcbab1dbSIlya Dryomov 				osd_req_op_init(osd_req, which++,
23372bb1e56eSIlya Dryomov 						CEPH_OSD_OP_CREATE, 0);
23383da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
23393da691bfSIlya Dryomov 		} else {
23400ad5d953SIlya Dryomov 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2341bcbab1dbSIlya Dryomov 			osd_req_op_init(osd_req, which++,
23423da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
23433da691bfSIlya Dryomov 			opcode = 0;
23443da691bfSIlya Dryomov 		}
23453da691bfSIlya Dryomov 	} else {
23466484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
23473da691bfSIlya Dryomov 	}
23483da691bfSIlya Dryomov 
23493da691bfSIlya Dryomov 	if (opcode)
2350bcbab1dbSIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode,
235143df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
23523da691bfSIlya Dryomov 				       0, 0);
23533da691bfSIlya Dryomov }
23543da691bfSIlya Dryomov 
rbd_obj_init_zeroout(struct rbd_obj_request * obj_req)2355ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
23563da691bfSIlya Dryomov {
23573da691bfSIlya Dryomov 	int ret;
23583da691bfSIlya Dryomov 
235986bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
236086bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
236186bd7998SIlya Dryomov 	if (ret)
236286bd7998SIlya Dryomov 		return ret;
236386bd7998SIlya Dryomov 
23640ad5d953SIlya Dryomov 	if (!obj_req->num_img_extents) {
236522e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23660ad5d953SIlya Dryomov 		if (rbd_obj_is_entire(obj_req))
23670ad5d953SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23683da691bfSIlya Dryomov 	}
23693da691bfSIlya Dryomov 
237085b5e6d1SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_START;
2371980917fcSIlya Dryomov 	return 0;
2372b454e36dSAlex Elder }
2373b454e36dSAlex Elder 
count_write_ops(struct rbd_obj_request * obj_req)2374a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req)
2375a086a1b8SIlya Dryomov {
23768b5bec5cSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
23778b5bec5cSIlya Dryomov 
23788b5bec5cSIlya Dryomov 	switch (img_req->op_type) {
2379a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
23808b5bec5cSIlya Dryomov 		if (!use_object_map(img_req->rbd_dev) ||
23818b5bec5cSIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2382a086a1b8SIlya Dryomov 			return 2; /* setallochint + write/writefull */
23838b5bec5cSIlya Dryomov 
23848b5bec5cSIlya Dryomov 		return 1; /* write/writefull */
2385a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2386a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2387a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2388a086a1b8SIlya Dryomov 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2389a086a1b8SIlya Dryomov 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2390a086a1b8SIlya Dryomov 			return 2; /* create + truncate */
2391a086a1b8SIlya Dryomov 
2392a086a1b8SIlya Dryomov 		return 1; /* delete/truncate/zero */
2393a086a1b8SIlya Dryomov 	default:
2394a086a1b8SIlya Dryomov 		BUG();
2395a086a1b8SIlya Dryomov 	}
2396a086a1b8SIlya Dryomov }
2397a086a1b8SIlya Dryomov 
/*
 * Fill in the write-type ops of @osd_req by dispatching on the op type of
 * the image request that owns the object request stored in
 * @osd_req->r_priv.  @which is handed to the per-type helper unchanged;
 * rbd_obj_write_object() passes the index that follows the optional stat
 * op.  An op type other than write, discard or zeroout hits BUG().
 */
rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2398a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2399a086a1b8SIlya Dryomov 				    int which)
2400a086a1b8SIlya Dryomov {
2401a086a1b8SIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2402a086a1b8SIlya Dryomov 
2403a086a1b8SIlya Dryomov 	switch (obj_req->img_request->op_type) {
2404a086a1b8SIlya Dryomov 	case OBJ_OP_WRITE:
2405a086a1b8SIlya Dryomov 		__rbd_osd_setup_write_ops(osd_req, which);
2406a086a1b8SIlya Dryomov 		break;
2407a086a1b8SIlya Dryomov 	case OBJ_OP_DISCARD:
2408a086a1b8SIlya Dryomov 		__rbd_osd_setup_discard_ops(osd_req, which);
2409a086a1b8SIlya Dryomov 		break;
2410a086a1b8SIlya Dryomov 	case OBJ_OP_ZEROOUT:
2411a086a1b8SIlya Dryomov 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2412a086a1b8SIlya Dryomov 		break;
2413a086a1b8SIlya Dryomov 	default:
2414a086a1b8SIlya Dryomov 		BUG();
2415a086a1b8SIlya Dryomov 	}
2416a086a1b8SIlya Dryomov }
2417a086a1b8SIlya Dryomov 
2418b454e36dSAlex Elder /*
2419a086a1b8SIlya Dryomov  * Prune the list of object requests (adjust offset and/or length, drop
2420a086a1b8SIlya Dryomov  * redundant requests).  Prepare object request state machines and image
2421a086a1b8SIlya Dryomov  * request state machine for execution.
 *
 * Each object request is initialized by the rbd_obj_init_*() helper that
 * matches @img_req->op_type.  A negative return from the helper aborts the
 * walk and is returned as is; a positive return removes that object
 * request from @img_req.  Returns 0 once every object request has been
 * processed and @img_req->state has been set to RBD_IMG_START.
2422b454e36dSAlex Elder  */
__rbd_img_fill_request(struct rbd_img_request * img_req)24233da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
24243da691bfSIlya Dryomov {
24250c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
24263da691bfSIlya Dryomov 	int ret;
24273d7efd18SAlex Elder 
	/* _safe iteration: the current entry may be removed in the loop body */
24280c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
24299bb0248dSIlya Dryomov 		switch (img_req->op_type) {
24303da691bfSIlya Dryomov 		case OBJ_OP_READ:
2431ea9b743cSIlya Dryomov 			ret = rbd_obj_init_read(obj_req);
24323da691bfSIlya Dryomov 			break;
24333da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
2434ea9b743cSIlya Dryomov 			ret = rbd_obj_init_write(obj_req);
24353da691bfSIlya Dryomov 			break;
24363da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
2437ea9b743cSIlya Dryomov 			ret = rbd_obj_init_discard(obj_req);
24383da691bfSIlya Dryomov 			break;
24396484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
2440ea9b743cSIlya Dryomov 			ret = rbd_obj_init_zeroout(obj_req);
24416484cbe9SIlya Dryomov 			break;
24423da691bfSIlya Dryomov 		default:
244316809372SArnd Bergmann 			BUG();
24443da691bfSIlya Dryomov 		}
24450c93e1b7SIlya Dryomov 		if (ret < 0)
24463da691bfSIlya Dryomov 			return ret;
		/* positive return: drop this object request from the list */
24470c93e1b7SIlya Dryomov 		if (ret > 0) {
24480c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
24490c93e1b7SIlya Dryomov 			continue;
24500c93e1b7SIlya Dryomov 		}
2451b454e36dSAlex Elder 	}
2452b454e36dSAlex Elder 
24530192ce2eSIlya Dryomov 	img_req->state = RBD_IMG_START;
24543da691bfSIlya Dryomov 	return 0;
24553da691bfSIlya Dryomov }
24563da691bfSIlya Dryomov 
/*
 * Position in the data buffer handed to the rbd_img_fill_*() helpers:
 * a bio iterator for bio (list) data or a bio_vec iterator for bio_vec
 * array data.  The helpers cast a pointer to either iterator type to a
 * pointer to this union.
 */
24575a237819SIlya Dryomov union rbd_img_fill_iter {
24585a237819SIlya Dryomov 	struct ceph_bio_iter	bio_iter;
24595a237819SIlya Dryomov 	struct ceph_bvec_iter	bvec_iter;
24605a237819SIlya Dryomov };
24615a237819SIlya Dryomov 
/*
 * Parameters for rbd_img_fill_request():
 * @pos_type:   kind of data buffer; copied into the image request
 *              data_type on the nocopy path
 * @pos:        starting position in the caller data buffer
 * @iter:       working copy of *@pos, reset before each mapping pass
 * @set_pos_fn: per-extent callback used on the nocopy path
 * @count_fn:   per-extent callback used by the first (counting) pass of
 *              the copy path
 * @copy_fn:    per-extent callback used by the second (copying) pass of
 *              the copy path
 */
24625a237819SIlya Dryomov struct rbd_img_fill_ctx {
24635a237819SIlya Dryomov 	enum obj_request_type	pos_type;
24645a237819SIlya Dryomov 	union rbd_img_fill_iter	*pos;
24655a237819SIlya Dryomov 	union rbd_img_fill_iter	iter;
24665a237819SIlya Dryomov 	ceph_object_extent_fn_t	set_pos_fn;
2467afb97888SIlya Dryomov 	ceph_object_extent_fn_t	count_fn;
2468afb97888SIlya Dryomov 	ceph_object_extent_fn_t	copy_fn;
24695a237819SIlya Dryomov };
24705a237819SIlya Dryomov 
/*
 * Allocation callback handed to ceph_file_to_extents(); @arg is the image
 * request being filled.  Creates a new object request, adds it to the
 * image request and returns a pointer to its embedded object extent, or
 * NULL if the object request could not be created.
 */
alloc_object_extent(void * arg)24715a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
24725a237819SIlya Dryomov {
24735a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
24745a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
24755a237819SIlya Dryomov 
24765a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
24775a237819SIlya Dryomov 	if (!obj_req)
24785a237819SIlya Dryomov 		return NULL;
24795a237819SIlya Dryomov 
24805a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
24815a237819SIlya Dryomov 	return &obj_req->ex;
24825a237819SIlya Dryomov }
24835a237819SIlya Dryomov 
24845a237819SIlya Dryomov /*
2485afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2486afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2487afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2488afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2489afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
 *
 * Returns true when the stripe unit of @l differs from its object size.
 * The stripe count is not consulted.
24905a237819SIlya Dryomov  */
rbd_layout_is_fancy(struct ceph_file_layout * l)2491afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2492afb97888SIlya Dryomov {
2493afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2494afb97888SIlya Dryomov }
2495afb97888SIlya Dryomov 
/*
 * Fill @img_req without copying bio_vecs: set the image request data type
 * from @fctx->pos_type, map every image extent to object extents with
 * ceph_file_to_extents() (object requests are created through
 * alloc_object_extent() and @fctx->set_pos_fn is passed as the action
 * callback with the working iterator), then finish with
 * __rbd_img_fill_request().
 *
 * Returns the first non-zero value from ceph_file_to_extents(), otherwise
 * the result of __rbd_img_fill_request().  Object requests already added
 * to @img_req are not removed here on failure.
 * NOTE(review): rbd_img_fill_nodata() leaves ->set_pos_fn NULL, so
 * ceph_file_to_extents() presumably tolerates a NULL callback -- confirm.
 */
rbd_img_fill_request_nocopy(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct rbd_img_fill_ctx * fctx)2496afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
24975a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
24985a237819SIlya Dryomov 				       u32 num_img_extents,
24995a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
25005a237819SIlya Dryomov {
25015a237819SIlya Dryomov 	u32 i;
25025a237819SIlya Dryomov 	int ret;
25035a237819SIlya Dryomov 
25045a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
25055a237819SIlya Dryomov 
25065a237819SIlya Dryomov 	/*
25075a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
25085a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
25095a237819SIlya Dryomov 	 */
25105a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
25115a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
25125a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
25135a237819SIlya Dryomov 					   img_extents[i].fe_off,
25145a237819SIlya Dryomov 					   img_extents[i].fe_len,
25155a237819SIlya Dryomov 					   &img_req->object_extents,
25165a237819SIlya Dryomov 					   alloc_object_extent, img_req,
25175a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
25185a237819SIlya Dryomov 		if (ret)
25195a237819SIlya Dryomov 			return ret;
25205a237819SIlya Dryomov 	}
25215a237819SIlya Dryomov 
25225a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
25235a237819SIlya Dryomov }
25245a237819SIlya Dryomov 
2525afb97888SIlya Dryomov /*
2526afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2527afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2528afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2529afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2530afb97888SIlya Dryomov  * @fctx->pos data buffer.
2531afb97888SIlya Dryomov  *
2532afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2533afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2534afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2535afb97888SIlya Dryomov  *
2536afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
 *
 * With no data buffer (OBJ_REQUEST_NODATA) or a non-fancy layout the work
 * is delegated to rbd_img_fill_request_nocopy().  Otherwise each object
 * request gets its own bio_vec array, sized by a counting pass and filled
 * by a copying pass, and @img_req->data_type becomes
 * OBJ_REQUEST_OWN_BVECS.
 *
 * Returns 0 on success, -ENOMEM if a bio_vec array cannot be allocated,
 * or whatever non-zero value a mapping pass or __rbd_img_fill_request()
 * returns.
2537afb97888SIlya Dryomov  */
rbd_img_fill_request(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct rbd_img_fill_ctx * fctx)2538afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2539afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2540afb97888SIlya Dryomov 				u32 num_img_extents,
2541afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2542afb97888SIlya Dryomov {
2543afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2544afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2545afb97888SIlya Dryomov 	u32 i;
2546afb97888SIlya Dryomov 	int ret;
2547afb97888SIlya Dryomov 
	/* nothing to copy, or the layout allows pointing into the buffer */
2548afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2549afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2550afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2551afb97888SIlya Dryomov 						   num_img_extents, fctx);
2552afb97888SIlya Dryomov 
2553afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2554afb97888SIlya Dryomov 
2555afb97888SIlya Dryomov 	/*
2556afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2557afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2558afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2559afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2560afb97888SIlya Dryomov 	 * stripe unit boundaries.
2561afb97888SIlya Dryomov 	 */
2562afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2563afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2564afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2565afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2566afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2567afb97888SIlya Dryomov 					   &img_req->object_extents,
2568afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2569afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2570afb97888SIlya Dryomov 		if (ret)
2571afb97888SIlya Dryomov 			return ret;
2572afb97888SIlya Dryomov 	}
2573afb97888SIlya Dryomov 
	/*
	 * Allocate the private bio_vec array of every object request.
	 * Arrays allocated before a failure are not freed on this path.
	 * NOTE(review): presumably they are released when the object
	 * requests are torn down -- confirm.
	 */
2574afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2575afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2576afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2577afb97888SIlya Dryomov 					      GFP_NOIO);
2578afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2579afb97888SIlya Dryomov 			return -ENOMEM;
2580afb97888SIlya Dryomov 	}
2581afb97888SIlya Dryomov 
2582afb97888SIlya Dryomov 	/*
2583afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2584afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2585afb97888SIlya Dryomov 	 */
2586afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2587afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2588afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2589afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2590afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2591afb97888SIlya Dryomov 					   &img_req->object_extents,
2592afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2593afb97888SIlya Dryomov 		if (ret)
2594afb97888SIlya Dryomov 			return ret;
2595afb97888SIlya Dryomov 	}
2596afb97888SIlya Dryomov 
2597afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2598afb97888SIlya Dryomov }
2599afb97888SIlya Dryomov 
/*
 * Fill @img_req for a request that carries no data buffer: a single image
 * extent of @len bytes at @off, a zeroed placeholder iterator and
 * OBJ_REQUEST_NODATA as the position type, which makes
 * rbd_img_fill_request() take the nocopy path.  No callbacks are set in
 * the fill context.  Returns the result of rbd_img_fill_request().
 */
rbd_img_fill_nodata(struct rbd_img_request * img_req,u64 off,u64 len)26005a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
26015a237819SIlya Dryomov 			       u64 off, u64 len)
26025a237819SIlya Dryomov {
26035a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
2604a55e601bSArnd Bergmann 	union rbd_img_fill_iter dummy = {};
26055a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
26065a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
26075a237819SIlya Dryomov 		.pos = &dummy,
26085a237819SIlya Dryomov 	};
26095a237819SIlya Dryomov 
26105a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
26115a237819SIlya Dryomov }
26125a237819SIlya Dryomov 
/*
 * set_pos_fn callback for bio data; @arg is the shared struct
 * ceph_bio_iter.  Saves the current iterator position as the starting
 * bio position of the object request that embeds @ex, then moves the
 * shared iterator forward by @bytes.
 */
set_bio_pos(struct ceph_object_extent * ex,u32 bytes,void * arg)26135a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
26145a237819SIlya Dryomov {
26155a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
26165a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
26175a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
26185a237819SIlya Dryomov 
26195a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
26205a237819SIlya Dryomov 	obj_req->bio_pos = *it;
26215a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
26225a237819SIlya Dryomov }
26235a237819SIlya Dryomov 
/*
 * count_fn callback for bio data; @arg is the shared struct ceph_bio_iter.
 * Moves the iterator forward by @bytes and bumps ->bvec_count of the
 * object request that embeds @ex once for every step taken by
 * ceph_bio_iter_advance_step().
 * NOTE(review): a step is presumably one bio_vec-sized piece of the
 * range -- confirm against the macro definition.
 */
count_bio_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2624afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2625afb97888SIlya Dryomov {
2626afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2627afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2628afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2629afb97888SIlya Dryomov 
2630afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2631afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2632afb97888SIlya Dryomov 		obj_req->bvec_count++;
2633afb97888SIlya Dryomov 	}));
2634afb97888SIlya Dryomov 
2635afb97888SIlya Dryomov }
2636afb97888SIlya Dryomov 
/*
 * copy_fn callback for bio data; @arg is the shared struct ceph_bio_iter.
 * Moves the iterator forward by @bytes and, for every step, stores bv in
 * the next free slot (->bvec_idx) of the private bio_vec array of the
 * object request that embeds @ex and adds bv.bv_len to the size recorded
 * in ->bvec_pos.iter.  The array is the one rbd_img_fill_request()
 * allocated with ->bvec_count entries.
 * NOTE(review): bv is not declared in this function; it is presumably
 * supplied by ceph_bio_iter_advance_step() -- confirm.
 */
copy_bio_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2637afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2638afb97888SIlya Dryomov {
2639afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2640afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2641afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2642afb97888SIlya Dryomov 
2643afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2644afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2645afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2646afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2647afb97888SIlya Dryomov 	}));
2648afb97888SIlya Dryomov }
2649afb97888SIlya Dryomov 
/*
 * Fill @img_req from bio data starting at @bio_pos: build a fill context
 * of type OBJ_REQUEST_BIO with the bio callbacks (set_bio_pos,
 * count_bio_bvecs, copy_bio_bvecs) and pass it, together with the image
 * extents, to rbd_img_fill_request(), whose result is returned.
 */
__rbd_img_fill_from_bio(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct ceph_bio_iter * bio_pos)26505a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
26515a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
26525a237819SIlya Dryomov 				   u32 num_img_extents,
26535a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
26545a237819SIlya Dryomov {
26555a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
26565a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
26575a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
26585a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2659afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2660afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
26615a237819SIlya Dryomov 	};
26625a237819SIlya Dryomov 
26635a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
26645a237819SIlya Dryomov 				    &fctx);
26655a237819SIlya Dryomov }
26665a237819SIlya Dryomov 
/*
 * Convenience wrapper around __rbd_img_fill_from_bio(): fill @img_req for
 * the single image extent of @len bytes at @off, taking data from @bio
 * starting at its current ->bi_iter position.
 */
rbd_img_fill_from_bio(struct rbd_img_request * img_req,u64 off,u64 len,struct bio * bio)26675a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
26685a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
26695a237819SIlya Dryomov {
26705a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
26715a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
26725a237819SIlya Dryomov 
26735a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
26745a237819SIlya Dryomov }
26755a237819SIlya Dryomov 
/*
 * set_pos_fn callback for bio_vec array data; @arg is the shared struct
 * ceph_bvec_iter.  Copies the current iterator into ->bvec_pos of the
 * object request that embeds @ex, passes that copy to
 * ceph_bvec_iter_shorten() with @bytes, and moves the shared iterator
 * forward by @bytes.
 */
set_bvec_pos(struct ceph_object_extent * ex,u32 bytes,void * arg)26765a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
26775a237819SIlya Dryomov {
26785a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
26795a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
26805a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
26815a237819SIlya Dryomov 
26825a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
26835a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
26845a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
26855a237819SIlya Dryomov }
26865a237819SIlya Dryomov 
/*
 * count_fn callback for bio_vec array data; @arg is the shared struct
 * ceph_bvec_iter.  Moves the iterator forward by @bytes and bumps
 * ->bvec_count of the object request that embeds @ex once for every step
 * taken by ceph_bvec_iter_advance_step().
 * NOTE(review): a step is presumably one bio_vec-sized piece of the
 * range -- confirm against the macro definition.
 */
count_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2687afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2688afb97888SIlya Dryomov {
2689afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2690afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2691afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2692afb97888SIlya Dryomov 
2693afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2694afb97888SIlya Dryomov 		obj_req->bvec_count++;
2695afb97888SIlya Dryomov 	}));
2696afb97888SIlya Dryomov }
2697afb97888SIlya Dryomov 
/*
 * copy_fn callback for bio_vec array data; @arg is the shared struct
 * ceph_bvec_iter.  Moves the iterator forward by @bytes and, for every
 * step, stores bv in the next free slot (->bvec_idx) of the private
 * bio_vec array of the object request that embeds @ex and adds bv.bv_len
 * to the size recorded in ->bvec_pos.iter.
 * NOTE(review): bv is not declared in this function; it is presumably
 * supplied by ceph_bvec_iter_advance_step() -- confirm.
 */
copy_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2698afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2699afb97888SIlya Dryomov {
2700afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2701afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2702afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2703afb97888SIlya Dryomov 
2704afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2705afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2706afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2707afb97888SIlya Dryomov 	}));
2708afb97888SIlya Dryomov }
2709afb97888SIlya Dryomov 
/*
 * Fill @img_req from bio_vec array data starting at @bvec_pos: build a
 * fill context of type OBJ_REQUEST_BVECS with the bio_vec callbacks
 * (set_bvec_pos, count_bvecs, copy_bvecs) and pass it, together with the
 * image extents, to rbd_img_fill_request(), whose result is returned.
 */
__rbd_img_fill_from_bvecs(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct ceph_bvec_iter * bvec_pos)27105a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
27115a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
27125a237819SIlya Dryomov 				     u32 num_img_extents,
27135a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
27145a237819SIlya Dryomov {
27155a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
27165a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
27175a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
27185a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2719afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2720afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
27215a237819SIlya Dryomov 	};
27225a237819SIlya Dryomov 
27235a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
27245a237819SIlya Dryomov 				    &fctx);
27255a237819SIlya Dryomov }
27265a237819SIlya Dryomov 
/*
 * Fill @img_req from the bio_vec array @bvecs.  The iterator handed to
 * __rbd_img_fill_from_bvecs() has only ->bvecs and ->iter.bi_size set
 * (the remaining fields are zero-initialized); bi_size is the total byte
 * count of the @num_img_extents image extents as computed by
 * ceph_file_extents_bytes().
 */
rbd_img_fill_from_bvecs(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct bio_vec * bvecs)27275a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
27285a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
27295a237819SIlya Dryomov 				   u32 num_img_extents,
27305a237819SIlya Dryomov 				   struct bio_vec *bvecs)
27315a237819SIlya Dryomov {
27325a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
27335a237819SIlya Dryomov 		.bvecs = bvecs,
27345a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
27355a237819SIlya Dryomov 							     num_img_extents) },
27365a237819SIlya Dryomov 	};
27375a237819SIlya Dryomov 
27385a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
27395a237819SIlya Dryomov 					 &it);
27405a237819SIlya Dryomov }
27415a237819SIlya Dryomov 
/*
 * Work function installed by rbd_img_schedule(): recover the image
 * request that embeds @work and call rbd_img_handle_request() with the
 * result saved in ->work_result.
 */
rbd_img_handle_request_work(struct work_struct * work)27420192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work)
2743bf0d5f50SAlex Elder {
27440192ce2eSIlya Dryomov 	struct rbd_img_request *img_req =
27450192ce2eSIlya Dryomov 	    container_of(work, struct rbd_img_request, work);
2746bf0d5f50SAlex Elder 
27470192ce2eSIlya Dryomov 	rbd_img_handle_request(img_req, img_req->work_result);
27480192ce2eSIlya Dryomov }
2749bf0d5f50SAlex Elder 
/*
 * Defer handling of @img_req to the rbd_wq workqueue.  @result is stored
 * in ->work_result before the work is queued and is later passed to
 * rbd_img_handle_request() by rbd_img_handle_request_work().
 */
rbd_img_schedule(struct rbd_img_request * img_req,int result)27500192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
27510192ce2eSIlya Dryomov {
27520192ce2eSIlya Dryomov 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
27530192ce2eSIlya Dryomov 	img_req->work_result = result;
27540192ce2eSIlya Dryomov 	queue_work(rbd_wq, &img_req->work);
2755bf0d5f50SAlex Elder }
2756bf0d5f50SAlex Elder 
/*
 * Ask rbd_object_map_may_exist() about the object targeted by @obj_req.
 * If it reports that the object may exist, set RBD_OBJ_FLAG_MAY_EXIST on
 * @obj_req and return true; otherwise log that the object is assumed not
 * to exist and return false without touching the flags.
 */
rbd_obj_may_exist(struct rbd_obj_request * obj_req)275722e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
275822e8bd51SIlya Dryomov {
275922e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
276022e8bd51SIlya Dryomov 
276122e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
276222e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
276322e8bd51SIlya Dryomov 		return true;
276422e8bd51SIlya Dryomov 	}
276522e8bd51SIlya Dryomov 
276622e8bd51SIlya Dryomov 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
276722e8bd51SIlya Dryomov 	     obj_req->ex.oe_objno);
276822e8bd51SIlya Dryomov 	return false;
276922e8bd51SIlya Dryomov }
277022e8bd51SIlya Dryomov 
/*
 * Build and submit a single-op OSD read (CEPH_OSD_OP_READ) for the object
 * extent of @obj_req (->ex.oe_off, ->ex.oe_len).
 *
 * Returns 0 once the request has been submitted, or the error from
 * __rbd_obj_add_osd_request() or ceph_osdc_alloc_messages().
 * NOTE(review): when ceph_osdc_alloc_messages() fails the OSD request is
 * not released here; presumably it is cleaned up together with @obj_req
 * -- confirm.
 */
rbd_obj_read_object(struct rbd_obj_request * obj_req)277185b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
277285b5e6d1SIlya Dryomov {
2773a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
2774a086a1b8SIlya Dryomov 	int ret;
2775a086a1b8SIlya Dryomov 
2776a086a1b8SIlya Dryomov 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2777a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
2778a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
2779a086a1b8SIlya Dryomov 
	/* op 0 is the only op of this request */
2780a086a1b8SIlya Dryomov 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2781a086a1b8SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2782a086a1b8SIlya Dryomov 	rbd_osd_setup_data(osd_req, 0);
2783a086a1b8SIlya Dryomov 	rbd_osd_format_read(osd_req);
2784a086a1b8SIlya Dryomov 
2785a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2786a086a1b8SIlya Dryomov 	if (ret)
2787a086a1b8SIlya Dryomov 		return ret;
2788a086a1b8SIlya Dryomov 
2789a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
279085b5e6d1SIlya Dryomov 	return 0;
2791bf0d5f50SAlex Elder }
2792bf0d5f50SAlex Elder 
/*
 * Read the image extents of @obj_req (->img_extents) from the parent
 * image by issuing a child image request of type OBJ_OP_READ against
 * ->rbd_dev->parent.  The child is flagged IMG_REQ_CHILD and points back
 * to @obj_req through ->obj_request.
 *
 * If the owning image request is not a write, the child is filled from
 * the data position of @obj_req itself (->bio_pos or ->bvec_pos, chosen
 * by the image request ->data_type); for a write it is filled from
 * ->copyup_bvecs.
 *
 * Returns 0 after the child request has been queued, -ENOMEM if it
 * cannot be allocated, or the error from filling it, in which case the
 * child request is destroyed.
 */
rbd_obj_read_from_parent(struct rbd_obj_request * obj_req)279386bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
27943da691bfSIlya Dryomov {
27953da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
2796a52cc685SIlya Dryomov 	struct rbd_device *parent = img_req->rbd_dev->parent;
27973da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
27983da691bfSIlya Dryomov 	int ret;
27993da691bfSIlya Dryomov 
280059e542c8SIlya Dryomov 	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
28013da691bfSIlya Dryomov 	if (!child_img_req)
28023da691bfSIlya Dryomov 		return -ENOMEM;
28033da691bfSIlya Dryomov 
280459e542c8SIlya Dryomov 	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2805e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2806e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2807e93aca0aSIlya Dryomov 
	/* capture the parent header while holding its header_rwsem for read */
2808a52cc685SIlya Dryomov 	down_read(&parent->header_rwsem);
2809a52cc685SIlya Dryomov 	rbd_img_capture_header(child_img_req);
2810a52cc685SIlya Dryomov 	up_read(&parent->header_rwsem);
2811a52cc685SIlya Dryomov 
281221ed05a8SIlya Dryomov 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
281321ed05a8SIlya Dryomov 	     obj_req);
281421ed05a8SIlya Dryomov 
28153da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2816ecc633caSIlya Dryomov 		switch (img_req->data_type) {
28173da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
28185a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
28195a237819SIlya Dryomov 						      obj_req->img_extents,
28205a237819SIlya Dryomov 						      obj_req->num_img_extents,
28213da691bfSIlya Dryomov 						      &obj_req->bio_pos);
28223da691bfSIlya Dryomov 			break;
28233da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2824afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
28255a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
28265a237819SIlya Dryomov 						      obj_req->img_extents,
28275a237819SIlya Dryomov 						      obj_req->num_img_extents,
28283da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
28293da691bfSIlya Dryomov 			break;
28303da691bfSIlya Dryomov 		default:
2831d342a15bSArnd Bergmann 			BUG();
28323da691bfSIlya Dryomov 		}
28333da691bfSIlya Dryomov 	} else {
28345a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
28355a237819SIlya Dryomov 					      obj_req->img_extents,
28365a237819SIlya Dryomov 					      obj_req->num_img_extents,
28375a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
28383da691bfSIlya Dryomov 	}
28393da691bfSIlya Dryomov 	if (ret) {
2840679a97d2SHannes Reinecke 		rbd_img_request_destroy(child_img_req);
2841663ae2ccSIlya Dryomov 		return ret;
2842bf0d5f50SAlex Elder 	}
2843bf0d5f50SAlex Elder 
28440192ce2eSIlya Dryomov 	/* avoid parent chain recursion */
28450192ce2eSIlya Dryomov 	rbd_img_schedule(child_img_req, 0);
28463da691bfSIlya Dryomov 	return 0;
28473da691bfSIlya Dryomov }
28483da691bfSIlya Dryomov 
/*
 * Advance the read state machine of @obj_req by one step.  On entry
 * *@result is the outcome of the step that just finished (asserted to be
 * 0 in the initial state); on a true return it holds the final outcome.
 *
 * Returns false after an asynchronous step (object read or parent read)
 * has been issued, true when there is nothing left to do, either because
 * the read completed or because an error was stored in *@result.
 */
rbd_obj_advance_read(struct rbd_obj_request * obj_req,int * result)284985b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
28508b3e1a56SAlex Elder {
28513da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
28523da691bfSIlya Dryomov 	int ret;
28538b3e1a56SAlex Elder 
285422e8bd51SIlya Dryomov again:
2855a9b67e69SIlya Dryomov 	switch (obj_req->read_state) {
285685b5e6d1SIlya Dryomov 	case RBD_OBJ_READ_START:
285785b5e6d1SIlya Dryomov 		rbd_assert(!*result);
285885b5e6d1SIlya Dryomov 
		/*
		 * The object map says the object does not exist: skip the
		 * OSD read and re-enter the switch as if the read had
		 * returned -ENOENT.
		 */
285922e8bd51SIlya Dryomov 		if (!rbd_obj_may_exist(obj_req)) {
286022e8bd51SIlya Dryomov 			*result = -ENOENT;
286122e8bd51SIlya Dryomov 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
286222e8bd51SIlya Dryomov 			goto again;
286322e8bd51SIlya Dryomov 		}
286422e8bd51SIlya Dryomov 
286585b5e6d1SIlya Dryomov 		ret = rbd_obj_read_object(obj_req);
286685b5e6d1SIlya Dryomov 		if (ret) {
286785b5e6d1SIlya Dryomov 			*result = ret;
286885b5e6d1SIlya Dryomov 			return true;
286985b5e6d1SIlya Dryomov 		}
287085b5e6d1SIlya Dryomov 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
287185b5e6d1SIlya Dryomov 		return false;
2872a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_OBJECT:
		/* missing object and a non-zero parent overlap: try the parent */
2873a9b67e69SIlya Dryomov 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
287486bd7998SIlya Dryomov 			/* reverse map this object extent onto the parent */
287586bd7998SIlya Dryomov 			ret = rbd_obj_calc_img_extents(obj_req, false);
287686bd7998SIlya Dryomov 			if (ret) {
287754ab3b24SIlya Dryomov 				*result = ret;
287886bd7998SIlya Dryomov 				return true;
287986bd7998SIlya Dryomov 			}
288086bd7998SIlya Dryomov 			if (obj_req->num_img_extents) {
288186bd7998SIlya Dryomov 				ret = rbd_obj_read_from_parent(obj_req);
28823da691bfSIlya Dryomov 				if (ret) {
288354ab3b24SIlya Dryomov 					*result = ret;
28843da691bfSIlya Dryomov 					return true;
28853da691bfSIlya Dryomov 				}
2886a9b67e69SIlya Dryomov 				obj_req->read_state = RBD_OBJ_READ_PARENT;
28873da691bfSIlya Dryomov 				return false;
28883da691bfSIlya Dryomov 			}
			/* no image extents: fall through to the hole handling */
288986bd7998SIlya Dryomov 		}
289002c74fbaSAlex Elder 
289102c74fbaSAlex Elder 		/*
28923da691bfSIlya Dryomov 		 * -ENOENT means a hole in the image -- zero-fill the entire
28933da691bfSIlya Dryomov 		 * length of the request.  A short read also implies zero-fill
289454ab3b24SIlya Dryomov 		 * to the end of the request.
289502c74fbaSAlex Elder 		 */
289654ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
289754ab3b24SIlya Dryomov 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
289854ab3b24SIlya Dryomov 			*result = 0;
289954ab3b24SIlya Dryomov 		} else if (*result >= 0) {
			/* a non-negative result is used as the byte count read */
290054ab3b24SIlya Dryomov 			if (*result < obj_req->ex.oe_len)
290154ab3b24SIlya Dryomov 				rbd_obj_zero_range(obj_req, *result,
290254ab3b24SIlya Dryomov 						obj_req->ex.oe_len - *result);
290354ab3b24SIlya Dryomov 			else
290454ab3b24SIlya Dryomov 				rbd_assert(*result == obj_req->ex.oe_len);
290554ab3b24SIlya Dryomov 			*result = 0;
29063da691bfSIlya Dryomov 		}
		/* any other negative result is left in place for the caller */
29073da691bfSIlya Dryomov 		return true;
2908a9b67e69SIlya Dryomov 	case RBD_OBJ_READ_PARENT:
2909d435c9a7SIlya Dryomov 		/*
2910d435c9a7SIlya Dryomov 		 * The parent image is read only up to the overlap -- zero-fill
2911d435c9a7SIlya Dryomov 		 * from the overlap to the end of the request.
2912d435c9a7SIlya Dryomov 		 */
2913d435c9a7SIlya Dryomov 		if (!*result) {
2914d435c9a7SIlya Dryomov 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2915d435c9a7SIlya Dryomov 
2916d435c9a7SIlya Dryomov 			if (obj_overlap < obj_req->ex.oe_len)
2917d435c9a7SIlya Dryomov 				rbd_obj_zero_range(obj_req, obj_overlap,
2918d435c9a7SIlya Dryomov 					    obj_req->ex.oe_len - obj_overlap);
2919d435c9a7SIlya Dryomov 		}
2920a9b67e69SIlya Dryomov 		return true;
2921a9b67e69SIlya Dryomov 	default:
2922a9b67e69SIlya Dryomov 		BUG();
2923a9b67e69SIlya Dryomov 	}
29243da691bfSIlya Dryomov }
29253da691bfSIlya Dryomov 
/*
 * Decide whether the write-type request @obj_req can be skipped.  First
 * set RBD_OBJ_FLAG_MAY_EXIST if rbd_object_map_may_exist() says the
 * object may exist.  Returns true only when that flag is clear and
 * RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT is set, false otherwise.
 */
rbd_obj_write_is_noop(struct rbd_obj_request * obj_req)292622e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
292722e8bd51SIlya Dryomov {
292822e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
292922e8bd51SIlya Dryomov 
293022e8bd51SIlya Dryomov 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
293122e8bd51SIlya Dryomov 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
293222e8bd51SIlya Dryomov 
293322e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
293422e8bd51SIlya Dryomov 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
293522e8bd51SIlya Dryomov 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
29363da691bfSIlya Dryomov 		return true;
29373da691bfSIlya Dryomov 	}
29383da691bfSIlya Dryomov 
293922e8bd51SIlya Dryomov 	return false;
294022e8bd51SIlya Dryomov }
294122e8bd51SIlya Dryomov 
294222e8bd51SIlya Dryomov /*
 * Request an object map update for the object targeted by @obj_req via
 * rbd_object_map_update(), using CEPH_NOSNAP: the new state is
 * OBJECT_PENDING when RBD_OBJ_FLAG_DELETION is set and OBJECT_EXISTS
 * otherwise.  Nothing is sent (1 is returned) when the image header
 * features lack RBD_FEATURE_OBJECT_MAP.
 *
294322e8bd51SIlya Dryomov  * Return:
294422e8bd51SIlya Dryomov  *   0 - object map update sent
294522e8bd51SIlya Dryomov  *   1 - object map update isn't needed
294622e8bd51SIlya Dryomov  *  <0 - error
294722e8bd51SIlya Dryomov  */
rbd_obj_write_pre_object_map(struct rbd_obj_request * obj_req)294822e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
294922e8bd51SIlya Dryomov {
295022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
295122e8bd51SIlya Dryomov 	u8 new_state;
295222e8bd51SIlya Dryomov 
295322e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
295422e8bd51SIlya Dryomov 		return 1;
295522e8bd51SIlya Dryomov 
295622e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
295722e8bd51SIlya Dryomov 		new_state = OBJECT_PENDING;
295822e8bd51SIlya Dryomov 	else
295922e8bd51SIlya Dryomov 		new_state = OBJECT_EXISTS;
296022e8bd51SIlya Dryomov 
296122e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
296222e8bd51SIlya Dryomov }
296322e8bd51SIlya Dryomov 
/*
 * Build and submit the OSD request that carries out the write-type
 * operation of @obj_req.  The op count comes from count_write_ops(), plus
 * one stat op placed first when RBD_OBJ_FLAG_COPYUP_ENABLED is set.
 *
 * Returns 0 once the request has been submitted, or the error from
 * rbd_obj_add_osd_request(), rbd_osd_setup_stat() or
 * ceph_osdc_alloc_messages().
 * NOTE(review): on the error paths after the OSD request was added it is
 * not released here; presumably it is cleaned up together with @obj_req
 * -- confirm.
 */
rbd_obj_write_object(struct rbd_obj_request * obj_req)296485b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
296585b5e6d1SIlya Dryomov {
2966a086a1b8SIlya Dryomov 	struct ceph_osd_request *osd_req;
2967a086a1b8SIlya Dryomov 	int num_ops = count_write_ops(obj_req);
2968a086a1b8SIlya Dryomov 	int which = 0;
2969a086a1b8SIlya Dryomov 	int ret;
2970a086a1b8SIlya Dryomov 
2971a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2972a086a1b8SIlya Dryomov 		num_ops++; /* stat */
2973a086a1b8SIlya Dryomov 
2974a086a1b8SIlya Dryomov 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2975a086a1b8SIlya Dryomov 	if (IS_ERR(osd_req))
2976a086a1b8SIlya Dryomov 		return PTR_ERR(osd_req);
2977a086a1b8SIlya Dryomov 
	/* the stat op, when present, takes index 0 and shifts the rest */
2978a086a1b8SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2979a086a1b8SIlya Dryomov 		ret = rbd_osd_setup_stat(osd_req, which++);
2980a086a1b8SIlya Dryomov 		if (ret)
2981a086a1b8SIlya Dryomov 			return ret;
2982a086a1b8SIlya Dryomov 	}
2983a086a1b8SIlya Dryomov 
2984a086a1b8SIlya Dryomov 	rbd_osd_setup_write_ops(osd_req, which);
2985a086a1b8SIlya Dryomov 	rbd_osd_format_write(osd_req);
2986a086a1b8SIlya Dryomov 
2987a086a1b8SIlya Dryomov 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2988a086a1b8SIlya Dryomov 	if (ret)
2989a086a1b8SIlya Dryomov 		return ret;
2990a086a1b8SIlya Dryomov 
2991a086a1b8SIlya Dryomov 	rbd_osd_submit(osd_req);
299285b5e6d1SIlya Dryomov 	return 0;
299385b5e6d1SIlya Dryomov }
299485b5e6d1SIlya Dryomov 
29953da691bfSIlya Dryomov /*
29963da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
29973da691bfSIlya Dryomov  */
is_zero_bvecs(struct bio_vec * bvecs,u32 bytes)29983da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
29993da691bfSIlya Dryomov {
30003da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
30013da691bfSIlya Dryomov 		.bvecs = bvecs,
30023da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
30033da691bfSIlya Dryomov 	};
30043da691bfSIlya Dryomov 
30053da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
3006cf58b537SChristoph Hellwig 		if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
30073da691bfSIlya Dryomov 			return false;
30083da691bfSIlya Dryomov 	}));
30093da691bfSIlya Dryomov 	return true;
30103da691bfSIlya Dryomov }
30113da691bfSIlya Dryomov 
/*
 * Special value for the "bytes" argument of
 * rbd_obj_copyup_current_snapc(): no copyup op is added to the request,
 * only the modification (write) ops are sent.
 */
#define MODS_ONLY	U32_MAX
30133a482501SIlya Dryomov 
/*
 * Submit an OSD request consisting of a single copyup op with @bytes of
 * copyup data, created against rbd_empty_snapc rather than the image
 * request's snapshot context.
 *
 * @bytes must be non-zero and must not be MODS_ONLY.
 *
 * Return: 0 once the request has been submitted, an error code if
 * creating, setting up or allocating the request fails.
 */
static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
				      u32 bytes)
{
	struct ceph_osd_request *req;
	int err;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	/* exactly one op: the copyup */
	req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(req))
		return PTR_ERR(req);

	err = rbd_osd_setup_copyup(req, 0, bytes);
	if (err)
		return err;

	rbd_osd_format_write(req);

	err = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (!err)
		rbd_osd_submit(req);

	return err;
}
304089a59c1cSIlya Dryomov 
/*
 * Submit an OSD request with the write ops for @obj_req, optionally
 * preceded by a copyup op carrying @bytes of copyup data.  Passing
 * MODS_ONLY as @bytes leaves the copyup op out.
 *
 * Return: 0 once the request has been submitted, an error code if
 * creating, setting up or allocating the request fails.
 */
static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
					u32 bytes)
{
	const bool with_copyup = bytes != MODS_ONLY;
	struct ceph_osd_request *req;
	int nr_ops = count_write_ops(obj_req);
	int op_idx = 0;
	int err;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	/* one extra op slot for the copyup */
	if (with_copyup)
		nr_ops += 1;

	req = rbd_obj_add_osd_request(obj_req, nr_ops);
	if (IS_ERR(req))
		return PTR_ERR(req);

	if (with_copyup) {
		err = rbd_osd_setup_copyup(req, op_idx, bytes);
		if (err)
			return err;
		op_idx++;
	}

	/* the write ops follow the (optional) copyup op */
	rbd_osd_setup_write_ops(req, op_idx);
	rbd_osd_format_write(req);

	err = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (!err)
		rbd_osd_submit(req);

	return err;
}
30743da691bfSIlya Dryomov 
/*
 * Allocate obj_req->copyup_bvecs and back every entry with a freshly
 * allocated page so that together they cover @obj_overlap bytes.  Each
 * entry covers at most PAGE_SIZE bytes.
 *
 * Return: 0 on success, -ENOMEM if the array or a page can't be
 * allocated.  Nothing is freed here on failure.
 * NOTE(review): presumably the partially populated array is released
 * together with the object request -- confirm.
 */
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		struct bio_vec *slot = &obj_req->copyup_bvecs[i];
		struct page *page;
		unsigned int len;

		page = alloc_page(GFP_NOIO);
		if (!page)
			return -ENOMEM;

		/* a whole page, or whatever remains of the overlap */
		len = PAGE_SIZE;
		if (obj_overlap < PAGE_SIZE)
			len = obj_overlap;

		bvec_set_page(slot, page, len, 0);
		obj_overlap -= len;
	}

	/* the entries must add up to exactly the requested overlap */
	rbd_assert(!obj_overlap);
	return 0;
}
31017e07efb1SIlya Dryomov 
31020ad5d953SIlya Dryomov /*
31030ad5d953SIlya Dryomov  * The target object doesn't exist.  Read the data for the entire
31040ad5d953SIlya Dryomov  * target object up to the overlap point (if any) from the parent,
31050ad5d953SIlya Dryomov  * so we can use it for a copyup.
31060ad5d953SIlya Dryomov  */
rbd_obj_copyup_read_parent(struct rbd_obj_request * obj_req)3107793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
31083da691bfSIlya Dryomov {
31093da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
31103da691bfSIlya Dryomov 	int ret;
31113da691bfSIlya Dryomov 
311286bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
311386bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
311486bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
311586bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
31163da691bfSIlya Dryomov 		/*
31173da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
31183a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
31193a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
31203a482501SIlya Dryomov 		 * anymore.
31213da691bfSIlya Dryomov 		 */
3122793333a3SIlya Dryomov 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
31233da691bfSIlya Dryomov 	}
31243da691bfSIlya Dryomov 
312586bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
31263da691bfSIlya Dryomov 	if (ret)
31273da691bfSIlya Dryomov 		return ret;
31283da691bfSIlya Dryomov 
312986bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
31303da691bfSIlya Dryomov }
31313da691bfSIlya Dryomov 
rbd_obj_copyup_object_maps(struct rbd_obj_request * obj_req)313222e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
31333da691bfSIlya Dryomov {
313422e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
313522e8bd51SIlya Dryomov 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
313622e8bd51SIlya Dryomov 	u8 new_state;
313722e8bd51SIlya Dryomov 	u32 i;
31383da691bfSIlya Dryomov 	int ret;
31393da691bfSIlya Dryomov 
314022e8bd51SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31413da691bfSIlya Dryomov 
314222e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
314322e8bd51SIlya Dryomov 		return;
314489a59c1cSIlya Dryomov 
314522e8bd51SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
314622e8bd51SIlya Dryomov 		return;
31473da691bfSIlya Dryomov 
314822e8bd51SIlya Dryomov 	for (i = 0; i < snapc->num_snaps; i++) {
314922e8bd51SIlya Dryomov 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
315022e8bd51SIlya Dryomov 		    i + 1 < snapc->num_snaps)
315122e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS_CLEAN;
315222e8bd51SIlya Dryomov 		else
315322e8bd51SIlya Dryomov 			new_state = OBJECT_EXISTS;
31543da691bfSIlya Dryomov 
315522e8bd51SIlya Dryomov 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
315622e8bd51SIlya Dryomov 					    new_state, NULL);
315722e8bd51SIlya Dryomov 		if (ret < 0) {
315822e8bd51SIlya Dryomov 			obj_req->pending.result = ret;
315902c74fbaSAlex Elder 			return;
316002c74fbaSAlex Elder 		}
316102c74fbaSAlex Elder 
316222e8bd51SIlya Dryomov 		rbd_assert(!ret);
316322e8bd51SIlya Dryomov 		obj_req->pending.num_pending++;
3164a9e8ba2cSAlex Elder 	}
31658b3e1a56SAlex Elder }
31668b3e1a56SAlex Elder 
rbd_obj_copyup_write_object(struct rbd_obj_request * obj_req)3167793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
31688b3e1a56SAlex Elder {
3169793333a3SIlya Dryomov 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3170793333a3SIlya Dryomov 	int ret;
31718b3e1a56SAlex Elder 
3172793333a3SIlya Dryomov 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31738b3e1a56SAlex Elder 
3174793333a3SIlya Dryomov 	/*
3175793333a3SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
3176793333a3SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
3177793333a3SIlya Dryomov 	 * existing.
3178793333a3SIlya Dryomov 	 */
3179793333a3SIlya Dryomov 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3180793333a3SIlya Dryomov 		bytes = 0;
3181793333a3SIlya Dryomov 
3182793333a3SIlya Dryomov 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3183793333a3SIlya Dryomov 		/*
3184793333a3SIlya Dryomov 		 * Send a copyup request with an empty snapshot context to
3185793333a3SIlya Dryomov 		 * deep-copyup the object through all existing snapshots.
3186793333a3SIlya Dryomov 		 * A second request with the current snapshot context will be
3187793333a3SIlya Dryomov 		 * sent for the actual modification.
3188793333a3SIlya Dryomov 		 */
3189793333a3SIlya Dryomov 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3190793333a3SIlya Dryomov 		if (ret) {
3191793333a3SIlya Dryomov 			obj_req->pending.result = ret;
3192793333a3SIlya Dryomov 			return;
31937114edacSIlya Dryomov 		}
31948b3e1a56SAlex Elder 
3195793333a3SIlya Dryomov 		obj_req->pending.num_pending++;
3196793333a3SIlya Dryomov 		bytes = MODS_ONLY;
31973da691bfSIlya Dryomov 	}
31988b3e1a56SAlex Elder 
3199793333a3SIlya Dryomov 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3200793333a3SIlya Dryomov 	if (ret) {
3201793333a3SIlya Dryomov 		obj_req->pending.result = ret;
3202793333a3SIlya Dryomov 		return;
3203793333a3SIlya Dryomov 	}
3204793333a3SIlya Dryomov 
3205793333a3SIlya Dryomov 	obj_req->pending.num_pending++;
3206793333a3SIlya Dryomov }
3207793333a3SIlya Dryomov 
rbd_obj_advance_copyup(struct rbd_obj_request * obj_req,int * result)3208793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
32093da691bfSIlya Dryomov {
321022e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3211793333a3SIlya Dryomov 	int ret;
32127114edacSIlya Dryomov 
32137114edacSIlya Dryomov again:
3214793333a3SIlya Dryomov 	switch (obj_req->copyup_state) {
3215793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_START:
3216793333a3SIlya Dryomov 		rbd_assert(!*result);
32173da691bfSIlya Dryomov 
3218793333a3SIlya Dryomov 		ret = rbd_obj_copyup_read_parent(obj_req);
3219793333a3SIlya Dryomov 		if (ret) {
3220793333a3SIlya Dryomov 			*result = ret;
3221793333a3SIlya Dryomov 			return true;
3222793333a3SIlya Dryomov 		}
3223793333a3SIlya Dryomov 		if (obj_req->num_img_extents)
3224793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3225793333a3SIlya Dryomov 		else
3226793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3227793333a3SIlya Dryomov 		return false;
3228793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_READ_PARENT:
3229793333a3SIlya Dryomov 		if (*result)
3230793333a3SIlya Dryomov 			return true;
3231793333a3SIlya Dryomov 
3232793333a3SIlya Dryomov 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3233793333a3SIlya Dryomov 				  rbd_obj_img_extents_bytes(obj_req))) {
3234793333a3SIlya Dryomov 			dout("%s %p detected zeros\n", __func__, obj_req);
3235793333a3SIlya Dryomov 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
32367114edacSIlya Dryomov 		}
32377114edacSIlya Dryomov 
323822e8bd51SIlya Dryomov 		rbd_obj_copyup_object_maps(obj_req);
323922e8bd51SIlya Dryomov 		if (!obj_req->pending.num_pending) {
324022e8bd51SIlya Dryomov 			*result = obj_req->pending.result;
324122e8bd51SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
32427114edacSIlya Dryomov 			goto again;
32437114edacSIlya Dryomov 		}
324422e8bd51SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
324522e8bd51SIlya Dryomov 		return false;
324622e8bd51SIlya Dryomov 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
324722e8bd51SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
324822e8bd51SIlya Dryomov 			return false;
3249df561f66SGustavo A. R. Silva 		fallthrough;
325022e8bd51SIlya Dryomov 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
325122e8bd51SIlya Dryomov 		if (*result) {
325222e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "snap object map update failed: %d",
325322e8bd51SIlya Dryomov 				 *result);
325422e8bd51SIlya Dryomov 			return true;
325522e8bd51SIlya Dryomov 		}
325622e8bd51SIlya Dryomov 
3257793333a3SIlya Dryomov 		rbd_obj_copyup_write_object(obj_req);
3258793333a3SIlya Dryomov 		if (!obj_req->pending.num_pending) {
3259793333a3SIlya Dryomov 			*result = obj_req->pending.result;
3260793333a3SIlya Dryomov 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3261793333a3SIlya Dryomov 			goto again;
3262793333a3SIlya Dryomov 		}
3263793333a3SIlya Dryomov 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3264793333a3SIlya Dryomov 		return false;
3265793333a3SIlya Dryomov 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3266793333a3SIlya Dryomov 		if (!pending_result_dec(&obj_req->pending, result))
3267793333a3SIlya Dryomov 			return false;
3268df561f66SGustavo A. R. Silva 		fallthrough;
3269793333a3SIlya Dryomov 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3270793333a3SIlya Dryomov 		return true;
3271793333a3SIlya Dryomov 	default:
3272793333a3SIlya Dryomov 		BUG();
3273793333a3SIlya Dryomov 	}
3274793333a3SIlya Dryomov }
3275793333a3SIlya Dryomov 
327622e8bd51SIlya Dryomov /*
327722e8bd51SIlya Dryomov  * Return:
327822e8bd51SIlya Dryomov  *   0 - object map update sent
327922e8bd51SIlya Dryomov  *   1 - object map update isn't needed
328022e8bd51SIlya Dryomov  *  <0 - error
328122e8bd51SIlya Dryomov  */
rbd_obj_write_post_object_map(struct rbd_obj_request * obj_req)328222e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
328322e8bd51SIlya Dryomov {
328422e8bd51SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
328522e8bd51SIlya Dryomov 	u8 current_state = OBJECT_PENDING;
328622e8bd51SIlya Dryomov 
328722e8bd51SIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
328822e8bd51SIlya Dryomov 		return 1;
328922e8bd51SIlya Dryomov 
329022e8bd51SIlya Dryomov 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
329122e8bd51SIlya Dryomov 		return 1;
329222e8bd51SIlya Dryomov 
329322e8bd51SIlya Dryomov 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
329422e8bd51SIlya Dryomov 				     &current_state);
329522e8bd51SIlya Dryomov }
329622e8bd51SIlya Dryomov 
rbd_obj_advance_write(struct rbd_obj_request * obj_req,int * result)329785b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3298b8d70035SAlex Elder {
3299793333a3SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3300b8d70035SAlex Elder 	int ret;
3301b8d70035SAlex Elder 
3302793333a3SIlya Dryomov again:
3303cf81b60eSAlex Elder 	switch (obj_req->write_state) {
330485b5e6d1SIlya Dryomov 	case RBD_OBJ_WRITE_START:
330585b5e6d1SIlya Dryomov 		rbd_assert(!*result);
330685b5e6d1SIlya Dryomov 
330709fe05c5SIlya Dryomov 		rbd_obj_set_copyup_enabled(obj_req);
330822e8bd51SIlya Dryomov 		if (rbd_obj_write_is_noop(obj_req))
330922e8bd51SIlya Dryomov 			return true;
331022e8bd51SIlya Dryomov 
331122e8bd51SIlya Dryomov 		ret = rbd_obj_write_pre_object_map(obj_req);
331222e8bd51SIlya Dryomov 		if (ret < 0) {
331322e8bd51SIlya Dryomov 			*result = ret;
331422e8bd51SIlya Dryomov 			return true;
331522e8bd51SIlya Dryomov 		}
331622e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
331722e8bd51SIlya Dryomov 		if (ret > 0)
331822e8bd51SIlya Dryomov 			goto again;
331922e8bd51SIlya Dryomov 		return false;
332022e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
332122e8bd51SIlya Dryomov 		if (*result) {
332222e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "pre object map update failed: %d",
332322e8bd51SIlya Dryomov 				 *result);
332422e8bd51SIlya Dryomov 			return true;
332522e8bd51SIlya Dryomov 		}
332685b5e6d1SIlya Dryomov 		ret = rbd_obj_write_object(obj_req);
332785b5e6d1SIlya Dryomov 		if (ret) {
332885b5e6d1SIlya Dryomov 			*result = ret;
332985b5e6d1SIlya Dryomov 			return true;
333085b5e6d1SIlya Dryomov 		}
333185b5e6d1SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
333285b5e6d1SIlya Dryomov 		return false;
33330ad5d953SIlya Dryomov 	case RBD_OBJ_WRITE_OBJECT:
333454ab3b24SIlya Dryomov 		if (*result == -ENOENT) {
33350ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3336793333a3SIlya Dryomov 				*result = 0;
3337793333a3SIlya Dryomov 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3338793333a3SIlya Dryomov 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3339793333a3SIlya Dryomov 				goto again;
3340b8d70035SAlex Elder 			}
33410ad5d953SIlya Dryomov 			/*
33420ad5d953SIlya Dryomov 			 * On a non-existent object:
33430ad5d953SIlya Dryomov 			 *   delete - -ENOENT, truncate/zero - 0
33440ad5d953SIlya Dryomov 			 */
33450ad5d953SIlya Dryomov 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
33460ad5d953SIlya Dryomov 				*result = 0;
33470ad5d953SIlya Dryomov 		}
3348793333a3SIlya Dryomov 		if (*result)
3349793333a3SIlya Dryomov 			return true;
3350793333a3SIlya Dryomov 
3351793333a3SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3352793333a3SIlya Dryomov 		goto again;
3353793333a3SIlya Dryomov 	case __RBD_OBJ_WRITE_COPYUP:
3354793333a3SIlya Dryomov 		if (!rbd_obj_advance_copyup(obj_req, result))
3355793333a3SIlya Dryomov 			return false;
3356df561f66SGustavo A. R. Silva 		fallthrough;
3357793333a3SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP:
335822e8bd51SIlya Dryomov 		if (*result) {
3359793333a3SIlya Dryomov 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3360b8d70035SAlex Elder 			return true;
336122e8bd51SIlya Dryomov 		}
336222e8bd51SIlya Dryomov 		ret = rbd_obj_write_post_object_map(obj_req);
336322e8bd51SIlya Dryomov 		if (ret < 0) {
336422e8bd51SIlya Dryomov 			*result = ret;
336522e8bd51SIlya Dryomov 			return true;
336622e8bd51SIlya Dryomov 		}
336722e8bd51SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
336822e8bd51SIlya Dryomov 		if (ret > 0)
336922e8bd51SIlya Dryomov 			goto again;
337022e8bd51SIlya Dryomov 		return false;
337122e8bd51SIlya Dryomov 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
337222e8bd51SIlya Dryomov 		if (*result)
337322e8bd51SIlya Dryomov 			rbd_warn(rbd_dev, "post object map update failed: %d",
337422e8bd51SIlya Dryomov 				 *result);
337522e8bd51SIlya Dryomov 		return true;
3376b8d70035SAlex Elder 	default:
3377b8d70035SAlex Elder 		BUG();
3378b8d70035SAlex Elder 	}
3379b8d70035SAlex Elder }
3380b8d70035SAlex Elder 
3381b8d70035SAlex Elder /*
33820ad5d953SIlya Dryomov  * Return true if @obj_req is completed.
3383b8d70035SAlex Elder  */
__rbd_obj_handle_request(struct rbd_obj_request * obj_req,int * result)338454ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
338554ab3b24SIlya Dryomov 				     int *result)
3386b8d70035SAlex Elder {
33870ad5d953SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
33880192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
33890ad5d953SIlya Dryomov 	bool done;
33900ad5d953SIlya Dryomov 
339185b5e6d1SIlya Dryomov 	mutex_lock(&obj_req->state_mutex);
33920ad5d953SIlya Dryomov 	if (!rbd_img_is_write(img_req))
339385b5e6d1SIlya Dryomov 		done = rbd_obj_advance_read(obj_req, result);
33940ad5d953SIlya Dryomov 	else
339585b5e6d1SIlya Dryomov 		done = rbd_obj_advance_write(obj_req, result);
339685b5e6d1SIlya Dryomov 	mutex_unlock(&obj_req->state_mutex);
33970ad5d953SIlya Dryomov 
33980192ce2eSIlya Dryomov 	if (done && *result) {
33990192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
34000192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
34010192ce2eSIlya Dryomov 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
34020192ce2eSIlya Dryomov 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
34030192ce2eSIlya Dryomov 	}
34040ad5d953SIlya Dryomov 	return done;
34059969ebc5SAlex Elder }
34069969ebc5SAlex Elder 
34070192ce2eSIlya Dryomov /*
34080192ce2eSIlya Dryomov  * This is open-coded in rbd_img_handle_request() to avoid parent chain
34090192ce2eSIlya Dryomov  * recursion.
34100192ce2eSIlya Dryomov  */
rbd_obj_handle_request(struct rbd_obj_request * obj_req,int result)341154ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
34129969ebc5SAlex Elder {
34130192ce2eSIlya Dryomov 	if (__rbd_obj_handle_request(obj_req, &result))
34140192ce2eSIlya Dryomov 		rbd_img_handle_request(obj_req->img_request, result);
34157114edacSIlya Dryomov }
34167114edacSIlya Dryomov 
need_exclusive_lock(struct rbd_img_request * img_req)3417e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req)
3418e1fddc8fSIlya Dryomov {
3419e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3420e1fddc8fSIlya Dryomov 
3421e1fddc8fSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3422e1fddc8fSIlya Dryomov 		return false;
3423e1fddc8fSIlya Dryomov 
34243fe69921SIlya Dryomov 	if (rbd_is_ro(rbd_dev))
3425e1fddc8fSIlya Dryomov 		return false;
3426e1fddc8fSIlya Dryomov 
3427e1fddc8fSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
342822e8bd51SIlya Dryomov 	if (rbd_dev->opts->lock_on_read ||
342922e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3430e1fddc8fSIlya Dryomov 		return true;
3431e1fddc8fSIlya Dryomov 
3432e1fddc8fSIlya Dryomov 	return rbd_img_is_write(img_req);
3433e1fddc8fSIlya Dryomov }
3434e1fddc8fSIlya Dryomov 
rbd_lock_add_request(struct rbd_img_request * img_req)3435637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3436e1fddc8fSIlya Dryomov {
3437e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3438637cd060SIlya Dryomov 	bool locked;
3439e1fddc8fSIlya Dryomov 
3440e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3441637cd060SIlya Dryomov 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3442e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3443e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&img_req->lock_item));
3444637cd060SIlya Dryomov 	if (!locked)
3445637cd060SIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3446637cd060SIlya Dryomov 	else
3447e1fddc8fSIlya Dryomov 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3448e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3449637cd060SIlya Dryomov 	return locked;
3450e1fddc8fSIlya Dryomov }
3451e1fddc8fSIlya Dryomov 
rbd_lock_del_request(struct rbd_img_request * img_req)3452e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req)
3453e1fddc8fSIlya Dryomov {
3454e1fddc8fSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3455*801474eaSIlya Dryomov 	bool need_wakeup = false;
3456e1fddc8fSIlya Dryomov 
3457e1fddc8fSIlya Dryomov 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3458e1fddc8fSIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
3459*801474eaSIlya Dryomov 	if (!list_empty(&img_req->lock_item)) {
3460e1fddc8fSIlya Dryomov 		list_del_init(&img_req->lock_item);
3461e1fddc8fSIlya Dryomov 		need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3462e1fddc8fSIlya Dryomov 			       list_empty(&rbd_dev->running_list));
3463*801474eaSIlya Dryomov 	}
3464e1fddc8fSIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
3465e1fddc8fSIlya Dryomov 	if (need_wakeup)
3466e1fddc8fSIlya Dryomov 		complete(&rbd_dev->releasing_wait);
3467e1fddc8fSIlya Dryomov }
3468e1fddc8fSIlya Dryomov 
rbd_img_exclusive_lock(struct rbd_img_request * img_req)3469637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3470637cd060SIlya Dryomov {
3471637cd060SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3472637cd060SIlya Dryomov 
3473637cd060SIlya Dryomov 	if (!need_exclusive_lock(img_req))
3474637cd060SIlya Dryomov 		return 1;
3475637cd060SIlya Dryomov 
3476637cd060SIlya Dryomov 	if (rbd_lock_add_request(img_req))
3477637cd060SIlya Dryomov 		return 1;
3478637cd060SIlya Dryomov 
3479637cd060SIlya Dryomov 	if (rbd_dev->opts->exclusive) {
3480637cd060SIlya Dryomov 		WARN_ON(1); /* lock got released? */
3481637cd060SIlya Dryomov 		return -EROFS;
3482637cd060SIlya Dryomov 	}
3483637cd060SIlya Dryomov 
3484637cd060SIlya Dryomov 	/*
3485637cd060SIlya Dryomov 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3486637cd060SIlya Dryomov 	 * and cancel_delayed_work() in wake_lock_waiters().
3487637cd060SIlya Dryomov 	 */
3488637cd060SIlya Dryomov 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3489637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3490637cd060SIlya Dryomov 	return 0;
3491637cd060SIlya Dryomov }
3492637cd060SIlya Dryomov 
rbd_img_object_requests(struct rbd_img_request * img_req)34930192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req)
34940192ce2eSIlya Dryomov {
3495870611e4SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
34960192ce2eSIlya Dryomov 	struct rbd_obj_request *obj_req;
34970192ce2eSIlya Dryomov 
34980192ce2eSIlya Dryomov 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3499870611e4SIlya Dryomov 	rbd_assert(!need_exclusive_lock(img_req) ||
3500870611e4SIlya Dryomov 		   __rbd_is_lock_owner(rbd_dev));
3501870611e4SIlya Dryomov 
3502870611e4SIlya Dryomov 	if (rbd_img_is_write(img_req)) {
3503870611e4SIlya Dryomov 		rbd_assert(!img_req->snapc);
3504870611e4SIlya Dryomov 		down_read(&rbd_dev->header_rwsem);
3505870611e4SIlya Dryomov 		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
3506870611e4SIlya Dryomov 		up_read(&rbd_dev->header_rwsem);
3507870611e4SIlya Dryomov 	}
35080192ce2eSIlya Dryomov 
35090192ce2eSIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
35100192ce2eSIlya Dryomov 		int result = 0;
35110192ce2eSIlya Dryomov 
35120192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
35130192ce2eSIlya Dryomov 			if (result) {
35140192ce2eSIlya Dryomov 				img_req->pending.result = result;
35150192ce2eSIlya Dryomov 				return;
35160192ce2eSIlya Dryomov 			}
35170192ce2eSIlya Dryomov 		} else {
35180192ce2eSIlya Dryomov 			img_req->pending.num_pending++;
35190192ce2eSIlya Dryomov 		}
35200192ce2eSIlya Dryomov 	}
35210192ce2eSIlya Dryomov }
35220192ce2eSIlya Dryomov 
rbd_img_advance(struct rbd_img_request * img_req,int * result)35230192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
35240192ce2eSIlya Dryomov {
3525637cd060SIlya Dryomov 	int ret;
3526637cd060SIlya Dryomov 
35270192ce2eSIlya Dryomov again:
35280192ce2eSIlya Dryomov 	switch (img_req->state) {
35290192ce2eSIlya Dryomov 	case RBD_IMG_START:
35300192ce2eSIlya Dryomov 		rbd_assert(!*result);
35310192ce2eSIlya Dryomov 
3532637cd060SIlya Dryomov 		ret = rbd_img_exclusive_lock(img_req);
3533637cd060SIlya Dryomov 		if (ret < 0) {
3534637cd060SIlya Dryomov 			*result = ret;
3535637cd060SIlya Dryomov 			return true;
3536637cd060SIlya Dryomov 		}
3537637cd060SIlya Dryomov 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3538637cd060SIlya Dryomov 		if (ret > 0)
3539637cd060SIlya Dryomov 			goto again;
3540637cd060SIlya Dryomov 		return false;
3541637cd060SIlya Dryomov 	case RBD_IMG_EXCLUSIVE_LOCK:
3542637cd060SIlya Dryomov 		if (*result)
3543637cd060SIlya Dryomov 			return true;
3544637cd060SIlya Dryomov 
35450192ce2eSIlya Dryomov 		rbd_img_object_requests(img_req);
35460192ce2eSIlya Dryomov 		if (!img_req->pending.num_pending) {
35470192ce2eSIlya Dryomov 			*result = img_req->pending.result;
35480192ce2eSIlya Dryomov 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
35497114edacSIlya Dryomov 			goto again;
35507114edacSIlya Dryomov 		}
35510192ce2eSIlya Dryomov 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
35520192ce2eSIlya Dryomov 		return false;
35530192ce2eSIlya Dryomov 	case __RBD_IMG_OBJECT_REQUESTS:
35540192ce2eSIlya Dryomov 		if (!pending_result_dec(&img_req->pending, result))
35550192ce2eSIlya Dryomov 			return false;
3556df561f66SGustavo A. R. Silva 		fallthrough;
35570192ce2eSIlya Dryomov 	case RBD_IMG_OBJECT_REQUESTS:
35580192ce2eSIlya Dryomov 		return true;
35590192ce2eSIlya Dryomov 	default:
35600192ce2eSIlya Dryomov 		BUG();
35610192ce2eSIlya Dryomov 	}
35620192ce2eSIlya Dryomov }
35630192ce2eSIlya Dryomov 
35640192ce2eSIlya Dryomov /*
35650192ce2eSIlya Dryomov  * Return true if @img_req is completed.
35660192ce2eSIlya Dryomov  */
__rbd_img_handle_request(struct rbd_img_request * img_req,int * result)35670192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
35680192ce2eSIlya Dryomov 				     int *result)
35690192ce2eSIlya Dryomov {
35700192ce2eSIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
35710192ce2eSIlya Dryomov 	bool done;
35720192ce2eSIlya Dryomov 
3573e1fddc8fSIlya Dryomov 	if (need_exclusive_lock(img_req)) {
3574e1fddc8fSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3575e1fddc8fSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3576e1fddc8fSIlya Dryomov 		done = rbd_img_advance(img_req, result);
3577e1fddc8fSIlya Dryomov 		if (done)
3578e1fddc8fSIlya Dryomov 			rbd_lock_del_request(img_req);
3579e1fddc8fSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3580e1fddc8fSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3581e1fddc8fSIlya Dryomov 	} else {
35820192ce2eSIlya Dryomov 		mutex_lock(&img_req->state_mutex);
35830192ce2eSIlya Dryomov 		done = rbd_img_advance(img_req, result);
35840192ce2eSIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3585e1fddc8fSIlya Dryomov 	}
35860192ce2eSIlya Dryomov 
35870192ce2eSIlya Dryomov 	if (done && *result) {
35880192ce2eSIlya Dryomov 		rbd_assert(*result < 0);
35890192ce2eSIlya Dryomov 		rbd_warn(rbd_dev, "%s%s result %d",
35900192ce2eSIlya Dryomov 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
35910192ce2eSIlya Dryomov 		      obj_op_name(img_req->op_type), *result);
35920192ce2eSIlya Dryomov 	}
35930192ce2eSIlya Dryomov 	return done;
35940192ce2eSIlya Dryomov }
35950192ce2eSIlya Dryomov 
rbd_img_handle_request(struct rbd_img_request * img_req,int result)35960192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
35970192ce2eSIlya Dryomov {
35980192ce2eSIlya Dryomov again:
35990192ce2eSIlya Dryomov 	if (!__rbd_img_handle_request(img_req, &result))
36000192ce2eSIlya Dryomov 		return;
36010192ce2eSIlya Dryomov 
36020192ce2eSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
36030192ce2eSIlya Dryomov 		struct rbd_obj_request *obj_req = img_req->obj_request;
36040192ce2eSIlya Dryomov 
3605679a97d2SHannes Reinecke 		rbd_img_request_destroy(img_req);
36060192ce2eSIlya Dryomov 		if (__rbd_obj_handle_request(obj_req, &result)) {
36070192ce2eSIlya Dryomov 			img_req = obj_req->img_request;
36080192ce2eSIlya Dryomov 			goto again;
36090192ce2eSIlya Dryomov 		}
36100192ce2eSIlya Dryomov 	} else {
361159e542c8SIlya Dryomov 		struct request *rq = blk_mq_rq_from_pdu(img_req);
36120192ce2eSIlya Dryomov 
3613679a97d2SHannes Reinecke 		rbd_img_request_destroy(img_req);
36140192ce2eSIlya Dryomov 		blk_mq_end_request(rq, errno_to_blk_status(result));
36150192ce2eSIlya Dryomov 	}
36169969ebc5SAlex Elder }
36179969ebc5SAlex Elder 
3618ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3619ed95b21aSIlya Dryomov 
rbd_cid_equal(const struct rbd_client_id * lhs,const struct rbd_client_id * rhs)3620ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3621ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3622ed95b21aSIlya Dryomov {
3623ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3624ed95b21aSIlya Dryomov }
3625ed95b21aSIlya Dryomov 
rbd_get_cid(struct rbd_device * rbd_dev)3626ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3627ed95b21aSIlya Dryomov {
3628ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3629ed95b21aSIlya Dryomov 
3630ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3631ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3632ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3633ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3634ed95b21aSIlya Dryomov 	return cid;
3635ed95b21aSIlya Dryomov }
3636ed95b21aSIlya Dryomov 
3637ed95b21aSIlya Dryomov /*
3638ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3639ed95b21aSIlya Dryomov  */
rbd_set_owner_cid(struct rbd_device * rbd_dev,const struct rbd_client_id * cid)3640ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3641ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3642ed95b21aSIlya Dryomov {
3643ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3644ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3645ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3646ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3647ed95b21aSIlya Dryomov }
3648ed95b21aSIlya Dryomov 
format_lock_cookie(struct rbd_device * rbd_dev,char * buf)3649ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3650ed95b21aSIlya Dryomov {
3651ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3652ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3653ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3654ed95b21aSIlya Dryomov }
3655ed95b21aSIlya Dryomov 
__rbd_lock(struct rbd_device * rbd_dev,const char * cookie)3656edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3657edd8ca80SFlorian Margaine {
3658edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3659edd8ca80SFlorian Margaine 
3660a2b1da09SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3661edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
3662edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
3663edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3664edd8ca80SFlorian Margaine }
3665edd8ca80SFlorian Margaine 
3666ed95b21aSIlya Dryomov /*
3667ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3668ed95b21aSIlya Dryomov  */
rbd_lock(struct rbd_device * rbd_dev)3669ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3670ed95b21aSIlya Dryomov {
3671ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3672ed95b21aSIlya Dryomov 	char cookie[32];
3673ed95b21aSIlya Dryomov 	int ret;
3674ed95b21aSIlya Dryomov 
3675cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3676cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
3677ed95b21aSIlya Dryomov 
3678ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3679ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3680ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3681ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
36829d01e07fSIlya Dryomov 	if (ret && ret != -EEXIST)
3683ed95b21aSIlya Dryomov 		return ret;
3684ed95b21aSIlya Dryomov 
3685edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
3686ed95b21aSIlya Dryomov 	return 0;
3687ed95b21aSIlya Dryomov }
3688ed95b21aSIlya Dryomov 
3689ed95b21aSIlya Dryomov /*
3690ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3691ed95b21aSIlya Dryomov  */
rbd_unlock(struct rbd_device * rbd_dev)3692bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
3693ed95b21aSIlya Dryomov {
3694ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3695ed95b21aSIlya Dryomov 	int ret;
3696ed95b21aSIlya Dryomov 
3697cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3698cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
3699ed95b21aSIlya Dryomov 
3700ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3701cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3702bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
3703637cd060SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3704ed95b21aSIlya Dryomov 
3705bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
3706bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3707cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
3708ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3709ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3710ed95b21aSIlya Dryomov }
3711ed95b21aSIlya Dryomov 
/*
 * Send a lock-related notify on the header object.
 *
 * The payload is an encoding header followed by @notify_op (32 bits)
 * and our client id (64-bit gid, 64-bit handle).
 *
 * @preply_pages and @preply_len are passed through to
 * ceph_osdc_notify() unchanged.
 *
 * Return: whatever ceph_osdc_notify() returns.
 */
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id self = rbd_get_cid(rbd_dev);
	char payload[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int payload_len = sizeof(payload);
	void *cursor = payload;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&cursor, 2, 1,
			    payload_len - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&cursor, notify_op);
	ceph_encode_64(&cursor, self.gid);
	ceph_encode_64(&cursor, self.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, payload, payload_len,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}
3735ed95b21aSIlya Dryomov 
rbd_notify_op_lock(struct rbd_device * rbd_dev,enum rbd_notify_op notify_op)3736ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3737ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3738ed95b21aSIlya Dryomov {
37398ae0299aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3740ed95b21aSIlya Dryomov }
3741ed95b21aSIlya Dryomov 
rbd_notify_acquired_lock(struct work_struct * work)3742ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3743ed95b21aSIlya Dryomov {
3744ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3745ed95b21aSIlya Dryomov 						  acquired_lock_work);
3746ed95b21aSIlya Dryomov 
3747ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3748ed95b21aSIlya Dryomov }
3749ed95b21aSIlya Dryomov 
rbd_notify_released_lock(struct work_struct * work)3750ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3751ed95b21aSIlya Dryomov {
3752ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3753ed95b21aSIlya Dryomov 						  released_lock_work);
3754ed95b21aSIlya Dryomov 
3755ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3756ed95b21aSIlya Dryomov }
3757ed95b21aSIlya Dryomov 
rbd_request_lock(struct rbd_device * rbd_dev)3758ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3759ed95b21aSIlya Dryomov {
3760ed95b21aSIlya Dryomov 	struct page **reply_pages;
3761ed95b21aSIlya Dryomov 	size_t reply_len;
3762ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3763ed95b21aSIlya Dryomov 	int ret;
3764ed95b21aSIlya Dryomov 
3765ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3766ed95b21aSIlya Dryomov 
3767ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3768ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3769ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3770ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3771ed95b21aSIlya Dryomov 		goto out;
3772ed95b21aSIlya Dryomov 	}
3773ed95b21aSIlya Dryomov 
3774ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3775ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3776ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3777ed95b21aSIlya Dryomov 		u32 n;
3778ed95b21aSIlya Dryomov 
3779ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3780ed95b21aSIlya Dryomov 		while (n--) {
3781ed95b21aSIlya Dryomov 			u8 struct_v;
3782ed95b21aSIlya Dryomov 			u32 len;
3783ed95b21aSIlya Dryomov 
3784ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3785ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3786ed95b21aSIlya Dryomov 
3787ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3788ed95b21aSIlya Dryomov 			if (!len)
3789ed95b21aSIlya Dryomov 				continue;
3790ed95b21aSIlya Dryomov 
3791ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3792ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3793ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3794ed95b21aSIlya Dryomov 				ret = -EIO;
3795ed95b21aSIlya Dryomov 				goto out;
3796ed95b21aSIlya Dryomov 			}
3797ed95b21aSIlya Dryomov 
3798ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3799ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3800ed95b21aSIlya Dryomov 						  &struct_v, &len);
3801ed95b21aSIlya Dryomov 			if (ret) {
3802ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3803ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3804ed95b21aSIlya Dryomov 					 ret);
3805ed95b21aSIlya Dryomov 				goto e_inval;
3806ed95b21aSIlya Dryomov 			}
3807ed95b21aSIlya Dryomov 
3808ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3809ed95b21aSIlya Dryomov 		}
3810ed95b21aSIlya Dryomov 	}
3811ed95b21aSIlya Dryomov 
3812ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3813ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3814ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3815ed95b21aSIlya Dryomov 	}
3816ed95b21aSIlya Dryomov 
3817ed95b21aSIlya Dryomov out:
3818ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3819ed95b21aSIlya Dryomov 	return ret;
3820ed95b21aSIlya Dryomov 
3821ed95b21aSIlya Dryomov e_inval:
3822ed95b21aSIlya Dryomov 	ret = -EINVAL;
3823ed95b21aSIlya Dryomov 	goto out;
3824ed95b21aSIlya Dryomov }
3825ed95b21aSIlya Dryomov 
3826637cd060SIlya Dryomov /*
3827637cd060SIlya Dryomov  * Either image request state machine(s) or rbd_add_acquire_lock()
3828637cd060SIlya Dryomov  * (i.e. "rbd map").
3829637cd060SIlya Dryomov  */
wake_lock_waiters(struct rbd_device * rbd_dev,int result)3830637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3831ed95b21aSIlya Dryomov {
3832637cd060SIlya Dryomov 	struct rbd_img_request *img_req;
3833637cd060SIlya Dryomov 
3834637cd060SIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3835d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3836ed95b21aSIlya Dryomov 
3837ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3838637cd060SIlya Dryomov 	if (!completion_done(&rbd_dev->acquire_wait)) {
3839637cd060SIlya Dryomov 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3840637cd060SIlya Dryomov 			   list_empty(&rbd_dev->running_list));
3841637cd060SIlya Dryomov 		rbd_dev->acquire_err = result;
3842637cd060SIlya Dryomov 		complete_all(&rbd_dev->acquire_wait);
3843637cd060SIlya Dryomov 		return;
3844637cd060SIlya Dryomov 	}
3845637cd060SIlya Dryomov 
3846*801474eaSIlya Dryomov 	while (!list_empty(&rbd_dev->acquiring_list)) {
3847*801474eaSIlya Dryomov 		img_req = list_first_entry(&rbd_dev->acquiring_list,
3848*801474eaSIlya Dryomov 					   struct rbd_img_request, lock_item);
3849637cd060SIlya Dryomov 		mutex_lock(&img_req->state_mutex);
3850637cd060SIlya Dryomov 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3851*801474eaSIlya Dryomov 		if (!result)
3852*801474eaSIlya Dryomov 			list_move_tail(&img_req->lock_item,
3853*801474eaSIlya Dryomov 				       &rbd_dev->running_list);
3854*801474eaSIlya Dryomov 		else
3855*801474eaSIlya Dryomov 			list_del_init(&img_req->lock_item);
3856637cd060SIlya Dryomov 		rbd_img_schedule(img_req, result);
3857637cd060SIlya Dryomov 		mutex_unlock(&img_req->state_mutex);
3858637cd060SIlya Dryomov 	}
3859ed95b21aSIlya Dryomov }
3860ed95b21aSIlya Dryomov 
locker_equal(const struct ceph_locker * lhs,const struct ceph_locker * rhs)386158815900SIlya Dryomov static bool locker_equal(const struct ceph_locker *lhs,
386258815900SIlya Dryomov 			 const struct ceph_locker *rhs)
386358815900SIlya Dryomov {
386458815900SIlya Dryomov 	return lhs->id.name.type == rhs->id.name.type &&
386558815900SIlya Dryomov 	       lhs->id.name.num == rhs->id.name.num &&
386658815900SIlya Dryomov 	       !strcmp(lhs->id.cookie, rhs->id.cookie) &&
386758815900SIlya Dryomov 	       ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr);
386858815900SIlya Dryomov }
386958815900SIlya Dryomov 
/*
 * Free a single locker via ceph_free_lockers(); NULL is a no-op.
 */
static void free_locker(struct ceph_locker *locker)
{
	if (!locker)
		return;

	ceph_free_lockers(locker, 1);
}
3875f38cb9d9SIlya Dryomov 
get_lock_owner_info(struct rbd_device * rbd_dev)3876f38cb9d9SIlya Dryomov static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev)
3877ed95b21aSIlya Dryomov {
3878ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3879f38cb9d9SIlya Dryomov 	struct ceph_locker *lockers;
3880f38cb9d9SIlya Dryomov 	u32 num_lockers;
3881ed95b21aSIlya Dryomov 	u8 lock_type;
3882ed95b21aSIlya Dryomov 	char *lock_tag;
38838ff2c64cSIlya Dryomov 	u64 handle;
3884ed95b21aSIlya Dryomov 	int ret;
3885ed95b21aSIlya Dryomov 
3886ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3887ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3888f38cb9d9SIlya Dryomov 				 &lock_type, &lock_tag, &lockers, &num_lockers);
3889f38cb9d9SIlya Dryomov 	if (ret) {
38909d01e07fSIlya Dryomov 		rbd_warn(rbd_dev, "failed to get header lockers: %d", ret);
3891f38cb9d9SIlya Dryomov 		return ERR_PTR(ret);
3892f38cb9d9SIlya Dryomov 	}
3893ed95b21aSIlya Dryomov 
3894f38cb9d9SIlya Dryomov 	if (num_lockers == 0) {
3895ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3896f38cb9d9SIlya Dryomov 		lockers = NULL;
3897ed95b21aSIlya Dryomov 		goto out;
3898ed95b21aSIlya Dryomov 	}
3899ed95b21aSIlya Dryomov 
3900ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3901ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3902ed95b21aSIlya Dryomov 			 lock_tag);
3903f38cb9d9SIlya Dryomov 		goto err_busy;
3904ed95b21aSIlya Dryomov 	}
3905ed95b21aSIlya Dryomov 
39068ff2c64cSIlya Dryomov 	if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) {
39078ff2c64cSIlya Dryomov 		rbd_warn(rbd_dev, "incompatible lock type detected");
3908f38cb9d9SIlya Dryomov 		goto err_busy;
3909ed95b21aSIlya Dryomov 	}
3910ed95b21aSIlya Dryomov 
3911f38cb9d9SIlya Dryomov 	WARN_ON(num_lockers != 1);
39128ff2c64cSIlya Dryomov 	ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu",
39138ff2c64cSIlya Dryomov 		     &handle);
39148ff2c64cSIlya Dryomov 	if (ret != 1) {
3915ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3916f38cb9d9SIlya Dryomov 			 lockers[0].id.cookie);
3917f38cb9d9SIlya Dryomov 		goto err_busy;
3918ed95b21aSIlya Dryomov 	}
39198ff2c64cSIlya Dryomov 	if (ceph_addr_is_blank(&lockers[0].info.addr)) {
39208ff2c64cSIlya Dryomov 		rbd_warn(rbd_dev, "locker has a blank address");
39218ff2c64cSIlya Dryomov 		goto err_busy;
39228ff2c64cSIlya Dryomov 	}
39238ff2c64cSIlya Dryomov 
39248ff2c64cSIlya Dryomov 	dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n",
39258ff2c64cSIlya Dryomov 	     __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name),
39268ff2c64cSIlya Dryomov 	     &lockers[0].info.addr.in_addr,
39278ff2c64cSIlya Dryomov 	     le32_to_cpu(lockers[0].info.addr.nonce), handle);
3928ed95b21aSIlya Dryomov 
3929ed95b21aSIlya Dryomov out:
3930ed95b21aSIlya Dryomov 	kfree(lock_tag);
3931f38cb9d9SIlya Dryomov 	return lockers;
3932f38cb9d9SIlya Dryomov 
3933f38cb9d9SIlya Dryomov err_busy:
3934f38cb9d9SIlya Dryomov 	kfree(lock_tag);
3935f38cb9d9SIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3936f38cb9d9SIlya Dryomov 	return ERR_PTR(-EBUSY);
3937ed95b21aSIlya Dryomov }
3938ed95b21aSIlya Dryomov 
find_watcher(struct rbd_device * rbd_dev,const struct ceph_locker * locker)3939ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3940ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3941ed95b21aSIlya Dryomov {
3942ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3943ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3944ed95b21aSIlya Dryomov 	u32 num_watchers;
3945ed95b21aSIlya Dryomov 	u64 cookie;
3946ed95b21aSIlya Dryomov 	int i;
3947ed95b21aSIlya Dryomov 	int ret;
3948ed95b21aSIlya Dryomov 
3949ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3950ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3951ed95b21aSIlya Dryomov 				      &num_watchers);
39529d01e07fSIlya Dryomov 	if (ret) {
39539d01e07fSIlya Dryomov 		rbd_warn(rbd_dev, "failed to get watchers: %d", ret);
3954ed95b21aSIlya Dryomov 		return ret;
39559d01e07fSIlya Dryomov 	}
3956ed95b21aSIlya Dryomov 
3957ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3958ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3959313771e8SIlya Dryomov 		/*
3960313771e8SIlya Dryomov 		 * Ignore addr->type while comparing.  This mimics
3961313771e8SIlya Dryomov 		 * entity_addr_t::get_legacy_str() + strcmp().
3962313771e8SIlya Dryomov 		 */
3963313771e8SIlya Dryomov 		if (ceph_addr_equal_no_type(&watchers[i].addr,
3964313771e8SIlya Dryomov 					    &locker->info.addr) &&
3965ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3966ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3967ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3968ed95b21aSIlya Dryomov 				.handle = cookie,
3969ed95b21aSIlya Dryomov 			};
3970ed95b21aSIlya Dryomov 
3971ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3972ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3973ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3974ed95b21aSIlya Dryomov 			ret = 1;
3975ed95b21aSIlya Dryomov 			goto out;
3976ed95b21aSIlya Dryomov 		}
3977ed95b21aSIlya Dryomov 	}
3978ed95b21aSIlya Dryomov 
3979ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3980ed95b21aSIlya Dryomov 	ret = 0;
3981ed95b21aSIlya Dryomov out:
3982ed95b21aSIlya Dryomov 	kfree(watchers);
3983ed95b21aSIlya Dryomov 	return ret;
3984ed95b21aSIlya Dryomov }
3985ed95b21aSIlya Dryomov 
3986ed95b21aSIlya Dryomov /*
3987ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3988ed95b21aSIlya Dryomov  */
rbd_try_lock(struct rbd_device * rbd_dev)3989ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3990ed95b21aSIlya Dryomov {
3991ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
399258815900SIlya Dryomov 	struct ceph_locker *locker, *refreshed_locker;
3993ed95b21aSIlya Dryomov 	int ret;
3994ed95b21aSIlya Dryomov 
3995ed95b21aSIlya Dryomov 	for (;;) {
399658815900SIlya Dryomov 		locker = refreshed_locker = NULL;
3997f38cb9d9SIlya Dryomov 
3998ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
39999d01e07fSIlya Dryomov 		if (!ret)
4000f38cb9d9SIlya Dryomov 			goto out;
40019d01e07fSIlya Dryomov 		if (ret != -EBUSY) {
40029d01e07fSIlya Dryomov 			rbd_warn(rbd_dev, "failed to lock header: %d", ret);
40039d01e07fSIlya Dryomov 			goto out;
40049d01e07fSIlya Dryomov 		}
4005ed95b21aSIlya Dryomov 
4006ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
4007f38cb9d9SIlya Dryomov 		locker = get_lock_owner_info(rbd_dev);
4008f38cb9d9SIlya Dryomov 		if (IS_ERR(locker)) {
4009f38cb9d9SIlya Dryomov 			ret = PTR_ERR(locker);
4010f38cb9d9SIlya Dryomov 			locker = NULL;
4011f38cb9d9SIlya Dryomov 			goto out;
4012f38cb9d9SIlya Dryomov 		}
4013f38cb9d9SIlya Dryomov 		if (!locker)
4014ed95b21aSIlya Dryomov 			goto again;
4015ed95b21aSIlya Dryomov 
4016f38cb9d9SIlya Dryomov 		ret = find_watcher(rbd_dev, locker);
4017637cd060SIlya Dryomov 		if (ret)
4018637cd060SIlya Dryomov 			goto out; /* request lock or error */
4019ed95b21aSIlya Dryomov 
402058815900SIlya Dryomov 		refreshed_locker = get_lock_owner_info(rbd_dev);
402158815900SIlya Dryomov 		if (IS_ERR(refreshed_locker)) {
402258815900SIlya Dryomov 			ret = PTR_ERR(refreshed_locker);
402358815900SIlya Dryomov 			refreshed_locker = NULL;
402458815900SIlya Dryomov 			goto out;
402558815900SIlya Dryomov 		}
402658815900SIlya Dryomov 		if (!refreshed_locker ||
402758815900SIlya Dryomov 		    !locker_equal(locker, refreshed_locker))
402858815900SIlya Dryomov 			goto again;
402958815900SIlya Dryomov 
403022e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4031f38cb9d9SIlya Dryomov 			 ENTITY_NAME(locker->id.name));
4032ed95b21aSIlya Dryomov 
40330b98acd6SIlya Dryomov 		ret = ceph_monc_blocklist_add(&client->monc,
4034f38cb9d9SIlya Dryomov 					      &locker->info.addr);
4035ed95b21aSIlya Dryomov 		if (ret) {
4036f38cb9d9SIlya Dryomov 			rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d",
4037f38cb9d9SIlya Dryomov 				 ENTITY_NAME(locker->id.name), ret);
4038ed95b21aSIlya Dryomov 			goto out;
4039ed95b21aSIlya Dryomov 		}
4040ed95b21aSIlya Dryomov 
4041ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4042ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4043f38cb9d9SIlya Dryomov 					  locker->id.cookie, &locker->id.name);
4044f38cb9d9SIlya Dryomov 		if (ret && ret != -ENOENT) {
4045f38cb9d9SIlya Dryomov 			rbd_warn(rbd_dev, "failed to break header lock: %d",
4046f38cb9d9SIlya Dryomov 				 ret);
4047ed95b21aSIlya Dryomov 			goto out;
4048f38cb9d9SIlya Dryomov 		}
4049ed95b21aSIlya Dryomov 
4050ed95b21aSIlya Dryomov again:
405158815900SIlya Dryomov 		free_locker(refreshed_locker);
4052f38cb9d9SIlya Dryomov 		free_locker(locker);
4053ed95b21aSIlya Dryomov 	}
4054ed95b21aSIlya Dryomov 
4055ed95b21aSIlya Dryomov out:
405658815900SIlya Dryomov 	free_locker(refreshed_locker);
4057f38cb9d9SIlya Dryomov 	free_locker(locker);
4058ed95b21aSIlya Dryomov 	return ret;
4059ed95b21aSIlya Dryomov }
4060ed95b21aSIlya Dryomov 
rbd_post_acquire_action(struct rbd_device * rbd_dev)406122e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4062ed95b21aSIlya Dryomov {
406322e8bd51SIlya Dryomov 	int ret;
406422e8bd51SIlya Dryomov 
4065870611e4SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
4066870611e4SIlya Dryomov 	if (ret)
4067870611e4SIlya Dryomov 		return ret;
4068870611e4SIlya Dryomov 
406922e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
407022e8bd51SIlya Dryomov 		ret = rbd_object_map_open(rbd_dev);
407122e8bd51SIlya Dryomov 		if (ret)
407222e8bd51SIlya Dryomov 			return ret;
407322e8bd51SIlya Dryomov 	}
407422e8bd51SIlya Dryomov 
407522e8bd51SIlya Dryomov 	return 0;
407622e8bd51SIlya Dryomov }
407722e8bd51SIlya Dryomov 
4078ed95b21aSIlya Dryomov /*
4079637cd060SIlya Dryomov  * Return:
4080637cd060SIlya Dryomov  *   0 - lock acquired
4081637cd060SIlya Dryomov  *   1 - caller should call rbd_request_lock()
4082637cd060SIlya Dryomov  *  <0 - error
4083ed95b21aSIlya Dryomov  */
rbd_try_acquire_lock(struct rbd_device * rbd_dev)4084637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4085ed95b21aSIlya Dryomov {
4086637cd060SIlya Dryomov 	int ret;
4087ed95b21aSIlya Dryomov 
4088ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
4089ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4090ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4091ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4092ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4093637cd060SIlya Dryomov 		return 0;
4094ed95b21aSIlya Dryomov 	}
4095ed95b21aSIlya Dryomov 
4096ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
4097ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4098ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4099ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
4100637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
4101637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4102637cd060SIlya Dryomov 		return 0;
4103ed95b21aSIlya Dryomov 	}
4104ed95b21aSIlya Dryomov 
4105637cd060SIlya Dryomov 	ret = rbd_try_lock(rbd_dev);
4106637cd060SIlya Dryomov 	if (ret < 0) {
41079d01e07fSIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire lock: %d", ret);
4108637cd060SIlya Dryomov 		goto out;
4109637cd060SIlya Dryomov 	}
4110637cd060SIlya Dryomov 	if (ret > 0) {
4111ed95b21aSIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4112637cd060SIlya Dryomov 		return ret;
4113637cd060SIlya Dryomov 	}
4114637cd060SIlya Dryomov 
4115637cd060SIlya Dryomov 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4116637cd060SIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4117637cd060SIlya Dryomov 
411822e8bd51SIlya Dryomov 	ret = rbd_post_acquire_action(rbd_dev);
411922e8bd51SIlya Dryomov 	if (ret) {
412022e8bd51SIlya Dryomov 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
412122e8bd51SIlya Dryomov 		/*
412222e8bd51SIlya Dryomov 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
412322e8bd51SIlya Dryomov 		 * rbd_lock_add_request() would let the request through,
412422e8bd51SIlya Dryomov 		 * assuming that e.g. object map is locked and loaded.
412522e8bd51SIlya Dryomov 		 */
412622e8bd51SIlya Dryomov 		rbd_unlock(rbd_dev);
412722e8bd51SIlya Dryomov 	}
412822e8bd51SIlya Dryomov 
4129637cd060SIlya Dryomov out:
4130637cd060SIlya Dryomov 	wake_lock_waiters(rbd_dev, ret);
4131637cd060SIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4132637cd060SIlya Dryomov 	return ret;
4133ed95b21aSIlya Dryomov }
4134ed95b21aSIlya Dryomov 
rbd_acquire_lock(struct work_struct * work)4135ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
4136ed95b21aSIlya Dryomov {
4137ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4138ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
4139637cd060SIlya Dryomov 	int ret;
4140ed95b21aSIlya Dryomov 
4141ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4142ed95b21aSIlya Dryomov again:
4143637cd060SIlya Dryomov 	ret = rbd_try_acquire_lock(rbd_dev);
4144637cd060SIlya Dryomov 	if (ret <= 0) {
4145637cd060SIlya Dryomov 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4146ed95b21aSIlya Dryomov 		return;
4147ed95b21aSIlya Dryomov 	}
4148ed95b21aSIlya Dryomov 
4149ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
4150ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
4151ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
4152e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
4153e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
4154637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4155637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4156637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4157ed95b21aSIlya Dryomov 	} else if (ret < 0) {
4158ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4159ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4160ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
4161ed95b21aSIlya Dryomov 	} else {
4162ed95b21aSIlya Dryomov 		/*
4163ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
4164ed95b21aSIlya Dryomov 		 * release the lock
4165ed95b21aSIlya Dryomov 		 */
41666b0a8774SColin Ian King 		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4167ed95b21aSIlya Dryomov 		     rbd_dev);
4168ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4169ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4170ed95b21aSIlya Dryomov 	}
4171ed95b21aSIlya Dryomov }
4172ed95b21aSIlya Dryomov 
rbd_quiesce_lock(struct rbd_device * rbd_dev)4173a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4174ed95b21aSIlya Dryomov {
4175a2b1da09SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4176d9b9c893SLinus Torvalds 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4177a2b1da09SIlya Dryomov 
4178ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4179ed95b21aSIlya Dryomov 		return false;
4180ed95b21aSIlya Dryomov 
4181ed95b21aSIlya Dryomov 	/*
4182ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
4183ed95b21aSIlya Dryomov 	 */
4184e1fddc8fSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4185e1fddc8fSIlya Dryomov 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4186ed9eb710SIlya Dryomov 	if (list_empty(&rbd_dev->running_list))
4187ed9eb710SIlya Dryomov 		return true;
4188ed9eb710SIlya Dryomov 
4189ed9eb710SIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4190e1fddc8fSIlya Dryomov 	wait_for_completion(&rbd_dev->releasing_wait);
4191ed95b21aSIlya Dryomov 
4192ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4193ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4194ed95b21aSIlya Dryomov 		return false;
4195ed95b21aSIlya Dryomov 
4196e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4197a2b1da09SIlya Dryomov 	return true;
4198a2b1da09SIlya Dryomov }
4199a2b1da09SIlya Dryomov 
rbd_pre_release_action(struct rbd_device * rbd_dev)420022e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev)
420122e8bd51SIlya Dryomov {
420222e8bd51SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
420322e8bd51SIlya Dryomov 		rbd_object_map_close(rbd_dev);
420422e8bd51SIlya Dryomov }
420522e8bd51SIlya Dryomov 
/*
 * Release the exclusive lock once IO has been quiesced: run the
 * pre-release action and then rbd_unlock().  The running list is
 * asserted to be empty on entry.
 */
__rbd_release_lock(struct rbd_device * rbd_dev)4206e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev)
4207e1fddc8fSIlya Dryomov {
4208e1fddc8fSIlya Dryomov 	rbd_assert(list_empty(&rbd_dev->running_list));
4209e1fddc8fSIlya Dryomov 
421022e8bd51SIlya Dryomov 	rbd_pre_release_action(rbd_dev);
4211bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
4212e1fddc8fSIlya Dryomov }
4213e1fddc8fSIlya Dryomov 
4214a2b1da09SIlya Dryomov /*
4215a2b1da09SIlya Dryomov  * lock_rwsem must be held for write
4216a2b1da09SIlya Dryomov  */
rbd_release_lock(struct rbd_device * rbd_dev)4217a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev)
4218a2b1da09SIlya Dryomov {
4219a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4220a2b1da09SIlya Dryomov 		return;
4221a2b1da09SIlya Dryomov 
4222e1fddc8fSIlya Dryomov 	__rbd_release_lock(rbd_dev);
4223a2b1da09SIlya Dryomov 
4224ed95b21aSIlya Dryomov 	/*
4225ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
4226637cd060SIlya Dryomov 	 * almost immediately if we got new IO while draining the running
4227637cd060SIlya Dryomov 	 * list otherwise.  We need to ack our own notifications, so this
4228637cd060SIlya Dryomov 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4229637cd060SIlya Dryomov 	 * way of maybe_kick_acquire().
4230ed95b21aSIlya Dryomov 	 */
4231ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
4232ed95b21aSIlya Dryomov }
4233ed95b21aSIlya Dryomov 
/*
 * Work function for rbd_dev->unlock_work: takes lock_rwsem for write
 * and calls rbd_release_lock() on the owning device.
 */
rbd_release_lock_work(struct work_struct * work)4234ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
4235ed95b21aSIlya Dryomov {
4236ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4237ed95b21aSIlya Dryomov 						  unlock_work);
4238ed95b21aSIlya Dryomov 
	/* rbd_release_lock() requires lock_rwsem held for write */
4239ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
4240ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
4241ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
4242ed95b21aSIlya Dryomov }
4243ed95b21aSIlya Dryomov 
maybe_kick_acquire(struct rbd_device * rbd_dev)4244637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4245637cd060SIlya Dryomov {
4246637cd060SIlya Dryomov 	bool have_requests;
4247637cd060SIlya Dryomov 
4248637cd060SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4249637cd060SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
4250637cd060SIlya Dryomov 		return;
4251637cd060SIlya Dryomov 
4252637cd060SIlya Dryomov 	spin_lock(&rbd_dev->lock_lists_lock);
4253637cd060SIlya Dryomov 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4254637cd060SIlya Dryomov 	spin_unlock(&rbd_dev->lock_lists_lock);
4255637cd060SIlya Dryomov 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4256637cd060SIlya Dryomov 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4257637cd060SIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4258637cd060SIlya Dryomov 	}
4259637cd060SIlya Dryomov }
4260637cd060SIlya Dryomov 
/*
 * Handle an ACQUIRED_LOCK notification.
 *
 * For struct_v >= 2 the payload at *p carries the sender's client id
 * (gid, handle); otherwise the id stays empty.  A non-empty id that
 * differs from the recorded owner_cid is stored as the new owner.
 * maybe_kick_acquire() is then called with lock_rwsem held for read.
 */
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &rbd_empty_cid)) {
		/* nothing to record, read access is enough */
		down_read(&rbd_dev->lock_rwsem);
	} else {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			rbd_set_owner_cid(rbd_dev, &cid);
		} else {
			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
			     __func__, rbd_dev, cid.gid, cid.handle);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}
4289ed95b21aSIlya Dryomov 
/*
 * Handle a RELEASED_LOCK notification.
 *
 * For struct_v >= 2 the payload at *p carries the sender's client id
 * (gid, handle); otherwise the id stays empty.  If a non-empty id
 * matches the recorded owner_cid, the owner is reset to the empty id.
 * maybe_kick_acquire() is then called with lock_rwsem held for read.
 */
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &rbd_empty_cid)) {
		/* nothing to update, read access is enough */
		down_read(&rbd_dev->lock_rwsem);
	} else {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		} else {
			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}
4319ed95b21aSIlya Dryomov 
43203b77faa0SIlya Dryomov /*
43213b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
43223b77faa0SIlya Dryomov  * ResponseMessage is needed.
43233b77faa0SIlya Dryomov  */
rbd_handle_request_lock(struct rbd_device * rbd_dev,u8 struct_v,void ** p)43243b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4325ed95b21aSIlya Dryomov 				   void **p)
4326ed95b21aSIlya Dryomov {
4327ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4328ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
43293b77faa0SIlya Dryomov 	int result = 1;
4330ed95b21aSIlya Dryomov 
4331ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
4332ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
4333ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
4334ed95b21aSIlya Dryomov 	}
4335ed95b21aSIlya Dryomov 
4336ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4337ed95b21aSIlya Dryomov 	     cid.handle);
4338ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
43393b77faa0SIlya Dryomov 		return result;
4340ed95b21aSIlya Dryomov 
4341ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
43423b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
43433b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
43443b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
43453b77faa0SIlya Dryomov 			goto out_unlock;
43463b77faa0SIlya Dryomov 
43473b77faa0SIlya Dryomov 		/*
43483b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
43493b77faa0SIlya Dryomov 		 * a missing owner
43503b77faa0SIlya Dryomov 		 */
43513b77faa0SIlya Dryomov 		result = 0;
43523b77faa0SIlya Dryomov 
4353ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4354e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
4355e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
4356e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
4357e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
4358e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
4359e010dd0aSIlya Dryomov 			} else {
4360e010dd0aSIlya Dryomov 				/* refuse to release the lock */
4361e010dd0aSIlya Dryomov 				result = -EROFS;
4362ed95b21aSIlya Dryomov 			}
4363ed95b21aSIlya Dryomov 		}
4364ed95b21aSIlya Dryomov 	}
43653b77faa0SIlya Dryomov 
43663b77faa0SIlya Dryomov out_unlock:
4367ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
43683b77faa0SIlya Dryomov 	return result;
4369ed95b21aSIlya Dryomov }
4370ed95b21aSIlya Dryomov 
/*
 * Ack a notification on the header object.
 *
 * If @result is non-NULL, a ResponseMessage carrying *result is encoded
 * and sent as the ack payload; otherwise the ack is sent with a
 * zero-length payload.  A failure to ack is only logged.
 */
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int payload_len = 0;
	int ret;

	if (result) {
		void *p = buf;

		payload_len = sizeof(buf);

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    payload_len - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, payload_len);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}
4396ed95b21aSIlya Dryomov 
/*
 * Ack a notification without a result: passes NULL as the result
 * pointer to __rbd_acknowledge_notify().
 */
rbd_acknowledge_notify(struct rbd_device * rbd_dev,u64 notify_id,u64 cookie)4397ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4398ed95b21aSIlya Dryomov 				   u64 cookie)
4399ed95b21aSIlya Dryomov {
4400ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4401ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4402ed95b21aSIlya Dryomov }
4403ed95b21aSIlya Dryomov 
/*
 * Ack a notification and include @result in the ack: passes a pointer
 * to @result to __rbd_acknowledge_notify().
 */
rbd_acknowledge_notify_result(struct rbd_device * rbd_dev,u64 notify_id,u64 cookie,s32 result)4404ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4405ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
4406ed95b21aSIlya Dryomov {
4407ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4408ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4409ed95b21aSIlya Dryomov }
4410922dab61SIlya Dryomov 
/*
 * Notify callback registered with ceph_osdc_watch() for the image
 * header object.
 *
 * @arg:         the rbd_device the watch was registered for
 * @notify_id:   id of this notification, passed back in the ack
 * @cookie:      watch cookie, passed back in the ack
 * @notifier_id: not used here
 * @data:        notification payload (may be empty)
 * @data_len:    length of @data in bytes
 *
 * A non-empty payload is decoded as a NotifyMessage: encoding header
 * followed by a 32-bit opcode.  An empty payload is treated as a legacy
 * header-update notification.  The opcode is dispatched to the matching
 * handler and the notification is acked.  If the payload cannot be
 * decoded, a warning is logged and no ack is sent.
 */
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		/*
		 * The payload comes from a peer: make sure the opcode
		 * is actually there instead of reading past 'end'.
		 */
		ceph_decode_32_safe(&p, end, notify_op, e_short);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		/* ret <= 0 is sent back as a result, 1 means plain ack */
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		/* acked even if the refresh failed */
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		/* unknown op: only the lock owner answers -EOPNOTSUPP */
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
	return;

e_short:
	rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", -ERANGE);
}
44749969ebc5SAlex Elder 
447599d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
44769969ebc5SAlex Elder 
/*
 * Error callback registered with ceph_osdc_watch().
 *
 * Logs the error, resets owner_cid to the empty id under lock_rwsem
 * and, if the watch is currently REGISTERED, tears it down, marks it
 * as being in ERROR state and queues watch_dwork with no delay.
 */
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_REGISTERED)
		goto out_unlock;

	__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);

out_unlock:
	mutex_unlock(&rbd_dev->watch_mutex);
}
4496bb040aa0SIlya Dryomov 
4497bb040aa0SIlya Dryomov /*
449899d16943SIlya Dryomov  * watch_mutex must be locked
44999969ebc5SAlex Elder  */
__rbd_register_watch(struct rbd_device * rbd_dev)450099d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
45019969ebc5SAlex Elder {
45029969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4503922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
45049969ebc5SAlex Elder 
4505922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
450699d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
45079969ebc5SAlex Elder 
4508922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4509922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
4510922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
4511922dab61SIlya Dryomov 	if (IS_ERR(handle))
4512922dab61SIlya Dryomov 		return PTR_ERR(handle);
45139969ebc5SAlex Elder 
4514922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
45158eb87565SAlex Elder 	return 0;
45169969ebc5SAlex Elder }
45179969ebc5SAlex Elder 
451899d16943SIlya Dryomov /*
451999d16943SIlya Dryomov  * watch_mutex must be locked
452099d16943SIlya Dryomov  */
__rbd_unregister_watch(struct rbd_device * rbd_dev)452199d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4522fca27065SIlya Dryomov {
4523922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4524922dab61SIlya Dryomov 	int ret;
4525b30a01f2SIlya Dryomov 
452699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
452799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4528b30a01f2SIlya Dryomov 
4529922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4530922dab61SIlya Dryomov 	if (ret)
4531922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4532b30a01f2SIlya Dryomov 
4533922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
4534c525f036SIlya Dryomov }
4535c525f036SIlya Dryomov 
rbd_register_watch(struct rbd_device * rbd_dev)453699d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
4537c525f036SIlya Dryomov {
453899d16943SIlya Dryomov 	int ret;
4539811c6688SIlya Dryomov 
454099d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
454199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
454299d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
454399d16943SIlya Dryomov 	if (ret)
454499d16943SIlya Dryomov 		goto out;
454599d16943SIlya Dryomov 
454699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
454799d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
454899d16943SIlya Dryomov 
454999d16943SIlya Dryomov out:
455099d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
455199d16943SIlya Dryomov 	return ret;
455299d16943SIlya Dryomov }
455399d16943SIlya Dryomov 
/*
 * Synchronously cancel the device's lock-related work items:
 * acquired_lock_work, released_lock_work, lock_dwork and unlock_work.
 */
cancel_tasks_sync(struct rbd_device * rbd_dev)455499d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
455599d16943SIlya Dryomov {
455699d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
455799d16943SIlya Dryomov 
4558ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4559ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
4560ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4561ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
456299d16943SIlya Dryomov }
456399d16943SIlya Dryomov 
45640e4e1de5SIlya Dryomov /*
45650e4e1de5SIlya Dryomov  * header_rwsem must not be held to avoid a deadlock with
45660e4e1de5SIlya Dryomov  * rbd_dev_refresh() when flushing notifies.
45670e4e1de5SIlya Dryomov  */
/*
 * Tear down the header watch: cancel the lock-related work items,
 * unregister the watch if it is currently REGISTERED, set the watch
 * state to UNREGISTERED, cancel watch_dwork and finally flush
 * outstanding notifies on the OSD client.
 */
rbd_unregister_watch(struct rbd_device * rbd_dev)456899d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
456999d16943SIlya Dryomov {
457099d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
457199d16943SIlya Dryomov 
457299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
457399d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
457499d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
	/* state ends up UNREGISTERED regardless of what it was before */
457599d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
457699d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
457799d16943SIlya Dryomov 
	/* watch_dwork is cancelled only after watch_mutex is dropped */
457823edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4579811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4580fca27065SIlya Dryomov }
4581fca27065SIlya Dryomov 
458214bb211dSIlya Dryomov /*
458314bb211dSIlya Dryomov  * lock_rwsem must be held for write
458414bb211dSIlya Dryomov  */
rbd_reacquire_lock(struct rbd_device * rbd_dev)458514bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
458614bb211dSIlya Dryomov {
458714bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
458814bb211dSIlya Dryomov 	char cookie[32];
458914bb211dSIlya Dryomov 	int ret;
459014bb211dSIlya Dryomov 
4591a2b1da09SIlya Dryomov 	if (!rbd_quiesce_lock(rbd_dev))
4592a2b1da09SIlya Dryomov 		return;
459314bb211dSIlya Dryomov 
459414bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
459514bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
459614bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
459714bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
459814bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
459914bb211dSIlya Dryomov 	if (ret) {
460014bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
460114bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
460214bb211dSIlya Dryomov 				 ret);
460314bb211dSIlya Dryomov 
460414bb211dSIlya Dryomov 		/*
460514bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
460614bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
460714bb211dSIlya Dryomov 		 */
4608e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
4609a2b1da09SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
461014bb211dSIlya Dryomov 	} else {
4611edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
4612637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, 0);
461314bb211dSIlya Dryomov 	}
461414bb211dSIlya Dryomov }
461514bb211dSIlya Dryomov 
/*
 * Work function for rbd_dev->watch_dwork: try to register the header
 * watch again after it entered the ERROR state.
 *
 * Does nothing unless watch_state is RBD_WATCH_STATE_ERROR.  If
 * registration fails with anything other than -EBLOCKLISTED or
 * -ENOENT, the work is requeued after RBD_RETRY_DELAY; for those two
 * errors the lock waiters are woken with the error instead.  On
 * success the watch state and cookie are updated, the exclusive lock is
 * reacquired if it is currently LOCKED, and the header is refreshed.
 */
rbd_reregister_watch(struct work_struct * work)461699d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
461799d16943SIlya Dryomov {
461899d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
461999d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
462099d16943SIlya Dryomov 	int ret;
462199d16943SIlya Dryomov 
462299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
462399d16943SIlya Dryomov 
462499d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
	/* only a watch in ERROR state is re-registered */
462587c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
462687c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
462714bb211dSIlya Dryomov 		return;
462887c0fdedSIlya Dryomov 	}
462999d16943SIlya Dryomov 
463099d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
463199d16943SIlya Dryomov 	if (ret) {
463299d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		/* every error except -EBLOCKLISTED and -ENOENT is retried */
46330b98acd6SIlya Dryomov 		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
463499d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
463599d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
463699d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
463787c0fdedSIlya Dryomov 			mutex_unlock(&rbd_dev->watch_mutex);
463814bb211dSIlya Dryomov 			return;
463999d16943SIlya Dryomov 		}
464099d16943SIlya Dryomov 
		/* no retry: hand the error to whoever waits for the lock */
4641637cd060SIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
4642637cd060SIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
4643637cd060SIlya Dryomov 		wake_lock_waiters(rbd_dev, ret);
4644637cd060SIlya Dryomov 		up_write(&rbd_dev->lock_rwsem);
4645637cd060SIlya Dryomov 		return;
4646637cd060SIlya Dryomov 	}
4647637cd060SIlya Dryomov 
464899d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
464999d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
465099d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
465199d16943SIlya Dryomov 
	/* rbd_reacquire_lock() requires lock_rwsem held for write */
465214bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
465314bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
465414bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
465514bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
465614bb211dSIlya Dryomov 
465799d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
465899d16943SIlya Dryomov 	if (ret)
4659f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
466099d16943SIlya Dryomov }
466199d16943SIlya Dryomov 
466236be9a76SAlex Elder /*
4663f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
4664f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
466536be9a76SAlex Elder  */
rbd_obj_method_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,const char * method_name,const void * outbound,size_t outbound_size,void * inbound,size_t inbound_size)466636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4667ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
4668ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
466936be9a76SAlex Elder 			     const char *method_name,
46704157976bSAlex Elder 			     const void *outbound,
467136be9a76SAlex Elder 			     size_t outbound_size,
46724157976bSAlex Elder 			     void *inbound,
4673e2a58ee5SAlex Elder 			     size_t inbound_size)
467436be9a76SAlex Elder {
4675ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4676ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
4677ecd4a68aSIlya Dryomov 	struct page *reply_page;
467836be9a76SAlex Elder 	int ret;
467936be9a76SAlex Elder 
468036be9a76SAlex Elder 	/*
46816010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
46826010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
46836010a451SAlex Elder 	 * also supply outbound data--parameters for the object
46846010a451SAlex Elder 	 * method.  Currently if this is present it will be a
46856010a451SAlex Elder 	 * snapshot id.
468636be9a76SAlex Elder 	 */
4687ecd4a68aSIlya Dryomov 	if (outbound) {
4688ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
4689ecd4a68aSIlya Dryomov 			return -E2BIG;
469036be9a76SAlex Elder 
4691ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
4692ecd4a68aSIlya Dryomov 		if (!req_page)
4693ecd4a68aSIlya Dryomov 			return -ENOMEM;
469436be9a76SAlex Elder 
4695ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
469604017e29SAlex Elder 	}
4697430c28c3SAlex Elder 
4698ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4699ecd4a68aSIlya Dryomov 	if (!reply_page) {
4700ecd4a68aSIlya Dryomov 		if (req_page)
4701ecd4a68aSIlya Dryomov 			__free_page(req_page);
4702ecd4a68aSIlya Dryomov 		return -ENOMEM;
4703ecd4a68aSIlya Dryomov 	}
470436be9a76SAlex Elder 
4705ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4706ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
470768ada915SIlya Dryomov 			     &reply_page, &inbound_size);
4708ecd4a68aSIlya Dryomov 	if (!ret) {
4709ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
4710ecd4a68aSIlya Dryomov 		ret = inbound_size;
4711ecd4a68aSIlya Dryomov 	}
471257385b51SAlex Elder 
4713ecd4a68aSIlya Dryomov 	if (req_page)
4714ecd4a68aSIlya Dryomov 		__free_page(req_page);
4715ecd4a68aSIlya Dryomov 	__free_page(reply_page);
471636be9a76SAlex Elder 	return ret;
471736be9a76SAlex Elder }
471836be9a76SAlex Elder 
/*
 * Work function for an image request embedded in a blk-mq request.
 *
 * Derives the byte offset and length from the block request, rejects
 * requests that extend past the mapping size with -EIO, fills the image
 * request (without data for discard/zeroout, from the bio otherwise)
 * and hands it to rbd_img_handle_request().  Zero-length requests and
 * failures take the err_img_request path, which destroys the image
 * request and ends the block request with the translated status.
 */
rbd_queue_workfn(struct work_struct * work)47197ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4720bc1ecc65SIlya Dryomov {
472159e542c8SIlya Dryomov 	struct rbd_img_request *img_request =
472259e542c8SIlya Dryomov 	    container_of(work, struct rbd_img_request, work);
472359e542c8SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
472459e542c8SIlya Dryomov 	enum obj_operation_type op_type = img_request->op_type;
472559e542c8SIlya Dryomov 	struct request *rq = blk_mq_rq_from_pdu(img_request);
	/* sector position converted to a byte offset */
4726bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4727bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
47284e752f0aSJosh Durgin 	u64 mapping_size;
4729bc1ecc65SIlya Dryomov 	int result;
4730bc1ecc65SIlya Dryomov 
4731bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4732bc1ecc65SIlya Dryomov 	if (!length) {
4733bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4734bc1ecc65SIlya Dryomov 		result = 0;
473559e542c8SIlya Dryomov 		goto err_img_request;
4736bc1ecc65SIlya Dryomov 	}
4737bc1ecc65SIlya Dryomov 
47387ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
47397ad18afaSChristoph Hellwig 
	/* sample the mapping size and capture the header under header_rwsem */
47404e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
47414e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
4742a52cc685SIlya Dryomov 	rbd_img_capture_header(img_request);
47434e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
47444e752f0aSJosh Durgin 
47454e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4746bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
47474e752f0aSJosh Durgin 			 length, mapping_size);
4748bc1ecc65SIlya Dryomov 		result = -EIO;
4749a52cc685SIlya Dryomov 		goto err_img_request;
4750bc1ecc65SIlya Dryomov 	}
4751bc1ecc65SIlya Dryomov 
475221ed05a8SIlya Dryomov 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
475321ed05a8SIlya Dryomov 	     img_request, obj_op_name(op_type), offset, length);
475421ed05a8SIlya Dryomov 
	/* discard and zeroout are filled without data; others use rq->bio */
47556484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
47565a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
475790e98c52SGuangliang Zhao 	else
47585a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
475990e98c52SGuangliang Zhao 					       rq->bio);
47600192ce2eSIlya Dryomov 	if (result)
4761bc1ecc65SIlya Dryomov 		goto err_img_request;
4762bc1ecc65SIlya Dryomov 
4763e1fddc8fSIlya Dryomov 	rbd_img_handle_request(img_request, 0);
4764bc1ecc65SIlya Dryomov 	return;
4765bc1ecc65SIlya Dryomov 
4766bc1ecc65SIlya Dryomov err_img_request:
4767679a97d2SHannes Reinecke 	rbd_img_request_destroy(img_request);
	/* result == 0 (zero-length request) ends the request without a warning */
4768bc1ecc65SIlya Dryomov 	if (result)
4769bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
47706d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
47712a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
4772bc1ecc65SIlya Dryomov }
4773bc1ecc65SIlya Dryomov 
rbd_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)4774fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
47757ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4776bc1ecc65SIlya Dryomov {
477759e542c8SIlya Dryomov 	struct rbd_device *rbd_dev = hctx->queue->queuedata;
477859e542c8SIlya Dryomov 	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
477959e542c8SIlya Dryomov 	enum obj_operation_type op_type;
4780bc1ecc65SIlya Dryomov 
478159e542c8SIlya Dryomov 	switch (req_op(bd->rq)) {
478259e542c8SIlya Dryomov 	case REQ_OP_DISCARD:
478359e542c8SIlya Dryomov 		op_type = OBJ_OP_DISCARD;
478459e542c8SIlya Dryomov 		break;
478559e542c8SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
478659e542c8SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
478759e542c8SIlya Dryomov 		break;
478859e542c8SIlya Dryomov 	case REQ_OP_WRITE:
478959e542c8SIlya Dryomov 		op_type = OBJ_OP_WRITE;
479059e542c8SIlya Dryomov 		break;
479159e542c8SIlya Dryomov 	case REQ_OP_READ:
479259e542c8SIlya Dryomov 		op_type = OBJ_OP_READ;
479359e542c8SIlya Dryomov 		break;
479459e542c8SIlya Dryomov 	default:
479559e542c8SIlya Dryomov 		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
479659e542c8SIlya Dryomov 		return BLK_STS_IOERR;
479759e542c8SIlya Dryomov 	}
479859e542c8SIlya Dryomov 
479959e542c8SIlya Dryomov 	rbd_img_request_init(img_req, rbd_dev, op_type);
480059e542c8SIlya Dryomov 
480159e542c8SIlya Dryomov 	if (rbd_img_is_write(img_req)) {
480259e542c8SIlya Dryomov 		if (rbd_is_ro(rbd_dev)) {
480359e542c8SIlya Dryomov 			rbd_warn(rbd_dev, "%s on read-only mapping",
480459e542c8SIlya Dryomov 				 obj_op_name(img_req->op_type));
480559e542c8SIlya Dryomov 			return BLK_STS_IOERR;
480659e542c8SIlya Dryomov 		}
480759e542c8SIlya Dryomov 		rbd_assert(!rbd_is_snap(rbd_dev));
480859e542c8SIlya Dryomov 	}
480959e542c8SIlya Dryomov 
481059e542c8SIlya Dryomov 	INIT_WORK(&img_req->work, rbd_queue_workfn);
481159e542c8SIlya Dryomov 	queue_work(rbd_wq, &img_req->work);
4812fc17b653SChristoph Hellwig 	return BLK_STS_OK;
4813bf0d5f50SAlex Elder }
4814bf0d5f50SAlex Elder 
rbd_free_disk(struct rbd_device * rbd_dev)4815602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4816602adf40SYehuda Sadeh {
48178b9ab626SChristoph Hellwig 	put_disk(rbd_dev->disk);
48187ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
48195769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
4820602adf40SYehuda Sadeh }
4821602adf40SYehuda Sadeh 
rbd_obj_read_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,void * buf,int buf_len)4822788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4823fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4824fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4825fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4826788e2df3SAlex Elder 
4827788e2df3SAlex Elder {
4828fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4829fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4830fe5478e0SIlya Dryomov 	struct page **pages;
4831fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4832788e2df3SAlex Elder 	int ret;
4833788e2df3SAlex Elder 
4834fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4835fe5478e0SIlya Dryomov 	if (!req)
4836fe5478e0SIlya Dryomov 		return -ENOMEM;
4837788e2df3SAlex Elder 
4838fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4839fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4840fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4841788e2df3SAlex Elder 
4842fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4843fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4844fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4845fe5478e0SIlya Dryomov 		goto out_req;
4846fe5478e0SIlya Dryomov 	}
48471ceae7efSAlex Elder 
4848fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4849fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4850fe5478e0SIlya Dryomov 					 true);
4851788e2df3SAlex Elder 
485226f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
485326f887e0SIlya Dryomov 	if (ret)
485426f887e0SIlya Dryomov 		goto out_req;
485526f887e0SIlya Dryomov 
4856a8af0d68SJeff Layton 	ceph_osdc_start_request(osdc, req);
4857fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4858fe5478e0SIlya Dryomov 	if (ret >= 0)
4859fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4860fe5478e0SIlya Dryomov 
4861fe5478e0SIlya Dryomov out_req:
4862fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4863788e2df3SAlex Elder 	return ret;
4864788e2df3SAlex Elder }
4865788e2df3SAlex Elder 
4866602adf40SYehuda Sadeh /*
4867662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4868662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4869662518b1SAlex Elder  * information about the image.
48704156d998SAlex Elder  */
rbd_dev_v1_header_info(struct rbd_device * rbd_dev,struct rbd_image_header * header,bool first_time)4871510a7330SIlya Dryomov static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev,
4872510a7330SIlya Dryomov 				  struct rbd_image_header *header,
4873510a7330SIlya Dryomov 				  bool first_time)
48744156d998SAlex Elder {
48754156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
48764156d998SAlex Elder 	u32 snap_count = 0;
48774156d998SAlex Elder 	u64 names_size = 0;
48784156d998SAlex Elder 	u32 want_count;
48794156d998SAlex Elder 	int ret;
48804156d998SAlex Elder 
48814156d998SAlex Elder 	/*
48824156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
48834156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
48844156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
48854156d998SAlex Elder 	 * the number of snapshots could change by the time we read
48864156d998SAlex Elder 	 * it in, in which case we re-read it.
48874156d998SAlex Elder 	 */
48884156d998SAlex Elder 	do {
48894156d998SAlex Elder 		size_t size;
48904156d998SAlex Elder 
48914156d998SAlex Elder 		kfree(ondisk);
48924156d998SAlex Elder 
48934156d998SAlex Elder 		size = sizeof (*ondisk);
48944156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
48954156d998SAlex Elder 		size += names_size;
48964156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
48974156d998SAlex Elder 		if (!ondisk)
4898662518b1SAlex Elder 			return -ENOMEM;
48994156d998SAlex Elder 
4900fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4901fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
49024156d998SAlex Elder 		if (ret < 0)
4903662518b1SAlex Elder 			goto out;
4904c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
49054156d998SAlex Elder 			ret = -ENXIO;
490606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
490706ecc6cbSAlex Elder 				size, ret);
4908662518b1SAlex Elder 			goto out;
49094156d998SAlex Elder 		}
49104156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
49114156d998SAlex Elder 			ret = -ENXIO;
491206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4913662518b1SAlex Elder 			goto out;
49144156d998SAlex Elder 		}
49154156d998SAlex Elder 
49164156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
49174156d998SAlex Elder 		want_count = snap_count;
49184156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
49194156d998SAlex Elder 	} while (snap_count != want_count);
49204156d998SAlex Elder 
4921510a7330SIlya Dryomov 	ret = rbd_header_from_disk(header, ondisk, first_time);
4922662518b1SAlex Elder out:
49234156d998SAlex Elder 	kfree(ondisk);
49244156d998SAlex Elder 
4925dfc5606dSYehuda Sadeh 	return ret;
4926602adf40SYehuda Sadeh }
4927602adf40SYehuda Sadeh 
rbd_dev_update_size(struct rbd_device * rbd_dev)49289875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
49299875201eSJosh Durgin {
49309875201eSJosh Durgin 	sector_t size;
49319875201eSJosh Durgin 
49329875201eSJosh Durgin 	/*
4933811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4934811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4935811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
49369875201eSJosh Durgin 	 */
4937811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4938811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
49399875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
49409875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
4941e864e49aSChristoph Hellwig 		set_capacity_and_notify(rbd_dev->disk, size);
49429875201eSJosh Durgin 	}
49439875201eSJosh Durgin }
49449875201eSJosh Durgin 
4945f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
49467ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
49477ad18afaSChristoph Hellwig };
49487ad18afaSChristoph Hellwig 
rbd_init_disk(struct rbd_device * rbd_dev)4949602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4950602adf40SYehuda Sadeh {
4951602adf40SYehuda Sadeh 	struct gendisk *disk;
4952602adf40SYehuda Sadeh 	struct request_queue *q;
4953420efbdfSIlya Dryomov 	unsigned int objset_bytes =
4954420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
49557ad18afaSChristoph Hellwig 	int err;
4956602adf40SYehuda Sadeh 
49577ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
49587ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4959b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
49607ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
496156d18f62SMing Lei 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
4962f9b6b98dSHannes Reinecke 	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
496359e542c8SIlya Dryomov 	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
49647ad18afaSChristoph Hellwig 
49657ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
49667ad18afaSChristoph Hellwig 	if (err)
4967195b1956SChristoph Hellwig 		return err;
4968029bcbd8SJosh Durgin 
4969195b1956SChristoph Hellwig 	disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
4970195b1956SChristoph Hellwig 	if (IS_ERR(disk)) {
4971195b1956SChristoph Hellwig 		err = PTR_ERR(disk);
49727ad18afaSChristoph Hellwig 		goto out_tag_set;
49737ad18afaSChristoph Hellwig 	}
4974195b1956SChristoph Hellwig 	q = disk->queue;
4975195b1956SChristoph Hellwig 
4976195b1956SChristoph Hellwig 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4977195b1956SChristoph Hellwig 		 rbd_dev->dev_id);
4978195b1956SChristoph Hellwig 	disk->major = rbd_dev->major;
4979195b1956SChristoph Hellwig 	disk->first_minor = rbd_dev->minor;
49801ebe2e5fSChristoph Hellwig 	if (single_major)
4981195b1956SChristoph Hellwig 		disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
49821ebe2e5fSChristoph Hellwig 	else
4983195b1956SChristoph Hellwig 		disk->minors = RBD_MINORS_PER_MAJOR;
4984195b1956SChristoph Hellwig 	disk->fops = &rbd_bd_ops;
49850077a500SIlya Dryomov 	disk->private_data = rbd_dev;
49867ad18afaSChristoph Hellwig 
49878b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4988d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4989593a9e7bSAlex Elder 
4990420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
49910d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
499221acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
499324f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
499416d80c54SIlya Dryomov 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
499516d80c54SIlya Dryomov 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
4996029bcbd8SJosh Durgin 
4997d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
499816d80c54SIlya Dryomov 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
4999420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5000420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5001d9360540SIlya Dryomov 	}
500290e98c52SGuangliang Zhao 
5003bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
50041cb039f3SChristoph Hellwig 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
5005bae818eeSRonny Hegewald 
5006602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
5007602adf40SYehuda Sadeh 
5008602adf40SYehuda Sadeh 	return 0;
50097ad18afaSChristoph Hellwig out_tag_set:
50107ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
50117ad18afaSChristoph Hellwig 	return err;
5012602adf40SYehuda Sadeh }
5013602adf40SYehuda Sadeh 
5014dfc5606dSYehuda Sadeh /*
5015dfc5606dSYehuda Sadeh   sysfs
5016dfc5606dSYehuda Sadeh */
5017602adf40SYehuda Sadeh 
dev_to_rbd_dev(struct device * dev)5018593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5019593a9e7bSAlex Elder {
5020593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
5021593a9e7bSAlex Elder }
5022593a9e7bSAlex Elder 
rbd_size_show(struct device * dev,struct device_attribute * attr,char * buf)5023dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
5024dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5025602adf40SYehuda Sadeh {
5026593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5027dfc5606dSYehuda Sadeh 
5028fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
5029fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
5030602adf40SYehuda Sadeh }
5031602adf40SYehuda Sadeh 
rbd_features_show(struct device * dev,struct device_attribute * attr,char * buf)503234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
503334b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
503434b13184SAlex Elder {
503534b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
503634b13184SAlex Elder 
5037fa58bcadSIlya Dryomov 	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
503834b13184SAlex Elder }
503934b13184SAlex Elder 
rbd_major_show(struct device * dev,struct device_attribute * attr,char * buf)5040dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
5041dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
5042602adf40SYehuda Sadeh {
5043593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5044dfc5606dSYehuda Sadeh 
5045fc71d833SAlex Elder 	if (rbd_dev->major)
5046dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
5047fc71d833SAlex Elder 
5048fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
5049dd82fff1SIlya Dryomov }
5050fc71d833SAlex Elder 
rbd_minor_show(struct device * dev,struct device_attribute * attr,char * buf)5051dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
5052dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
5053dd82fff1SIlya Dryomov {
5054dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5055dd82fff1SIlya Dryomov 
5056dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
5057dfc5606dSYehuda Sadeh }
5058dfc5606dSYehuda Sadeh 
rbd_client_addr_show(struct device * dev,struct device_attribute * attr,char * buf)5059005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
5060005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
5061005a07bfSIlya Dryomov {
5062005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5063005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
5064005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
5065005a07bfSIlya Dryomov 
5066005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5067005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
5068005a07bfSIlya Dryomov }
5069005a07bfSIlya Dryomov 
rbd_client_id_show(struct device * dev,struct device_attribute * attr,char * buf)5070dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
5071dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
5072dfc5606dSYehuda Sadeh {
5073593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5074dfc5606dSYehuda Sadeh 
50751dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
5076033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
5077dfc5606dSYehuda Sadeh }
5078dfc5606dSYehuda Sadeh 
rbd_cluster_fsid_show(struct device * dev,struct device_attribute * attr,char * buf)5079267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
5080267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
5081267fb90bSMike Christie {
5082267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5083267fb90bSMike Christie 
5084267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5085267fb90bSMike Christie }
5086267fb90bSMike Christie 
rbd_config_info_show(struct device * dev,struct device_attribute * attr,char * buf)50870d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
50880d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
50890d6d1e9cSMike Christie {
50900d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
50910d6d1e9cSMike Christie 
5092f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
5093f44d04e6SIlya Dryomov 		return -EPERM;
5094f44d04e6SIlya Dryomov 
50950d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5096dfc5606dSYehuda Sadeh }
5097dfc5606dSYehuda Sadeh 
rbd_pool_show(struct device * dev,struct device_attribute * attr,char * buf)5098dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
5099dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5100dfc5606dSYehuda Sadeh {
5101593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5102dfc5606dSYehuda Sadeh 
51030d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5104dfc5606dSYehuda Sadeh }
5105dfc5606dSYehuda Sadeh 
rbd_pool_id_show(struct device * dev,struct device_attribute * attr,char * buf)51069bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
51079bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
51089bb2f334SAlex Elder {
51099bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
51109bb2f334SAlex Elder 
51110d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
51120d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
51139bb2f334SAlex Elder }
51149bb2f334SAlex Elder 
rbd_pool_ns_show(struct device * dev,struct device_attribute * attr,char * buf)5115b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
5116b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
5117b26c047bSIlya Dryomov {
5118b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5119b26c047bSIlya Dryomov 
5120b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5121b26c047bSIlya Dryomov }
5122b26c047bSIlya Dryomov 
rbd_name_show(struct device * dev,struct device_attribute * attr,char * buf)5123dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
5124dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
5125dfc5606dSYehuda Sadeh {
5126593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5127dfc5606dSYehuda Sadeh 
5128a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
51290d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5130a92ffdf8SAlex Elder 
5131a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
5132dfc5606dSYehuda Sadeh }
5133dfc5606dSYehuda Sadeh 
rbd_image_id_show(struct device * dev,struct device_attribute * attr,char * buf)5134589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
5135589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
5136589d30e0SAlex Elder {
5137589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5138589d30e0SAlex Elder 
51390d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5140589d30e0SAlex Elder }
5141589d30e0SAlex Elder 
514234b13184SAlex Elder /*
514334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
514434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
514534b13184SAlex Elder  */
rbd_snap_show(struct device * dev,struct device_attribute * attr,char * buf)5146dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
5147dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
5148dfc5606dSYehuda Sadeh 			     char *buf)
5149dfc5606dSYehuda Sadeh {
5150593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5151dfc5606dSYehuda Sadeh 
51520d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5153dfc5606dSYehuda Sadeh }
5154dfc5606dSYehuda Sadeh 
rbd_snap_id_show(struct device * dev,struct device_attribute * attr,char * buf)515592a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
515692a58671SMike Christie 				struct device_attribute *attr, char *buf)
515792a58671SMike Christie {
515892a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
515992a58671SMike Christie 
516092a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
516192a58671SMike Christie }
516292a58671SMike Christie 
516386b00e0dSAlex Elder /*
5164ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
5165ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
5166ff96128fSIlya Dryomov  * image)".
516786b00e0dSAlex Elder  */
rbd_parent_show(struct device * dev,struct device_attribute * attr,char * buf)516886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
516986b00e0dSAlex Elder 			       struct device_attribute *attr,
517086b00e0dSAlex Elder 			       char *buf)
517186b00e0dSAlex Elder {
517286b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5173ff96128fSIlya Dryomov 	ssize_t count = 0;
517486b00e0dSAlex Elder 
5175ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
517686b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
517786b00e0dSAlex Elder 
5178ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5179ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
518086b00e0dSAlex Elder 
5181ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
5182ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
5183e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
5184ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
5185ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
5186ff96128fSIlya Dryomov 			    "overlap %llu\n",
5187ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
5188ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
5189e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
5190ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
5191ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
5192ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
5193ff96128fSIlya Dryomov 	}
519486b00e0dSAlex Elder 
519586b00e0dSAlex Elder 	return count;
519686b00e0dSAlex Elder }
519786b00e0dSAlex Elder 
rbd_image_refresh(struct device * dev,struct device_attribute * attr,const char * buf,size_t size)5198dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
5199dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
5200dfc5606dSYehuda Sadeh 				 const char *buf,
5201dfc5606dSYehuda Sadeh 				 size_t size)
5202dfc5606dSYehuda Sadeh {
5203593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5204b813623aSAlex Elder 	int ret;
5205602adf40SYehuda Sadeh 
5206f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
5207f44d04e6SIlya Dryomov 		return -EPERM;
5208f44d04e6SIlya Dryomov 
5209cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
5210e627db08SAlex Elder 	if (ret)
521152bb1f9bSIlya Dryomov 		return ret;
5212b813623aSAlex Elder 
521352bb1f9bSIlya Dryomov 	return size;
5214dfc5606dSYehuda Sadeh }
5215602adf40SYehuda Sadeh 
52165657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
52175657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
52185657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
52195657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
52205657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
52215657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
52225657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
52235657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
52245657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
52255657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5226b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
52275657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
52285657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
52295657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
52305657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
52315657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
52325657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5233dfc5606dSYehuda Sadeh 
5234dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
5235dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
523634b13184SAlex Elder 	&dev_attr_features.attr,
5237dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
5238dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
5239005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
5240dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
5241267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
52420d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
5243dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
52449bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
5245b26c047bSIlya Dryomov 	&dev_attr_pool_ns.attr,
5246dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
5247589d30e0SAlex Elder 	&dev_attr_image_id.attr,
5248dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
524992a58671SMike Christie 	&dev_attr_snap_id.attr,
525086b00e0dSAlex Elder 	&dev_attr_parent.attr,
5251dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
5252dfc5606dSYehuda Sadeh 	NULL
5253dfc5606dSYehuda Sadeh };
5254dfc5606dSYehuda Sadeh 
5255dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
5256dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
5257dfc5606dSYehuda Sadeh };
5258dfc5606dSYehuda Sadeh 
5259dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
5260dfc5606dSYehuda Sadeh 	&rbd_attr_group,
5261dfc5606dSYehuda Sadeh 	NULL
5262dfc5606dSYehuda Sadeh };
5263dfc5606dSYehuda Sadeh 
52646cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
5265dfc5606dSYehuda Sadeh 
5266b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
5267dfc5606dSYehuda Sadeh 	.name		= "rbd",
5268dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
52696cac4695SIlya Dryomov 	.release	= rbd_dev_release,
5270dfc5606dSYehuda Sadeh };
5271dfc5606dSYehuda Sadeh 
rbd_spec_get(struct rbd_spec * spec)52728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
52738b8fb99cSAlex Elder {
52748b8fb99cSAlex Elder 	kref_get(&spec->kref);
52758b8fb99cSAlex Elder 
52768b8fb99cSAlex Elder 	return spec;
52778b8fb99cSAlex Elder }
52788b8fb99cSAlex Elder 
52798b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
rbd_spec_put(struct rbd_spec * spec)52808b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
52818b8fb99cSAlex Elder {
52828b8fb99cSAlex Elder 	if (spec)
52838b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
52848b8fb99cSAlex Elder }
52858b8fb99cSAlex Elder 
rbd_spec_alloc(void)52868b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
52878b8fb99cSAlex Elder {
52888b8fb99cSAlex Elder 	struct rbd_spec *spec;
52898b8fb99cSAlex Elder 
52908b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
52918b8fb99cSAlex Elder 	if (!spec)
52928b8fb99cSAlex Elder 		return NULL;
529304077599SIlya Dryomov 
529404077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
529504077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
52968b8fb99cSAlex Elder 	kref_init(&spec->kref);
52978b8fb99cSAlex Elder 
52988b8fb99cSAlex Elder 	return spec;
52998b8fb99cSAlex Elder }
53008b8fb99cSAlex Elder 
rbd_spec_free(struct kref * kref)53018b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
53028b8fb99cSAlex Elder {
53038b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
53048b8fb99cSAlex Elder 
53058b8fb99cSAlex Elder 	kfree(spec->pool_name);
5306b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
53078b8fb99cSAlex Elder 	kfree(spec->image_id);
53088b8fb99cSAlex Elder 	kfree(spec->image_name);
53098b8fb99cSAlex Elder 	kfree(spec->snap_name);
53108b8fb99cSAlex Elder 	kfree(spec);
53118b8fb99cSAlex Elder }
53128b8fb99cSAlex Elder 
rbd_dev_free(struct rbd_device * rbd_dev)53131643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
5314dd5ac32dSIlya Dryomov {
531599d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5316ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5317dd5ac32dSIlya Dryomov 
5318c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
53196b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
53200d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
5321c41d13a3SIlya Dryomov 
5322dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
5323dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
5324dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
5325dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
53261643dfa4SIlya Dryomov }
53271643dfa4SIlya Dryomov 
rbd_dev_release(struct device * dev)53281643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
53291643dfa4SIlya Dryomov {
53301643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
53311643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
53321643dfa4SIlya Dryomov 
53331643dfa4SIlya Dryomov 	if (need_put) {
53341643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
53351643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
53361643dfa4SIlya Dryomov 	}
53371643dfa4SIlya Dryomov 
53381643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
5339dd5ac32dSIlya Dryomov 
5340dd5ac32dSIlya Dryomov 	/*
5341dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
5342dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
5343dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
5344dd5ac32dSIlya Dryomov 	 */
5345dd5ac32dSIlya Dryomov 	if (need_put)
5346dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
5347dd5ac32dSIlya Dryomov }
5348dd5ac32dSIlya Dryomov 
__rbd_dev_create(struct rbd_spec * spec)5349f7c4d9b1SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
5350c53d5893SAlex Elder {
5351c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
5352c53d5893SAlex Elder 
5353c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5354c53d5893SAlex Elder 	if (!rbd_dev)
5355c53d5893SAlex Elder 		return NULL;
5356c53d5893SAlex Elder 
5357c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
5358c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
5359c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
5360c53d5893SAlex Elder 
53617e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5362c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
5363431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
5364b26c047bSIlya Dryomov 	if (spec->pool_ns) {
5365b26c047bSIlya Dryomov 		WARN_ON(!*spec->pool_ns);
5366b26c047bSIlya Dryomov 		rbd_dev->header_oloc.pool_ns =
5367b26c047bSIlya Dryomov 		    ceph_find_or_create_string(spec->pool_ns,
5368b26c047bSIlya Dryomov 					       strlen(spec->pool_ns));
5369b26c047bSIlya Dryomov 	}
5370c41d13a3SIlya Dryomov 
537199d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
537299d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
537399d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
537499d16943SIlya Dryomov 
5375ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
5376ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5377ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5378ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5379ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5380ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5381e1fddc8fSIlya Dryomov 	spin_lock_init(&rbd_dev->lock_lists_lock);
5382637cd060SIlya Dryomov 	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5383e1fddc8fSIlya Dryomov 	INIT_LIST_HEAD(&rbd_dev->running_list);
5384637cd060SIlya Dryomov 	init_completion(&rbd_dev->acquire_wait);
5385e1fddc8fSIlya Dryomov 	init_completion(&rbd_dev->releasing_wait);
5386ed95b21aSIlya Dryomov 
538722e8bd51SIlya Dryomov 	spin_lock_init(&rbd_dev->object_map_lock);
5388c53d5893SAlex Elder 
5389dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
5390dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
5391dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
5392dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
5393dd5ac32dSIlya Dryomov 
53941643dfa4SIlya Dryomov 	return rbd_dev;
53951643dfa4SIlya Dryomov }
53961643dfa4SIlya Dryomov 
5397dd5ac32dSIlya Dryomov /*
53981643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
5399dd5ac32dSIlya Dryomov  */
rbd_dev_create(struct rbd_client * rbdc,struct rbd_spec * spec,struct rbd_options * opts)54001643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
54011643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
54021643dfa4SIlya Dryomov 					 struct rbd_options *opts)
54031643dfa4SIlya Dryomov {
54041643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
54051643dfa4SIlya Dryomov 
5406f7c4d9b1SIlya Dryomov 	rbd_dev = __rbd_dev_create(spec);
54071643dfa4SIlya Dryomov 	if (!rbd_dev)
54081643dfa4SIlya Dryomov 		return NULL;
54091643dfa4SIlya Dryomov 
54101643dfa4SIlya Dryomov 	/* get an id and fill in device name */
54111643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
54121643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
54131643dfa4SIlya Dryomov 					 GFP_KERNEL);
54141643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
54151643dfa4SIlya Dryomov 		goto fail_rbd_dev;
54161643dfa4SIlya Dryomov 
54171643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
54181643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
54191643dfa4SIlya Dryomov 						   rbd_dev->name);
54201643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
54211643dfa4SIlya Dryomov 		goto fail_dev_id;
54221643dfa4SIlya Dryomov 
54231643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
5424dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
5425dd5ac32dSIlya Dryomov 
5426f7c4d9b1SIlya Dryomov 	rbd_dev->rbd_client = rbdc;
5427f7c4d9b1SIlya Dryomov 	rbd_dev->spec = spec;
5428f7c4d9b1SIlya Dryomov 	rbd_dev->opts = opts;
5429f7c4d9b1SIlya Dryomov 
54301643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5431c53d5893SAlex Elder 	return rbd_dev;
54321643dfa4SIlya Dryomov 
54331643dfa4SIlya Dryomov fail_dev_id:
54341643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
54351643dfa4SIlya Dryomov fail_rbd_dev:
54361643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
54371643dfa4SIlya Dryomov 	return NULL;
5438c53d5893SAlex Elder }
5439c53d5893SAlex Elder 
rbd_dev_destroy(struct rbd_device * rbd_dev)5440c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5441c53d5893SAlex Elder {
5442dd5ac32dSIlya Dryomov 	if (rbd_dev)
5443dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
5444c53d5893SAlex Elder }
5445c53d5893SAlex Elder 
5446dfc5606dSYehuda Sadeh /*
54479d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
54489d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
54499d475de5SAlex Elder  * image.
54509d475de5SAlex Elder  */
_rbd_dev_v2_snap_size(struct rbd_device * rbd_dev,u64 snap_id,u8 * order,u64 * snap_size)54519d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
54529d475de5SAlex Elder 				u8 *order, u64 *snap_size)
54539d475de5SAlex Elder {
54549d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
54559d475de5SAlex Elder 	int ret;
54569d475de5SAlex Elder 	struct {
54579d475de5SAlex Elder 		u8 order;
54589d475de5SAlex Elder 		__le64 size;
54599d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
54609d475de5SAlex Elder 
5461ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5462ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
54634157976bSAlex Elder 				  &snapid, sizeof(snapid),
5464e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
546536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
54669d475de5SAlex Elder 	if (ret < 0)
54679d475de5SAlex Elder 		return ret;
546857385b51SAlex Elder 	if (ret < sizeof (size_buf))
546957385b51SAlex Elder 		return -ERANGE;
54709d475de5SAlex Elder 
5471c3545579SJosh Durgin 	if (order) {
54729d475de5SAlex Elder 		*order = size_buf.order;
5473c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
5474c3545579SJosh Durgin 	}
54759d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
54769d475de5SAlex Elder 
5477c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5478c3545579SJosh Durgin 		(unsigned long long)snap_id,
54799d475de5SAlex Elder 		(unsigned long long)*snap_size);
54809d475de5SAlex Elder 
54819d475de5SAlex Elder 	return 0;
54829d475de5SAlex Elder }
54839d475de5SAlex Elder 
rbd_dev_v2_object_prefix(struct rbd_device * rbd_dev,char ** pobject_prefix)5484510a7330SIlya Dryomov static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev,
5485510a7330SIlya Dryomov 				    char **pobject_prefix)
54861e130199SAlex Elder {
54875435d206SDongsheng Yang 	size_t size;
54881e130199SAlex Elder 	void *reply_buf;
5489510a7330SIlya Dryomov 	char *object_prefix;
54901e130199SAlex Elder 	int ret;
54911e130199SAlex Elder 	void *p;
54921e130199SAlex Elder 
54935435d206SDongsheng Yang 	/* Response will be an encoded string, which includes a length */
54945435d206SDongsheng Yang 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
54955435d206SDongsheng Yang 	reply_buf = kzalloc(size, GFP_KERNEL);
54961e130199SAlex Elder 	if (!reply_buf)
54971e130199SAlex Elder 		return -ENOMEM;
54981e130199SAlex Elder 
5499ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5500ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
55015435d206SDongsheng Yang 				  NULL, 0, reply_buf, size);
550236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
55031e130199SAlex Elder 	if (ret < 0)
55041e130199SAlex Elder 		goto out;
55051e130199SAlex Elder 
55061e130199SAlex Elder 	p = reply_buf;
5507510a7330SIlya Dryomov 	object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL,
5508510a7330SIlya Dryomov 						    GFP_NOIO);
5509510a7330SIlya Dryomov 	if (IS_ERR(object_prefix)) {
5510510a7330SIlya Dryomov 		ret = PTR_ERR(object_prefix);
5511510a7330SIlya Dryomov 		goto out;
5512510a7330SIlya Dryomov 	}
551357385b51SAlex Elder 	ret = 0;
55141e130199SAlex Elder 
5515510a7330SIlya Dryomov 	*pobject_prefix = object_prefix;
5516510a7330SIlya Dryomov 	dout("  object_prefix = %s\n", object_prefix);
55171e130199SAlex Elder out:
55181e130199SAlex Elder 	kfree(reply_buf);
55191e130199SAlex Elder 
55201e130199SAlex Elder 	return ret;
55211e130199SAlex Elder }
55221e130199SAlex Elder 
/*
 * Query the feature bits of an image snapshot (or of the base image,
 * depending on @snap_id) via the "get_features" class method on the
 * header object.
 *
 * @read_only is forwarded to the class method as a single byte.
 *
 * Returns 0 and stores the feature mask in *@snap_features on success.
 * Returns a negative errno from the method call, -ERANGE if the reply
 * is shorter than expected, or -ENXIO if the image requires features
 * outside RBD_FEATURES_SUPPORTED.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     bool read_only, u64 *snap_features)
{
	/*
	 * Request payload.  This must be packed: without it the
	 * compiler pads the struct after read_only, and since the
	 * whole struct is sent (sizeof) those padding bytes would be
	 * uninitialized stack memory going out on the wire.
	 */
	struct {
		__le64 snap_id;
		u8 read_only;
	} __attribute__ ((packed)) features_in;
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	features_in.snap_id = cpu_to_le64(snap_id);
	features_in.read_only = read_only;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &features_in, sizeof(features_in),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	/* refuse images that need incompatible features we don't support */
	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
5566b1b5402aSAlex Elder 
556722e8bd51SIlya Dryomov /*
556822e8bd51SIlya Dryomov  * These are generic image flags, but since they are used only for
556922e8bd51SIlya Dryomov  * object map, store them in rbd_dev->object_map_flags.
557022e8bd51SIlya Dryomov  *
557122e8bd51SIlya Dryomov  * For the same reason, this function is called only on object map
557222e8bd51SIlya Dryomov  * (re)load and not on header refresh.
557322e8bd51SIlya Dryomov  */
rbd_dev_v2_get_flags(struct rbd_device * rbd_dev)557422e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
557522e8bd51SIlya Dryomov {
557622e8bd51SIlya Dryomov 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
557722e8bd51SIlya Dryomov 	__le64 flags;
557822e8bd51SIlya Dryomov 	int ret;
557922e8bd51SIlya Dryomov 
558022e8bd51SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
558122e8bd51SIlya Dryomov 				  &rbd_dev->header_oloc, "get_flags",
558222e8bd51SIlya Dryomov 				  &snapid, sizeof(snapid),
558322e8bd51SIlya Dryomov 				  &flags, sizeof(flags));
558422e8bd51SIlya Dryomov 	if (ret < 0)
558522e8bd51SIlya Dryomov 		return ret;
558622e8bd51SIlya Dryomov 	if (ret < sizeof(flags))
558722e8bd51SIlya Dryomov 		return -EBADMSG;
558822e8bd51SIlya Dryomov 
558922e8bd51SIlya Dryomov 	rbd_dev->object_map_flags = le64_to_cpu(flags);
559022e8bd51SIlya Dryomov 	return 0;
559122e8bd51SIlya Dryomov }
559222e8bd51SIlya Dryomov 
5593eb3b2d6bSIlya Dryomov struct parent_image_info {
5594eb3b2d6bSIlya Dryomov 	u64		pool_id;
5595e92c0eafSIlya Dryomov 	const char	*pool_ns;
5596eb3b2d6bSIlya Dryomov 	const char	*image_id;
5597eb3b2d6bSIlya Dryomov 	u64		snap_id;
5598eb3b2d6bSIlya Dryomov 
5599e92c0eafSIlya Dryomov 	bool		has_overlap;
5600eb3b2d6bSIlya Dryomov 	u64		overlap;
5601eb3b2d6bSIlya Dryomov };
5602eb3b2d6bSIlya Dryomov 
rbd_parent_info_cleanup(struct parent_image_info * pii)5603c1031177SIlya Dryomov static void rbd_parent_info_cleanup(struct parent_image_info *pii)
5604c1031177SIlya Dryomov {
5605c1031177SIlya Dryomov 	kfree(pii->pool_ns);
5606c1031177SIlya Dryomov 	kfree(pii->image_id);
5607c1031177SIlya Dryomov 
5608c1031177SIlya Dryomov 	memset(pii, 0, sizeof(*pii));
5609c1031177SIlya Dryomov }
5610c1031177SIlya Dryomov 
5611eb3b2d6bSIlya Dryomov /*
5612eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
5613eb3b2d6bSIlya Dryomov  */
decode_parent_image_spec(void ** p,void * end,struct parent_image_info * pii)5614e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
5615e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
5616e92c0eafSIlya Dryomov {
5617e92c0eafSIlya Dryomov 	u8 struct_v;
5618e92c0eafSIlya Dryomov 	u32 struct_len;
5619e92c0eafSIlya Dryomov 	int ret;
5620e92c0eafSIlya Dryomov 
5621e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5622e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
5623e92c0eafSIlya Dryomov 	if (ret)
5624e92c0eafSIlya Dryomov 		return ret;
5625e92c0eafSIlya Dryomov 
5626e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5627e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5628e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
5629e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
5630e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
5631e92c0eafSIlya Dryomov 		return ret;
5632e92c0eafSIlya Dryomov 	}
5633e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5634e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5635e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5636e92c0eafSIlya Dryomov 		pii->image_id = NULL;
5637e92c0eafSIlya Dryomov 		return ret;
5638e92c0eafSIlya Dryomov 	}
5639e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5640e92c0eafSIlya Dryomov 	return 0;
5641e92c0eafSIlya Dryomov 
5642e92c0eafSIlya Dryomov e_inval:
5643e92c0eafSIlya Dryomov 	return -EINVAL;
5644e92c0eafSIlya Dryomov }
5645e92c0eafSIlya Dryomov 
__get_parent_info(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5646e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
5647e92c0eafSIlya Dryomov 			     struct page *req_page,
5648e92c0eafSIlya Dryomov 			     struct page *reply_page,
5649e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
5650e92c0eafSIlya Dryomov {
5651e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5652e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5653e92c0eafSIlya Dryomov 	void *p, *end;
5654e92c0eafSIlya Dryomov 	int ret;
5655e92c0eafSIlya Dryomov 
5656e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5657e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
565868ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5659e92c0eafSIlya Dryomov 	if (ret)
5660e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
5661e92c0eafSIlya Dryomov 
5662e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5663e92c0eafSIlya Dryomov 	end = p + reply_len;
5664e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
5665e92c0eafSIlya Dryomov 	if (ret)
5666e92c0eafSIlya Dryomov 		return ret;
5667e92c0eafSIlya Dryomov 
5668e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5669e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
567068ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5671e92c0eafSIlya Dryomov 	if (ret)
5672e92c0eafSIlya Dryomov 		return ret;
5673e92c0eafSIlya Dryomov 
5674e92c0eafSIlya Dryomov 	p = page_address(reply_page);
5675e92c0eafSIlya Dryomov 	end = p + reply_len;
5676e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5677e92c0eafSIlya Dryomov 	if (pii->has_overlap)
5678e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5679e92c0eafSIlya Dryomov 
5680c1031177SIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5681c1031177SIlya Dryomov 	     __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
5682c1031177SIlya Dryomov 	     pii->has_overlap, pii->overlap);
5683e92c0eafSIlya Dryomov 	return 0;
5684e92c0eafSIlya Dryomov 
5685e92c0eafSIlya Dryomov e_inval:
5686e92c0eafSIlya Dryomov 	return -EINVAL;
5687e92c0eafSIlya Dryomov }
5688e92c0eafSIlya Dryomov 
5689e92c0eafSIlya Dryomov /*
5690e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
5691e92c0eafSIlya Dryomov  */
__get_parent_info_legacy(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5692eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5693eb3b2d6bSIlya Dryomov 				    struct page *req_page,
5694eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
5695eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
5696eb3b2d6bSIlya Dryomov {
5697eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5698eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
5699eb3b2d6bSIlya Dryomov 	void *p, *end;
5700eb3b2d6bSIlya Dryomov 	int ret;
5701eb3b2d6bSIlya Dryomov 
5702eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5703eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
570468ada915SIlya Dryomov 			     req_page, sizeof(u64), &reply_page, &reply_len);
5705eb3b2d6bSIlya Dryomov 	if (ret)
5706eb3b2d6bSIlya Dryomov 		return ret;
5707eb3b2d6bSIlya Dryomov 
5708eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
5709eb3b2d6bSIlya Dryomov 	end = p + reply_len;
5710eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5711eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5712eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
5713eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
5714eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
5715eb3b2d6bSIlya Dryomov 		return ret;
5716eb3b2d6bSIlya Dryomov 	}
5717eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5718e92c0eafSIlya Dryomov 	pii->has_overlap = true;
5719eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5720eb3b2d6bSIlya Dryomov 
5721c1031177SIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5722c1031177SIlya Dryomov 	     __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
5723c1031177SIlya Dryomov 	     pii->has_overlap, pii->overlap);
5724eb3b2d6bSIlya Dryomov 	return 0;
5725eb3b2d6bSIlya Dryomov 
5726eb3b2d6bSIlya Dryomov e_inval:
5727eb3b2d6bSIlya Dryomov 	return -EINVAL;
5728eb3b2d6bSIlya Dryomov }
5729eb3b2d6bSIlya Dryomov 
rbd_dev_v2_parent_info(struct rbd_device * rbd_dev,struct parent_image_info * pii)5730c1031177SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
5731eb3b2d6bSIlya Dryomov 				  struct parent_image_info *pii)
5732eb3b2d6bSIlya Dryomov {
5733eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
5734eb3b2d6bSIlya Dryomov 	void *p;
5735eb3b2d6bSIlya Dryomov 	int ret;
5736eb3b2d6bSIlya Dryomov 
5737eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
5738eb3b2d6bSIlya Dryomov 	if (!req_page)
5739eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5740eb3b2d6bSIlya Dryomov 
5741eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
5742eb3b2d6bSIlya Dryomov 	if (!reply_page) {
5743eb3b2d6bSIlya Dryomov 		__free_page(req_page);
5744eb3b2d6bSIlya Dryomov 		return -ENOMEM;
5745eb3b2d6bSIlya Dryomov 	}
5746eb3b2d6bSIlya Dryomov 
5747eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
5748eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5749e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5750e92c0eafSIlya Dryomov 	if (ret > 0)
5751e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5752e92c0eafSIlya Dryomov 					       pii);
5753eb3b2d6bSIlya Dryomov 
5754eb3b2d6bSIlya Dryomov 	__free_page(req_page);
5755eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
5756eb3b2d6bSIlya Dryomov 	return ret;
5757eb3b2d6bSIlya Dryomov }
5758eb3b2d6bSIlya Dryomov 
rbd_dev_setup_parent(struct rbd_device * rbd_dev)5759c1031177SIlya Dryomov static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
576086b00e0dSAlex Elder {
576186b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
5762eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
576386b00e0dSAlex Elder 	int ret;
576486b00e0dSAlex Elder 
576586b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
576686b00e0dSAlex Elder 	if (!parent_spec)
576786b00e0dSAlex Elder 		return -ENOMEM;
576886b00e0dSAlex Elder 
5769c1031177SIlya Dryomov 	ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
5770eb3b2d6bSIlya Dryomov 	if (ret)
577186b00e0dSAlex Elder 		goto out_err;
577286b00e0dSAlex Elder 
5773c1031177SIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
577486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
577586b00e0dSAlex Elder 
57760903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
57770903e875SAlex Elder 
57780903e875SAlex Elder 	ret = -EIO;
5779eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
57809584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5781eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
578257385b51SAlex Elder 		goto out_err;
5783c0cd10dbSAlex Elder 	}
57840903e875SAlex Elder 
57853b5cf2a2SAlex Elder 	/*
5786c1031177SIlya Dryomov 	 * The parent won't change except when the clone is flattened,
5787c1031177SIlya Dryomov 	 * so we only need to record the parent image spec once.
57883b5cf2a2SAlex Elder 	 */
5789eb3b2d6bSIlya Dryomov 	parent_spec->pool_id = pii.pool_id;
5790e92c0eafSIlya Dryomov 	if (pii.pool_ns && *pii.pool_ns) {
5791e92c0eafSIlya Dryomov 		parent_spec->pool_ns = pii.pool_ns;
5792e92c0eafSIlya Dryomov 		pii.pool_ns = NULL;
5793e92c0eafSIlya Dryomov 	}
5794eb3b2d6bSIlya Dryomov 	parent_spec->image_id = pii.image_id;
5795eb3b2d6bSIlya Dryomov 	pii.image_id = NULL;
5796eb3b2d6bSIlya Dryomov 	parent_spec->snap_id = pii.snap_id;
5797b26c047bSIlya Dryomov 
5798c1031177SIlya Dryomov 	rbd_assert(!rbd_dev->parent_spec);
579986b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
580086b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
58013b5cf2a2SAlex Elder 
58023b5cf2a2SAlex Elder 	/*
5803c1031177SIlya Dryomov 	 * Record the parent overlap.  If it's zero, issue a warning as
5804c1031177SIlya Dryomov 	 * we will proceed as if there is no parent.
58053b5cf2a2SAlex Elder 	 */
5806c1031177SIlya Dryomov 	if (!pii.overlap)
5807cf32bd9cSIlya Dryomov 		rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
5808eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
5809cf32bd9cSIlya Dryomov 
581086b00e0dSAlex Elder out:
581186b00e0dSAlex Elder 	ret = 0;
581286b00e0dSAlex Elder out_err:
5813c1031177SIlya Dryomov 	rbd_parent_info_cleanup(&pii);
581486b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
581586b00e0dSAlex Elder 	return ret;
581686b00e0dSAlex Elder }
581786b00e0dSAlex Elder 
/*
 * Fetch the image's stripe unit and stripe count via the
 * "get_stripe_unit_count" class method on the header object.
 *
 * Returns 0 and fills *@stripe_unit and *@stripe_count on success,
 * a negative errno from the method call, or -ERANGE if the reply is
 * shorter than expected.
 */
static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev,
				    u64 *stripe_unit, u64 *stripe_count)
{
	/* on-the-wire reply layout */
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) reply = { 0 };
	const size_t reply_size = sizeof(reply);
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, "get_stripe_unit_count",
				NULL, 0, &reply, reply_size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < reply_size)
		return -ERANGE;

	*stripe_count = le64_to_cpu(reply.stripe_count);
	*stripe_unit = le64_to_cpu(reply.stripe_unit);
	dout("  stripe_unit = %llu stripe_count = %llu\n", *stripe_unit,
	     *stripe_count);

	return 0;
}
5844cc070d59SAlex Elder 
/*
 * Fetch the id of the image's data pool via the "get_data_pool" class
 * method on the header object.
 *
 * Returns 0 and fills *@data_pool_id on success, a negative errno from
 * the method call, or -EBADMSG if the reply is shorter than 64 bits.
 * Warns if the returned id is CEPH_NOPOOL.
 */
static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id)
{
	__le64 wire_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &wire_pool_id,
				  sizeof(wire_pool_id));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	if (ret < sizeof(wire_pool_id))
		return -EBADMSG;

	*data_pool_id = le64_to_cpu(wire_pool_id);
	dout("  data_pool_id = %lld\n", *data_pool_id);
	WARN_ON(*data_pool_id == CEPH_NOPOOL);

	return 0;
}
58667e97332eSIlya Dryomov 
rbd_dev_image_name(struct rbd_device * rbd_dev)58679e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
58689e15b77dSAlex Elder {
5869ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
58709e15b77dSAlex Elder 	size_t image_id_size;
58719e15b77dSAlex Elder 	char *image_id;
58729e15b77dSAlex Elder 	void *p;
58739e15b77dSAlex Elder 	void *end;
58749e15b77dSAlex Elder 	size_t size;
58759e15b77dSAlex Elder 	void *reply_buf = NULL;
58769e15b77dSAlex Elder 	size_t len = 0;
58779e15b77dSAlex Elder 	char *image_name = NULL;
58789e15b77dSAlex Elder 	int ret;
58799e15b77dSAlex Elder 
58809e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
58819e15b77dSAlex Elder 
588269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
588369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
58849e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
58859e15b77dSAlex Elder 	if (!image_id)
58869e15b77dSAlex Elder 		return NULL;
58879e15b77dSAlex Elder 
58889e15b77dSAlex Elder 	p = image_id;
58894157976bSAlex Elder 	end = image_id + image_id_size;
589069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
58919e15b77dSAlex Elder 
58929e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
58939e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
58949e15b77dSAlex Elder 	if (!reply_buf)
58959e15b77dSAlex Elder 		goto out;
58969e15b77dSAlex Elder 
5897ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5898ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5899ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5900e2a58ee5SAlex Elder 				  reply_buf, size);
59019e15b77dSAlex Elder 	if (ret < 0)
59029e15b77dSAlex Elder 		goto out;
59039e15b77dSAlex Elder 	p = reply_buf;
5904f40eb349SAlex Elder 	end = reply_buf + ret;
5905f40eb349SAlex Elder 
59069e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
59079e15b77dSAlex Elder 	if (IS_ERR(image_name))
59089e15b77dSAlex Elder 		image_name = NULL;
59099e15b77dSAlex Elder 	else
59109e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
59119e15b77dSAlex Elder out:
59129e15b77dSAlex Elder 	kfree(reply_buf);
59139e15b77dSAlex Elder 	kfree(image_id);
59149e15b77dSAlex Elder 
59159e15b77dSAlex Elder 	return image_name;
59169e15b77dSAlex Elder }
59179e15b77dSAlex Elder 
rbd_v1_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59182ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59192ad3d716SAlex Elder {
59202ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
59212ad3d716SAlex Elder 	const char *snap_name;
59222ad3d716SAlex Elder 	u32 which = 0;
59232ad3d716SAlex Elder 
59242ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
59252ad3d716SAlex Elder 
59262ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
59272ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
59282ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
59292ad3d716SAlex Elder 			return snapc->snaps[which];
59302ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
59312ad3d716SAlex Elder 		which++;
59322ad3d716SAlex Elder 	}
59332ad3d716SAlex Elder 	return CEPH_NOSNAP;
59342ad3d716SAlex Elder }
59352ad3d716SAlex Elder 
rbd_v2_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59362ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59372ad3d716SAlex Elder {
59382ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
59392ad3d716SAlex Elder 	u32 which;
59402ad3d716SAlex Elder 	bool found = false;
59412ad3d716SAlex Elder 	u64 snap_id;
59422ad3d716SAlex Elder 
59432ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
59442ad3d716SAlex Elder 		const char *snap_name;
59452ad3d716SAlex Elder 
59462ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
59472ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5948efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5949efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5950efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5951efadc98aSJosh Durgin 				continue;
5952efadc98aSJosh Durgin 			else
59532ad3d716SAlex Elder 				break;
5954efadc98aSJosh Durgin 		}
59552ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
59562ad3d716SAlex Elder 		kfree(snap_name);
59572ad3d716SAlex Elder 	}
59582ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
59592ad3d716SAlex Elder }
59602ad3d716SAlex Elder 
59612ad3d716SAlex Elder /*
59622ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
59632ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
59642ad3d716SAlex Elder  */
rbd_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59652ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59662ad3d716SAlex Elder {
59672ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
59682ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
59692ad3d716SAlex Elder 
59702ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
59712ad3d716SAlex Elder }
59722ad3d716SAlex Elder 
59739e15b77dSAlex Elder /*
597404077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
59759e15b77dSAlex Elder  */
rbd_spec_fill_snap_id(struct rbd_device * rbd_dev)597604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
597704077599SIlya Dryomov {
597804077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
597904077599SIlya Dryomov 
598004077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
598104077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
598204077599SIlya Dryomov 	rbd_assert(spec->snap_name);
598304077599SIlya Dryomov 
598404077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
598504077599SIlya Dryomov 		u64 snap_id;
598604077599SIlya Dryomov 
598704077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
598804077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
598904077599SIlya Dryomov 			return -ENOENT;
599004077599SIlya Dryomov 
599104077599SIlya Dryomov 		spec->snap_id = snap_id;
599204077599SIlya Dryomov 	} else {
599304077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
599404077599SIlya Dryomov 	}
599504077599SIlya Dryomov 
599604077599SIlya Dryomov 	return 0;
599704077599SIlya Dryomov }
599804077599SIlya Dryomov 
599904077599SIlya Dryomov /*
600004077599SIlya Dryomov  * A parent image will have all ids but none of the names.
600104077599SIlya Dryomov  *
600204077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
600304077599SIlya Dryomov  * can't figure out the name for an image id.
600404077599SIlya Dryomov  */
rbd_spec_fill_names(struct rbd_device * rbd_dev)600504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
60069e15b77dSAlex Elder {
60072e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
60082e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
60092e9f7f1cSAlex Elder 	const char *pool_name;
60102e9f7f1cSAlex Elder 	const char *image_name;
60112e9f7f1cSAlex Elder 	const char *snap_name;
60129e15b77dSAlex Elder 	int ret;
60139e15b77dSAlex Elder 
601404077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
601504077599SIlya Dryomov 	rbd_assert(spec->image_id);
601604077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
60179e15b77dSAlex Elder 
60182e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
60199e15b77dSAlex Elder 
60202e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
60212e9f7f1cSAlex Elder 	if (!pool_name) {
60222e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6023935dc89fSAlex Elder 		return -EIO;
6024935dc89fSAlex Elder 	}
60252e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
60262e9f7f1cSAlex Elder 	if (!pool_name)
60279e15b77dSAlex Elder 		return -ENOMEM;
60289e15b77dSAlex Elder 
60299e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
60309e15b77dSAlex Elder 
60312e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
60322e9f7f1cSAlex Elder 	if (!image_name)
603306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
60349e15b77dSAlex Elder 
603504077599SIlya Dryomov 	/* Fetch the snapshot name */
60369e15b77dSAlex Elder 
60372e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6038da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
6039da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
60409e15b77dSAlex Elder 		goto out_err;
60412e9f7f1cSAlex Elder 	}
60422e9f7f1cSAlex Elder 
60432e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
60442e9f7f1cSAlex Elder 	spec->image_name = image_name;
60452e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
60469e15b77dSAlex Elder 
60479e15b77dSAlex Elder 	return 0;
604804077599SIlya Dryomov 
60499e15b77dSAlex Elder out_err:
60502e9f7f1cSAlex Elder 	kfree(image_name);
60512e9f7f1cSAlex Elder 	kfree(pool_name);
60529e15b77dSAlex Elder 	return ret;
60539e15b77dSAlex Elder }
60549e15b77dSAlex Elder 
rbd_dev_v2_snap_context(struct rbd_device * rbd_dev,struct ceph_snap_context ** psnapc)6055510a7330SIlya Dryomov static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev,
6056510a7330SIlya Dryomov 				   struct ceph_snap_context **psnapc)
605735d489f9SAlex Elder {
605835d489f9SAlex Elder 	size_t size;
605935d489f9SAlex Elder 	int ret;
606035d489f9SAlex Elder 	void *reply_buf;
606135d489f9SAlex Elder 	void *p;
606235d489f9SAlex Elder 	void *end;
606335d489f9SAlex Elder 	u64 seq;
606435d489f9SAlex Elder 	u32 snap_count;
606535d489f9SAlex Elder 	struct ceph_snap_context *snapc;
606635d489f9SAlex Elder 	u32 i;
606735d489f9SAlex Elder 
606835d489f9SAlex Elder 	/*
606935d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
607035d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
607135d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
607235d489f9SAlex Elder 	 * prepared to receive.
607335d489f9SAlex Elder 	 */
607435d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
607535d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
607635d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
607735d489f9SAlex Elder 	if (!reply_buf)
607835d489f9SAlex Elder 		return -ENOMEM;
607935d489f9SAlex Elder 
6080ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6081ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
6082ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
608336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
608435d489f9SAlex Elder 	if (ret < 0)
608535d489f9SAlex Elder 		goto out;
608635d489f9SAlex Elder 
608735d489f9SAlex Elder 	p = reply_buf;
608857385b51SAlex Elder 	end = reply_buf + ret;
608957385b51SAlex Elder 	ret = -ERANGE;
609035d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
609135d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
609235d489f9SAlex Elder 
609335d489f9SAlex Elder 	/*
609435d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
609535d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
609635d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
609735d489f9SAlex Elder 	 * allocate is representable in a size_t.
609835d489f9SAlex Elder 	 */
609935d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
610035d489f9SAlex Elder 				 / sizeof (u64)) {
610135d489f9SAlex Elder 		ret = -EINVAL;
610235d489f9SAlex Elder 		goto out;
610335d489f9SAlex Elder 	}
610435d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
610535d489f9SAlex Elder 		goto out;
6106468521c1SAlex Elder 	ret = 0;
610735d489f9SAlex Elder 
6108812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
610935d489f9SAlex Elder 	if (!snapc) {
611035d489f9SAlex Elder 		ret = -ENOMEM;
611135d489f9SAlex Elder 		goto out;
611235d489f9SAlex Elder 	}
611335d489f9SAlex Elder 	snapc->seq = seq;
611435d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
611535d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
611635d489f9SAlex Elder 
6117510a7330SIlya Dryomov 	*psnapc = snapc;
611835d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
611935d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
612035d489f9SAlex Elder out:
612135d489f9SAlex Elder 	kfree(reply_buf);
612235d489f9SAlex Elder 
612357385b51SAlex Elder 	return ret;
612435d489f9SAlex Elder }
612535d489f9SAlex Elder 
/*
 * Fetch the name of snapshot @snap_id of a format 2 image by invoking
 * the "get_snapshot_name" class method on the header object.
 *
 * The id is sent in little-endian form; the reply is decoded as an
 * encoded string of at most RBD_MAX_SNAP_NAME_LEN bytes plus its 32-bit
 * length prefix.
 *
 * Returns the string extracted with GFP_KERNEL (callers in this file
 * release it with kfree()), or an ERR_PTR() carrying -ENOMEM, the
 * negative value returned by rbd_obj_method_sync(), or the error from
 * ceph_extract_encoded_string().
 */
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	const size_t buf_size = sizeof(__le32) + RBD_MAX_SNAP_NAME_LEN;
	__le64 snapid = cpu_to_le64(snap_id);
	char *snap_name;
	void *reply_buf;
	void *p;
	void *end;
	int ret;

	reply_buf = kmalloc(buf_size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, buf_size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
	} else {
		/* A non-negative return is used as the length of the reply */
		p = reply_buf;
		end = reply_buf + ret;
		snap_name = ceph_extract_encoded_string(&p, end, NULL,
							GFP_KERNEL);
		if (!IS_ERR(snap_name))
			dout("  snap_id 0x%016llx snap_name = %s\n",
				(unsigned long long)snap_id, snap_name);
	}

	kfree(reply_buf);

	return snap_name;
}
6165b8b1e2dbSAlex Elder 
rbd_dev_v2_header_info(struct rbd_device * rbd_dev,struct rbd_image_header * header,bool first_time)6166510a7330SIlya Dryomov static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev,
6167510a7330SIlya Dryomov 				  struct rbd_image_header *header,
6168510a7330SIlya Dryomov 				  bool first_time)
6169117973fbSAlex Elder {
6170117973fbSAlex Elder 	int ret;
6171117973fbSAlex Elder 
6172510a7330SIlya Dryomov 	ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
6173510a7330SIlya Dryomov 				    first_time ? &header->obj_order : NULL,
6174510a7330SIlya Dryomov 				    &header->image_size);
61751617e40cSJosh Durgin 	if (ret)
6176cfbf6377SAlex Elder 		return ret;
61771617e40cSJosh Durgin 
61782df3fac7SAlex Elder 	if (first_time) {
6179510a7330SIlya Dryomov 		ret = rbd_dev_v2_header_onetime(rbd_dev, header);
61802df3fac7SAlex Elder 		if (ret)
6181cfbf6377SAlex Elder 			return ret;
61822df3fac7SAlex Elder 	}
61832df3fac7SAlex Elder 
6184510a7330SIlya Dryomov 	ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc);
6185510a7330SIlya Dryomov 	if (ret)
6186117973fbSAlex Elder 		return ret;
6187510a7330SIlya Dryomov 
6188510a7330SIlya Dryomov 	return 0;
6189117973fbSAlex Elder }
6190117973fbSAlex Elder 
rbd_dev_header_info(struct rbd_device * rbd_dev,struct rbd_image_header * header,bool first_time)6191510a7330SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev,
6192510a7330SIlya Dryomov 			       struct rbd_image_header *header,
6193510a7330SIlya Dryomov 			       bool first_time)
6194a720ae09SIlya Dryomov {
6195a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6196510a7330SIlya Dryomov 	rbd_assert(!header->object_prefix && !header->snapc);
6197a720ae09SIlya Dryomov 
6198a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
6199510a7330SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev, header, first_time);
6200a720ae09SIlya Dryomov 
6201510a7330SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev, header, first_time);
6202a720ae09SIlya Dryomov }
6203a720ae09SIlya Dryomov 
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  If only white space
 * remains, *buf ends up pointing at the terminating '\0'.
 *
 * Returns the length of the token (string of non-white space
 * characters) found at the updated *buf, which is 0 when there is no
 * further token.  *buf is not advanced past the token itself.
 *
 * Note that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
6222e28fff26SAlex Elder 
6223e28fff26SAlex Elder /*
6224ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
6225ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
6226ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6227ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
6228ea3352f4SAlex Elder  *
6229ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
6230ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
6231ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
6232ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
6233ea3352f4SAlex Elder  *
6234ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
6235ea3352f4SAlex Elder  * the end of the found token.
6236ea3352f4SAlex Elder  *
6237ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
6238ea3352f4SAlex Elder  */
dup_token(const char ** buf,size_t * lenp)6239ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
6240ea3352f4SAlex Elder {
6241ea3352f4SAlex Elder 	char *dup;
6242ea3352f4SAlex Elder 	size_t len;
6243ea3352f4SAlex Elder 
6244ea3352f4SAlex Elder 	len = next_token(buf);
62454caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6246ea3352f4SAlex Elder 	if (!dup)
6247ea3352f4SAlex Elder 		return NULL;
6248ea3352f4SAlex Elder 	*(dup + len) = '\0';
6249ea3352f4SAlex Elder 	*buf += len;
6250ea3352f4SAlex Elder 
6251ea3352f4SAlex Elder 	if (lenp)
6252ea3352f4SAlex Elder 		*lenp = len;
6253ea3352f4SAlex Elder 
6254ea3352f4SAlex Elder 	return dup;
6255ea3352f4SAlex Elder }
6256ea3352f4SAlex Elder 
rbd_parse_param(struct fs_parameter * param,struct rbd_parse_opts_ctx * pctx)625782995cc6SDavid Howells static int rbd_parse_param(struct fs_parameter *param,
625882995cc6SDavid Howells 			    struct rbd_parse_opts_ctx *pctx)
625982995cc6SDavid Howells {
626082995cc6SDavid Howells 	struct rbd_options *opt = pctx->opts;
626182995cc6SDavid Howells 	struct fs_parse_result result;
62623fbb8d55SAl Viro 	struct p_log log = {.prefix = "rbd"};
626382995cc6SDavid Howells 	int token, ret;
626482995cc6SDavid Howells 
626582995cc6SDavid Howells 	ret = ceph_parse_param(param, pctx->copts, NULL);
626682995cc6SDavid Howells 	if (ret != -ENOPARAM)
626782995cc6SDavid Howells 		return ret;
626882995cc6SDavid Howells 
6269d7167b14SAl Viro 	token = __fs_parse(&log, rbd_parameters, param, &result);
627082995cc6SDavid Howells 	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
627182995cc6SDavid Howells 	if (token < 0) {
62722c3f3dc3SAl Viro 		if (token == -ENOPARAM)
62732c3f3dc3SAl Viro 			return inval_plog(&log, "Unknown parameter '%s'",
627482995cc6SDavid Howells 					  param->key);
627582995cc6SDavid Howells 		return token;
627682995cc6SDavid Howells 	}
627782995cc6SDavid Howells 
627882995cc6SDavid Howells 	switch (token) {
627982995cc6SDavid Howells 	case Opt_queue_depth:
628082995cc6SDavid Howells 		if (result.uint_32 < 1)
628182995cc6SDavid Howells 			goto out_of_range;
628282995cc6SDavid Howells 		opt->queue_depth = result.uint_32;
628382995cc6SDavid Howells 		break;
628482995cc6SDavid Howells 	case Opt_alloc_size:
628582995cc6SDavid Howells 		if (result.uint_32 < SECTOR_SIZE)
628682995cc6SDavid Howells 			goto out_of_range;
62872c3f3dc3SAl Viro 		if (!is_power_of_2(result.uint_32))
62882c3f3dc3SAl Viro 			return inval_plog(&log, "alloc_size must be a power of 2");
628982995cc6SDavid Howells 		opt->alloc_size = result.uint_32;
629082995cc6SDavid Howells 		break;
629182995cc6SDavid Howells 	case Opt_lock_timeout:
629282995cc6SDavid Howells 		/* 0 is "wait forever" (i.e. infinite timeout) */
629382995cc6SDavid Howells 		if (result.uint_32 > INT_MAX / 1000)
629482995cc6SDavid Howells 			goto out_of_range;
629582995cc6SDavid Howells 		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
629682995cc6SDavid Howells 		break;
629782995cc6SDavid Howells 	case Opt_pool_ns:
629882995cc6SDavid Howells 		kfree(pctx->spec->pool_ns);
629982995cc6SDavid Howells 		pctx->spec->pool_ns = param->string;
630082995cc6SDavid Howells 		param->string = NULL;
630182995cc6SDavid Howells 		break;
6302dc1dad8eSIlya Dryomov 	case Opt_compression_hint:
6303dc1dad8eSIlya Dryomov 		switch (result.uint_32) {
6304dc1dad8eSIlya Dryomov 		case Opt_compression_hint_none:
6305dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6306dc1dad8eSIlya Dryomov 			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6307dc1dad8eSIlya Dryomov 			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6308dc1dad8eSIlya Dryomov 			break;
6309dc1dad8eSIlya Dryomov 		case Opt_compression_hint_compressible:
6310dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags |=
6311dc1dad8eSIlya Dryomov 			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6312dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6313dc1dad8eSIlya Dryomov 			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6314dc1dad8eSIlya Dryomov 			break;
6315dc1dad8eSIlya Dryomov 		case Opt_compression_hint_incompressible:
6316dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags |=
6317dc1dad8eSIlya Dryomov 			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6318dc1dad8eSIlya Dryomov 			opt->alloc_hint_flags &=
6319dc1dad8eSIlya Dryomov 			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6320dc1dad8eSIlya Dryomov 			break;
6321dc1dad8eSIlya Dryomov 		default:
6322dc1dad8eSIlya Dryomov 			BUG();
6323dc1dad8eSIlya Dryomov 		}
6324dc1dad8eSIlya Dryomov 		break;
632582995cc6SDavid Howells 	case Opt_read_only:
632682995cc6SDavid Howells 		opt->read_only = true;
632782995cc6SDavid Howells 		break;
632882995cc6SDavid Howells 	case Opt_read_write:
632982995cc6SDavid Howells 		opt->read_only = false;
633082995cc6SDavid Howells 		break;
633182995cc6SDavid Howells 	case Opt_lock_on_read:
633282995cc6SDavid Howells 		opt->lock_on_read = true;
633382995cc6SDavid Howells 		break;
633482995cc6SDavid Howells 	case Opt_exclusive:
633582995cc6SDavid Howells 		opt->exclusive = true;
633682995cc6SDavid Howells 		break;
633782995cc6SDavid Howells 	case Opt_notrim:
633882995cc6SDavid Howells 		opt->trim = false;
633982995cc6SDavid Howells 		break;
634082995cc6SDavid Howells 	default:
634182995cc6SDavid Howells 		BUG();
634282995cc6SDavid Howells 	}
634382995cc6SDavid Howells 
634482995cc6SDavid Howells 	return 0;
634582995cc6SDavid Howells 
634682995cc6SDavid Howells out_of_range:
63472c3f3dc3SAl Viro 	return inval_plog(&log, "%s out of range", param->key);
634882995cc6SDavid Howells }
634982995cc6SDavid Howells 
635082995cc6SDavid Howells /*
635182995cc6SDavid Howells  * This duplicates most of generic_parse_monolithic(), untying it from
635282995cc6SDavid Howells  * fs_context and skipping standard superblock and security options.
635382995cc6SDavid Howells  */
rbd_parse_options(char * options,struct rbd_parse_opts_ctx * pctx)635482995cc6SDavid Howells static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
635582995cc6SDavid Howells {
635682995cc6SDavid Howells 	char *key;
635782995cc6SDavid Howells 	int ret = 0;
635882995cc6SDavid Howells 
635982995cc6SDavid Howells 	dout("%s '%s'\n", __func__, options);
636082995cc6SDavid Howells 	while ((key = strsep(&options, ",")) != NULL) {
636182995cc6SDavid Howells 		if (*key) {
636282995cc6SDavid Howells 			struct fs_parameter param = {
636382995cc6SDavid Howells 				.key	= key,
63640f89589aSAl Viro 				.type	= fs_value_is_flag,
636582995cc6SDavid Howells 			};
636682995cc6SDavid Howells 			char *value = strchr(key, '=');
636782995cc6SDavid Howells 			size_t v_len = 0;
636882995cc6SDavid Howells 
636982995cc6SDavid Howells 			if (value) {
637082995cc6SDavid Howells 				if (value == key)
637182995cc6SDavid Howells 					continue;
637282995cc6SDavid Howells 				*value++ = 0;
637382995cc6SDavid Howells 				v_len = strlen(value);
637482995cc6SDavid Howells 				param.string = kmemdup_nul(value, v_len,
637582995cc6SDavid Howells 							   GFP_KERNEL);
637682995cc6SDavid Howells 				if (!param.string)
637782995cc6SDavid Howells 					return -ENOMEM;
63780f89589aSAl Viro 				param.type = fs_value_is_string;
637982995cc6SDavid Howells 			}
638082995cc6SDavid Howells 			param.size = v_len;
638182995cc6SDavid Howells 
638282995cc6SDavid Howells 			ret = rbd_parse_param(&param, pctx);
638382995cc6SDavid Howells 			kfree(param.string);
638482995cc6SDavid Howells 			if (ret)
638582995cc6SDavid Howells 				break;
638682995cc6SDavid Howells 		}
638782995cc6SDavid Howells 	}
638882995cc6SDavid Howells 
638982995cc6SDavid Howells 	return ret;
639082995cc6SDavid Howells }
639182995cc6SDavid Howells 
6392ea3352f4SAlex Elder /*
6393859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
6394859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6395859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
6396859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
6397d22f76e7SAlex Elder  *
6398859c31dfSAlex Elder  * The information extracted from these options is recorded in
6399859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
6400859c31dfSAlex Elder  * structures:
6401859c31dfSAlex Elder  *  ceph_opts
6402859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
6403859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
6404859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
6405859c31dfSAlex Elder  *  rbd_opts
6406859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
6407859c31dfSAlex Elder  *	this function; caller must release with kfree().
6408859c31dfSAlex Elder  *  spec
6409859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
6410859c31dfSAlex Elder  *	initialized by this function based on parsed options.
6411859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
6412859c31dfSAlex Elder  *
6413859c31dfSAlex Elder  * The options passed take this form:
6414859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6415859c31dfSAlex Elder  * where:
6416859c31dfSAlex Elder  *  <mon_addrs>
6417859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
6418859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
6419859c31dfSAlex Elder  *      by a port number (separated by a colon).
6420859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6421859c31dfSAlex Elder  *  <options>
6422859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
6423859c31dfSAlex Elder  *  <pool_name>
6424859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
6425859c31dfSAlex Elder  *  <image_name>
6426859c31dfSAlex Elder  *      The name of the image in that pool to map.
6427859c31dfSAlex Elder  *  <snap_id>
6428859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
6429859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
6430859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
6431859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
6432a725f65eSAlex Elder  */
rbd_add_parse_args(const char * buf,struct ceph_options ** ceph_opts,struct rbd_options ** opts,struct rbd_spec ** rbd_spec)6433859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
6434dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
6435859c31dfSAlex Elder 				struct rbd_options **opts,
6436859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
6437a725f65eSAlex Elder {
6438e28fff26SAlex Elder 	size_t len;
6439859c31dfSAlex Elder 	char *options;
64400ddebc0cSAlex Elder 	const char *mon_addrs;
6441ecb4dc22SAlex Elder 	char *snap_name;
64420ddebc0cSAlex Elder 	size_t mon_addrs_size;
644382995cc6SDavid Howells 	struct rbd_parse_opts_ctx pctx = { 0 };
6444dc79b113SAlex Elder 	int ret;
6445e28fff26SAlex Elder 
6446e28fff26SAlex Elder 	/* The first four tokens are required */
6447e28fff26SAlex Elder 
64487ef3214aSAlex Elder 	len = next_token(&buf);
64494fb5d671SAlex Elder 	if (!len) {
64504fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
64514fb5d671SAlex Elder 		return -EINVAL;
64524fb5d671SAlex Elder 	}
64530ddebc0cSAlex Elder 	mon_addrs = buf;
645482995cc6SDavid Howells 	mon_addrs_size = len;
64557ef3214aSAlex Elder 	buf += len;
6456a725f65eSAlex Elder 
6457dc79b113SAlex Elder 	ret = -EINVAL;
6458f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
6459f28e565aSAlex Elder 	if (!options)
6460dc79b113SAlex Elder 		return -ENOMEM;
64614fb5d671SAlex Elder 	if (!*options) {
64624fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
64634fb5d671SAlex Elder 		goto out_err;
64644fb5d671SAlex Elder 	}
6465a725f65eSAlex Elder 
6466c300156bSIlya Dryomov 	pctx.spec = rbd_spec_alloc();
6467c300156bSIlya Dryomov 	if (!pctx.spec)
6468f28e565aSAlex Elder 		goto out_mem;
6469859c31dfSAlex Elder 
6470c300156bSIlya Dryomov 	pctx.spec->pool_name = dup_token(&buf, NULL);
6471c300156bSIlya Dryomov 	if (!pctx.spec->pool_name)
6472859c31dfSAlex Elder 		goto out_mem;
6473c300156bSIlya Dryomov 	if (!*pctx.spec->pool_name) {
64744fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
64754fb5d671SAlex Elder 		goto out_err;
64764fb5d671SAlex Elder 	}
6477e28fff26SAlex Elder 
6478c300156bSIlya Dryomov 	pctx.spec->image_name = dup_token(&buf, NULL);
6479c300156bSIlya Dryomov 	if (!pctx.spec->image_name)
6480f28e565aSAlex Elder 		goto out_mem;
6481c300156bSIlya Dryomov 	if (!*pctx.spec->image_name) {
64824fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
64834fb5d671SAlex Elder 		goto out_err;
64844fb5d671SAlex Elder 	}
6485e28fff26SAlex Elder 
6486f28e565aSAlex Elder 	/*
6487f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
6488f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
6489f28e565aSAlex Elder 	 */
64903feeb894SAlex Elder 	len = next_token(&buf);
6491820a5f3eSAlex Elder 	if (!len) {
64923feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
64933feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6494f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6495dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
6496f28e565aSAlex Elder 		goto out_err;
6497849b4260SAlex Elder 	}
6498ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6499ecb4dc22SAlex Elder 	if (!snap_name)
6500f28e565aSAlex Elder 		goto out_mem;
6501ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
6502c300156bSIlya Dryomov 	pctx.spec->snap_name = snap_name;
6503e5c35534SAlex Elder 
650482995cc6SDavid Howells 	pctx.copts = ceph_alloc_options();
650582995cc6SDavid Howells 	if (!pctx.copts)
650682995cc6SDavid Howells 		goto out_mem;
650782995cc6SDavid Howells 
65080ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
6509e28fff26SAlex Elder 
6510c300156bSIlya Dryomov 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6511c300156bSIlya Dryomov 	if (!pctx.opts)
65124e9afebaSAlex Elder 		goto out_mem;
65134e9afebaSAlex Elder 
6514c300156bSIlya Dryomov 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6515c300156bSIlya Dryomov 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
65160c93e1b7SIlya Dryomov 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6517c300156bSIlya Dryomov 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6518c300156bSIlya Dryomov 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6519c300156bSIlya Dryomov 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6520c300156bSIlya Dryomov 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6521d22f76e7SAlex Elder 
65222d7c86a8SVenky Shankar 	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL,
65232d7c86a8SVenky Shankar 				 ',');
652482995cc6SDavid Howells 	if (ret)
6525dc79b113SAlex Elder 		goto out_err;
6526859c31dfSAlex Elder 
652782995cc6SDavid Howells 	ret = rbd_parse_options(options, &pctx);
652882995cc6SDavid Howells 	if (ret)
652982995cc6SDavid Howells 		goto out_err;
653082995cc6SDavid Howells 
653182995cc6SDavid Howells 	*ceph_opts = pctx.copts;
6532c300156bSIlya Dryomov 	*opts = pctx.opts;
6533c300156bSIlya Dryomov 	*rbd_spec = pctx.spec;
653482995cc6SDavid Howells 	kfree(options);
6535dc79b113SAlex Elder 	return 0;
653682995cc6SDavid Howells 
6537f28e565aSAlex Elder out_mem:
6538dc79b113SAlex Elder 	ret = -ENOMEM;
6539d22f76e7SAlex Elder out_err:
6540c300156bSIlya Dryomov 	kfree(pctx.opts);
654182995cc6SDavid Howells 	ceph_destroy_options(pctx.copts);
6542c300156bSIlya Dryomov 	rbd_spec_put(pctx.spec);
6543f28e565aSAlex Elder 	kfree(options);
6544dc79b113SAlex Elder 	return ret;
6545a725f65eSAlex Elder }
6546a725f65eSAlex Elder 
rbd_dev_image_unlock(struct rbd_device * rbd_dev)6547e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6548e010dd0aSIlya Dryomov {
6549e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6550e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6551e1fddc8fSIlya Dryomov 		__rbd_release_lock(rbd_dev);
6552e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
6553e010dd0aSIlya Dryomov }
6554e010dd0aSIlya Dryomov 
6555637cd060SIlya Dryomov /*
6556637cd060SIlya Dryomov  * If the wait is interrupted, an error is returned even if the lock
6557637cd060SIlya Dryomov  * was successfully acquired.  rbd_dev_image_unlock() will release it
6558637cd060SIlya Dryomov  * if needed.
6559637cd060SIlya Dryomov  */
rbd_add_acquire_lock(struct rbd_device * rbd_dev)6560e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6561e010dd0aSIlya Dryomov {
6562637cd060SIlya Dryomov 	long ret;
65632f18d466SIlya Dryomov 
6564e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6565637cd060SIlya Dryomov 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6566637cd060SIlya Dryomov 			return 0;
6567637cd060SIlya Dryomov 
6568e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6569e010dd0aSIlya Dryomov 		return -EINVAL;
6570e010dd0aSIlya Dryomov 	}
6571e010dd0aSIlya Dryomov 
65723fe69921SIlya Dryomov 	if (rbd_is_ro(rbd_dev))
6573637cd060SIlya Dryomov 		return 0;
6574637cd060SIlya Dryomov 
6575637cd060SIlya Dryomov 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6576637cd060SIlya Dryomov 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6577637cd060SIlya Dryomov 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6578637cd060SIlya Dryomov 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
657925e6be21SDongsheng Yang 	if (ret > 0) {
6580637cd060SIlya Dryomov 		ret = rbd_dev->acquire_err;
658125e6be21SDongsheng Yang 	} else {
658225e6be21SDongsheng Yang 		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
658325e6be21SDongsheng Yang 		if (!ret)
6584637cd060SIlya Dryomov 			ret = -ETIMEDOUT;
6585637cd060SIlya Dryomov 
65869d01e07fSIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret);
6587e010dd0aSIlya Dryomov 	}
65889d01e07fSIlya Dryomov 	if (ret)
65899d01e07fSIlya Dryomov 		return ret;
6590e010dd0aSIlya Dryomov 
6591637cd060SIlya Dryomov 	/*
6592637cd060SIlya Dryomov 	 * The lock may have been released by now, unless automatic lock
6593637cd060SIlya Dryomov 	 * transitions are disabled.
6594637cd060SIlya Dryomov 	 */
6595637cd060SIlya Dryomov 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6596e010dd0aSIlya Dryomov 	return 0;
6597e010dd0aSIlya Dryomov }
6598e010dd0aSIlya Dryomov 
659930ba1f02SIlya Dryomov /*
6600589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
6601589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
6602589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
6603589d30e0SAlex Elder  *
6604589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
6605589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
6606589d30e0SAlex Elder  * with the supplied name.
6607589d30e0SAlex Elder  *
6608589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
6609589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
6610589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
6611589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
6612589d30e0SAlex Elder  */
rbd_dev_image_id(struct rbd_device * rbd_dev)6613589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6614589d30e0SAlex Elder {
6615589d30e0SAlex Elder 	int ret;
6616589d30e0SAlex Elder 	size_t size;
6617ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
6618589d30e0SAlex Elder 	void *response;
6619c0fba368SAlex Elder 	char *image_id;
66202f82ee54SAlex Elder 
6621589d30e0SAlex Elder 	/*
66222c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
66232c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
6624c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
6625c0fba368SAlex Elder 	 * do still need to set the image format though.
66262c0d0a10SAlex Elder 	 */
6627c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
6628c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6629c0fba368SAlex Elder 
66302c0d0a10SAlex Elder 		return 0;
6631c0fba368SAlex Elder 	}
66322c0d0a10SAlex Elder 
66332c0d0a10SAlex Elder 	/*
6634589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
6635589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
6636589d30e0SAlex Elder 	 */
6637ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6638ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
6639ecd4a68aSIlya Dryomov 	if (ret)
6640ecd4a68aSIlya Dryomov 		return ret;
6641ecd4a68aSIlya Dryomov 
6642ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
6643589d30e0SAlex Elder 
6644589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
6645589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6646589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
6647589d30e0SAlex Elder 	if (!response) {
6648589d30e0SAlex Elder 		ret = -ENOMEM;
6649589d30e0SAlex Elder 		goto out;
6650589d30e0SAlex Elder 	}
6651589d30e0SAlex Elder 
6652c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
6653c0fba368SAlex Elder 
6654ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6655ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
66565435d206SDongsheng Yang 				  response, size);
665736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6658c0fba368SAlex Elder 	if (ret == -ENOENT) {
6659c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
6660c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
6661c0fba368SAlex Elder 		if (!ret)
6662c0fba368SAlex Elder 			rbd_dev->image_format = 1;
66637dd440c9SIlya Dryomov 	} else if (ret >= 0) {
6664c0fba368SAlex Elder 		void *p = response;
6665589d30e0SAlex Elder 
6666c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
6667979ed480SAlex Elder 						NULL, GFP_NOIO);
6668461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
6669c0fba368SAlex Elder 		if (!ret)
6670c0fba368SAlex Elder 			rbd_dev->image_format = 2;
6671c0fba368SAlex Elder 	}
6672c0fba368SAlex Elder 
6673c0fba368SAlex Elder 	if (!ret) {
6674c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
6675c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
6676589d30e0SAlex Elder 	}
6677589d30e0SAlex Elder out:
6678589d30e0SAlex Elder 	kfree(response);
6679ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
6680589d30e0SAlex Elder 	return ret;
6681589d30e0SAlex Elder }
6682589d30e0SAlex Elder 
66833abef3b3SAlex Elder /*
66843abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
66853abef3b3SAlex Elder  * call.
66863abef3b3SAlex Elder  */
rbd_dev_unprobe(struct rbd_device * rbd_dev)66876fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
66886fd48b3bSAlex Elder {
6689a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
669022e8bd51SIlya Dryomov 	rbd_object_map_free(rbd_dev);
6691da5ef6beSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
66926fd48b3bSAlex Elder 
66936fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
66946fd48b3bSAlex Elder 
6695510a7330SIlya Dryomov 	rbd_image_header_cleanup(&rbd_dev->header);
66966fd48b3bSAlex Elder }
66976fd48b3bSAlex Elder 
rbd_dev_v2_header_onetime(struct rbd_device * rbd_dev,struct rbd_image_header * header)6698510a7330SIlya Dryomov static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
6699510a7330SIlya Dryomov 				     struct rbd_image_header *header)
6700a30b71b9SAlex Elder {
6701a30b71b9SAlex Elder 	int ret;
6702a30b71b9SAlex Elder 
6703510a7330SIlya Dryomov 	ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix);
670457385b51SAlex Elder 	if (ret)
6705510a7330SIlya Dryomov 		return ret;
6706b1b5402aSAlex Elder 
67072df3fac7SAlex Elder 	/*
67082df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
67092df3fac7SAlex Elder 	 * features are assumed to never change.
67102df3fac7SAlex Elder 	 */
6711510a7330SIlya Dryomov 	ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
6712510a7330SIlya Dryomov 					rbd_is_ro(rbd_dev), &header->features);
671357385b51SAlex Elder 	if (ret)
6714510a7330SIlya Dryomov 		return ret;
671535d489f9SAlex Elder 
6716cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
6717cc070d59SAlex Elder 
6718510a7330SIlya Dryomov 	if (header->features & RBD_FEATURE_STRIPINGV2) {
6719510a7330SIlya Dryomov 		ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit,
6720510a7330SIlya Dryomov 					       &header->stripe_count);
67217e97332eSIlya Dryomov 		if (ret)
6722510a7330SIlya Dryomov 			return ret;
67237e97332eSIlya Dryomov 	}
67247e97332eSIlya Dryomov 
6725510a7330SIlya Dryomov 	if (header->features & RBD_FEATURE_DATA_POOL) {
6726510a7330SIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id);
6727510a7330SIlya Dryomov 		if (ret)
67289d475de5SAlex Elder 			return ret;
6729a30b71b9SAlex Elder 	}
6730a30b71b9SAlex Elder 
6731510a7330SIlya Dryomov 	return 0;
6732510a7330SIlya Dryomov }
6733510a7330SIlya Dryomov 
67346d69bb53SIlya Dryomov /*
67356d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
67366d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
67376d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
67386d69bb53SIlya Dryomov  */
rbd_dev_probe_parent(struct rbd_device * rbd_dev,int depth)67396d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
674083a06263SAlex Elder {
67412f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
6742124afba2SAlex Elder 	int ret;
6743124afba2SAlex Elder 
6744124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
6745124afba2SAlex Elder 		return 0;
6746124afba2SAlex Elder 
67476d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
67486d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
67496d69bb53SIlya Dryomov 		ret = -EINVAL;
67506d69bb53SIlya Dryomov 		goto out_err;
67516d69bb53SIlya Dryomov 	}
67526d69bb53SIlya Dryomov 
6753f7c4d9b1SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->parent_spec);
67541f2c6651SIlya Dryomov 	if (!parent) {
6755124afba2SAlex Elder 		ret = -ENOMEM;
6756124afba2SAlex Elder 		goto out_err;
67571f2c6651SIlya Dryomov 	}
67581f2c6651SIlya Dryomov 
67591f2c6651SIlya Dryomov 	/*
67601f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
67611f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
67621f2c6651SIlya Dryomov 	 */
6763f7c4d9b1SIlya Dryomov 	parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
6764f7c4d9b1SIlya Dryomov 	parent->spec = rbd_spec_get(rbd_dev->parent_spec);
6765124afba2SAlex Elder 
676639258aa2SIlya Dryomov 	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
676739258aa2SIlya Dryomov 
67686d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
6769124afba2SAlex Elder 	if (ret < 0)
6770124afba2SAlex Elder 		goto out_err;
67711f2c6651SIlya Dryomov 
6772124afba2SAlex Elder 	rbd_dev->parent = parent;
6773a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
6774124afba2SAlex Elder 	return 0;
6775124afba2SAlex Elder 
67761f2c6651SIlya Dryomov out_err:
67771f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
67781f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
6779124afba2SAlex Elder 	return ret;
6780124afba2SAlex Elder }
6781124afba2SAlex Elder 
rbd_dev_device_release(struct rbd_device * rbd_dev)67825769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
67835769ed0cSIlya Dryomov {
67845769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
67855769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
67865769ed0cSIlya Dryomov 	if (!single_major)
67875769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
67885769ed0cSIlya Dryomov }
67895769ed0cSIlya Dryomov 
6790811c6688SIlya Dryomov /*
6791811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6792811c6688SIlya Dryomov  * upon return.
6793811c6688SIlya Dryomov  */
rbd_dev_device_setup(struct rbd_device * rbd_dev)6794200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6795124afba2SAlex Elder {
679683a06263SAlex Elder 	int ret;
679783a06263SAlex Elder 
67989b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
679983a06263SAlex Elder 
68009b60e70bSIlya Dryomov 	if (!single_major) {
680183a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
680283a06263SAlex Elder 		if (ret < 0)
68031643dfa4SIlya Dryomov 			goto err_out_unlock;
68049b60e70bSIlya Dryomov 
680583a06263SAlex Elder 		rbd_dev->major = ret;
6806dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
68079b60e70bSIlya Dryomov 	} else {
68089b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
68099b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
68109b60e70bSIlya Dryomov 	}
681183a06263SAlex Elder 
681283a06263SAlex Elder 	/* Set up the blkdev mapping. */
681383a06263SAlex Elder 
681483a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
681583a06263SAlex Elder 	if (ret)
681683a06263SAlex Elder 		goto err_out_blkdev;
681783a06263SAlex Elder 
6818f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
681939258aa2SIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6820f35a4deeSAlex Elder 
68215769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6822f35a4deeSAlex Elder 	if (ret)
6823da5ef6beSIlya Dryomov 		goto err_out_disk;
682483a06263SAlex Elder 
6825129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6826811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
68275769ed0cSIlya Dryomov 	return 0;
68282f82ee54SAlex Elder 
682983a06263SAlex Elder err_out_disk:
683083a06263SAlex Elder 	rbd_free_disk(rbd_dev);
683183a06263SAlex Elder err_out_blkdev:
68329b60e70bSIlya Dryomov 	if (!single_major)
683383a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6834811c6688SIlya Dryomov err_out_unlock:
6835811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
683683a06263SAlex Elder 	return ret;
683783a06263SAlex Elder }
683883a06263SAlex Elder 
rbd_dev_header_name(struct rbd_device * rbd_dev)6839332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6840332bb12dSAlex Elder {
6841332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6842c41d13a3SIlya Dryomov 	int ret;
6843332bb12dSAlex Elder 
6844332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6845332bb12dSAlex Elder 
6846332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6847332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6848c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6849332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6850332bb12dSAlex Elder 	else
6851c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6852332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6853c41d13a3SIlya Dryomov 
6854c41d13a3SIlya Dryomov 	return ret;
6855332bb12dSAlex Elder }
6856332bb12dSAlex Elder 
/*
 * Log that the image (or, if @is_snap, the snapshot) described by
 * rbd_dev->spec does not exist.  The pool namespace and its trailing
 * "/" are printed only when a namespace is set.
 */
static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
{
	struct rbd_spec *spec = rbd_dev->spec;
	const char *ns = spec->pool_ns ?: "";
	const char *sep = spec->pool_ns ? "/" : "";

	if (is_snap) {
		pr_info("snap %s/%s%s%s@%s does not exist\n",
			spec->pool_name, ns, sep,
			spec->image_name, spec->snap_name);
		return;
	}

	pr_info("image %s/%s%s%s does not exist\n",
		spec->pool_name, ns, sep, spec->image_name);
}
6874b9ef2b88SIlya Dryomov 
rbd_dev_image_release(struct rbd_device * rbd_dev)6875200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6876200a6a8bSAlex Elder {
6877b8776051SIlya Dryomov 	if (!rbd_is_ro(rbd_dev))
6878fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6879952c48b0SIlya Dryomov 
6880952c48b0SIlya Dryomov 	rbd_dev_unprobe(rbd_dev);
68816fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
68826fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
68836fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
6884200a6a8bSAlex Elder }
6885200a6a8bSAlex Elder 
6886a30b71b9SAlex Elder /*
6887a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
68881f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
68891f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
68901f3ef788SAlex Elder  * object to get detailed information about the rbd image.
68910e4e1de5SIlya Dryomov  *
68920e4e1de5SIlya Dryomov  * On success, returns with header_rwsem held for write if called
68930e4e1de5SIlya Dryomov  * with @depth == 0.
6894a30b71b9SAlex Elder  */
rbd_dev_image_probe(struct rbd_device * rbd_dev,int depth)68956d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6896a30b71b9SAlex Elder {
6897b9ef2b88SIlya Dryomov 	bool need_watch = !rbd_is_ro(rbd_dev);
6898a30b71b9SAlex Elder 	int ret;
6899a30b71b9SAlex Elder 
6900a30b71b9SAlex Elder 	/*
69013abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
69023abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
69033abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
69043abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6905a30b71b9SAlex Elder 	 */
6906a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6907a30b71b9SAlex Elder 	if (ret)
6908c0fba368SAlex Elder 		return ret;
6909c0fba368SAlex Elder 
6910332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6911332bb12dSAlex Elder 	if (ret)
6912332bb12dSAlex Elder 		goto err_out_format;
6913332bb12dSAlex Elder 
6914b9ef2b88SIlya Dryomov 	if (need_watch) {
691599d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
69161fe48023SIlya Dryomov 		if (ret) {
69171fe48023SIlya Dryomov 			if (ret == -ENOENT)
6918b9ef2b88SIlya Dryomov 				rbd_print_dne(rbd_dev, false);
6919c41d13a3SIlya Dryomov 			goto err_out_format;
69201f3ef788SAlex Elder 		}
69211fe48023SIlya Dryomov 	}
6922b644de2bSAlex Elder 
69230e4e1de5SIlya Dryomov 	if (!depth)
69240e4e1de5SIlya Dryomov 		down_write(&rbd_dev->header_rwsem);
69250e4e1de5SIlya Dryomov 
6926510a7330SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true);
6927b9ef2b88SIlya Dryomov 	if (ret) {
6928b9ef2b88SIlya Dryomov 		if (ret == -ENOENT && !need_watch)
6929b9ef2b88SIlya Dryomov 			rbd_print_dne(rbd_dev, false);
6930952c48b0SIlya Dryomov 		goto err_out_probe;
6931b9ef2b88SIlya Dryomov 	}
6932a30b71b9SAlex Elder 
6933510a7330SIlya Dryomov 	rbd_init_layout(rbd_dev);
6934510a7330SIlya Dryomov 
693504077599SIlya Dryomov 	/*
693604077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
693704077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
693804077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
693904077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
694004077599SIlya Dryomov 	 */
69416d69bb53SIlya Dryomov 	if (!depth)
694204077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
694304077599SIlya Dryomov 	else
694404077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
69451fe48023SIlya Dryomov 	if (ret) {
69461fe48023SIlya Dryomov 		if (ret == -ENOENT)
6947b9ef2b88SIlya Dryomov 			rbd_print_dne(rbd_dev, true);
694833dca39fSAlex Elder 		goto err_out_probe;
69491fe48023SIlya Dryomov 	}
69509bb81c9bSAlex Elder 
6951da5ef6beSIlya Dryomov 	ret = rbd_dev_mapping_set(rbd_dev);
6952da5ef6beSIlya Dryomov 	if (ret)
6953da5ef6beSIlya Dryomov 		goto err_out_probe;
6954da5ef6beSIlya Dryomov 
6955f3c0e459SIlya Dryomov 	if (rbd_is_snap(rbd_dev) &&
695622e8bd51SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
695722e8bd51SIlya Dryomov 		ret = rbd_object_map_load(rbd_dev);
695822e8bd51SIlya Dryomov 		if (ret)
695922e8bd51SIlya Dryomov 			goto err_out_probe;
696022e8bd51SIlya Dryomov 	}
696122e8bd51SIlya Dryomov 
6962e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6963c1031177SIlya Dryomov 		ret = rbd_dev_setup_parent(rbd_dev);
6964e8f59b59SIlya Dryomov 		if (ret)
6965e8f59b59SIlya Dryomov 			goto err_out_probe;
6966e8f59b59SIlya Dryomov 	}
6967e8f59b59SIlya Dryomov 
69686d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
696930d60ba2SAlex Elder 	if (ret)
697030d60ba2SAlex Elder 		goto err_out_probe;
697183a06263SAlex Elder 
697230d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6973c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
697430d60ba2SAlex Elder 	return 0;
6975e8f59b59SIlya Dryomov 
69766fd48b3bSAlex Elder err_out_probe:
69770e4e1de5SIlya Dryomov 	if (!depth)
69780e4e1de5SIlya Dryomov 		up_write(&rbd_dev->header_rwsem);
6979b9ef2b88SIlya Dryomov 	if (need_watch)
698099d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6981952c48b0SIlya Dryomov 	rbd_dev_unprobe(rbd_dev);
6982332bb12dSAlex Elder err_out_format:
6983332bb12dSAlex Elder 	rbd_dev->image_format = 0;
69845655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
69855655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
69865655c4d9SAlex Elder 	return ret;
698783a06263SAlex Elder }
698883a06263SAlex Elder 
rbd_dev_update_header(struct rbd_device * rbd_dev,struct rbd_image_header * header)6989510a7330SIlya Dryomov static void rbd_dev_update_header(struct rbd_device *rbd_dev,
6990510a7330SIlya Dryomov 				  struct rbd_image_header *header)
6991510a7330SIlya Dryomov {
6992510a7330SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6993510a7330SIlya Dryomov 	rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
6994510a7330SIlya Dryomov 
69950b207d02SIlya Dryomov 	if (rbd_dev->header.image_size != header->image_size) {
6996510a7330SIlya Dryomov 		rbd_dev->header.image_size = header->image_size;
6997510a7330SIlya Dryomov 
69980b207d02SIlya Dryomov 		if (!rbd_is_snap(rbd_dev)) {
69990b207d02SIlya Dryomov 			rbd_dev->mapping.size = header->image_size;
70000b207d02SIlya Dryomov 			rbd_dev_update_size(rbd_dev);
70010b207d02SIlya Dryomov 		}
70020b207d02SIlya Dryomov 	}
70030b207d02SIlya Dryomov 
7004510a7330SIlya Dryomov 	ceph_put_snap_context(rbd_dev->header.snapc);
7005510a7330SIlya Dryomov 	rbd_dev->header.snapc = header->snapc;
7006510a7330SIlya Dryomov 	header->snapc = NULL;
7007510a7330SIlya Dryomov 
7008510a7330SIlya Dryomov 	if (rbd_dev->image_format == 1) {
7009510a7330SIlya Dryomov 		kfree(rbd_dev->header.snap_names);
7010510a7330SIlya Dryomov 		rbd_dev->header.snap_names = header->snap_names;
7011510a7330SIlya Dryomov 		header->snap_names = NULL;
7012510a7330SIlya Dryomov 
7013510a7330SIlya Dryomov 		kfree(rbd_dev->header.snap_sizes);
7014510a7330SIlya Dryomov 		rbd_dev->header.snap_sizes = header->snap_sizes;
7015510a7330SIlya Dryomov 		header->snap_sizes = NULL;
7016510a7330SIlya Dryomov 	}
7017510a7330SIlya Dryomov }
7018510a7330SIlya Dryomov 
rbd_dev_update_parent(struct rbd_device * rbd_dev,struct parent_image_info * pii)7019c1031177SIlya Dryomov static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
7020c1031177SIlya Dryomov 				  struct parent_image_info *pii)
7021c1031177SIlya Dryomov {
7022c1031177SIlya Dryomov 	if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
7023c1031177SIlya Dryomov 		/*
7024c1031177SIlya Dryomov 		 * Either the parent never existed, or we have
7025c1031177SIlya Dryomov 		 * record of it but the image got flattened so it no
7026c1031177SIlya Dryomov 		 * longer has a parent.  When the parent of a
7027c1031177SIlya Dryomov 		 * layered image disappears we immediately set the
7028c1031177SIlya Dryomov 		 * overlap to 0.  The effect of this is that all new
7029c1031177SIlya Dryomov 		 * requests will be treated as if the image had no
7030c1031177SIlya Dryomov 		 * parent.
7031c1031177SIlya Dryomov 		 *
7032c1031177SIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
7033c1031177SIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
7034c1031177SIlya Dryomov 		 * snapshot record.
7035c1031177SIlya Dryomov 		 */
7036c1031177SIlya Dryomov 		if (rbd_dev->parent_overlap) {
7037c1031177SIlya Dryomov 			rbd_dev->parent_overlap = 0;
7038c1031177SIlya Dryomov 			rbd_dev_parent_put(rbd_dev);
7039c1031177SIlya Dryomov 			pr_info("%s: clone has been flattened\n",
7040c1031177SIlya Dryomov 				rbd_dev->disk->disk_name);
7041c1031177SIlya Dryomov 		}
7042c1031177SIlya Dryomov 	} else {
7043c1031177SIlya Dryomov 		rbd_assert(rbd_dev->parent_spec);
7044c1031177SIlya Dryomov 
7045c1031177SIlya Dryomov 		/*
7046c1031177SIlya Dryomov 		 * Update the parent overlap.  If it became zero, issue
7047c1031177SIlya Dryomov 		 * a warning as we will proceed as if there is no parent.
7048c1031177SIlya Dryomov 		 */
7049c1031177SIlya Dryomov 		if (!pii->overlap && rbd_dev->parent_overlap)
7050c1031177SIlya Dryomov 			rbd_warn(rbd_dev,
7051c1031177SIlya Dryomov 				 "clone has become standalone (overlap 0)");
7052c1031177SIlya Dryomov 		rbd_dev->parent_overlap = pii->overlap;
7053c1031177SIlya Dryomov 	}
7054c1031177SIlya Dryomov }
7055c1031177SIlya Dryomov 
rbd_dev_refresh(struct rbd_device * rbd_dev)70560b035401SIlya Dryomov static int rbd_dev_refresh(struct rbd_device *rbd_dev)
70570b035401SIlya Dryomov {
7058510a7330SIlya Dryomov 	struct rbd_image_header	header = { 0 };
7059c1031177SIlya Dryomov 	struct parent_image_info pii = { 0 };
70600b035401SIlya Dryomov 	int ret;
70610b035401SIlya Dryomov 
70620b207d02SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
70630b035401SIlya Dryomov 
7064510a7330SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev, &header, false);
70650b035401SIlya Dryomov 	if (ret)
70660b035401SIlya Dryomov 		goto out;
70670b035401SIlya Dryomov 
70680b035401SIlya Dryomov 	/*
70690b035401SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
70700b035401SIlya Dryomov 	 * mapped image getting flattened.
70710b035401SIlya Dryomov 	 */
70720b035401SIlya Dryomov 	if (rbd_dev->parent) {
7073c1031177SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
70740b035401SIlya Dryomov 		if (ret)
70750b035401SIlya Dryomov 			goto out;
70760b035401SIlya Dryomov 	}
70770b035401SIlya Dryomov 
70780b207d02SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
7079510a7330SIlya Dryomov 	rbd_dev_update_header(rbd_dev, &header);
7080c1031177SIlya Dryomov 	if (rbd_dev->parent)
7081c1031177SIlya Dryomov 		rbd_dev_update_parent(rbd_dev, &pii);
70820b207d02SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
70830b035401SIlya Dryomov 
70840b035401SIlya Dryomov out:
7085c1031177SIlya Dryomov 	rbd_parent_info_cleanup(&pii);
7086510a7330SIlya Dryomov 	rbd_image_header_cleanup(&header);
70870b035401SIlya Dryomov 	return ret;
70880b035401SIlya Dryomov }
70890b035401SIlya Dryomov 
do_rbd_add(const char * buf,size_t count)709075cff725SGreg Kroah-Hartman static ssize_t do_rbd_add(const char *buf, size_t count)
7091602adf40SYehuda Sadeh {
7092cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
7093dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
70944e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
7095859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
70969d3997fdSAlex Elder 	struct rbd_client *rbdc;
7097b51c83c2SIlya Dryomov 	int rc;
7098602adf40SYehuda Sadeh 
7099f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
7100f44d04e6SIlya Dryomov 		return -EPERM;
7101f44d04e6SIlya Dryomov 
7102602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
7103602adf40SYehuda Sadeh 		return -ENODEV;
7104602adf40SYehuda Sadeh 
7105a725f65eSAlex Elder 	/* parse add command */
7106859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7107dc79b113SAlex Elder 	if (rc < 0)
7108dd5ac32dSIlya Dryomov 		goto out;
7109a725f65eSAlex Elder 
71109d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
71119d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
71129d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
71130ddebc0cSAlex Elder 		goto err_out_args;
71149d3997fdSAlex Elder 	}
7115602adf40SYehuda Sadeh 
7116602adf40SYehuda Sadeh 	/* pick the pool */
7117dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
71181fe48023SIlya Dryomov 	if (rc < 0) {
71191fe48023SIlya Dryomov 		if (rc == -ENOENT)
71201fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
7121602adf40SYehuda Sadeh 		goto err_out_client;
71221fe48023SIlya Dryomov 	}
7123859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
7124859c31dfSAlex Elder 
7125d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7126b51c83c2SIlya Dryomov 	if (!rbd_dev) {
7127b51c83c2SIlya Dryomov 		rc = -ENOMEM;
7128bd4ba655SAlex Elder 		goto err_out_client;
7129b51c83c2SIlya Dryomov 	}
7130c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
7131c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
7132d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
7133602adf40SYehuda Sadeh 
713439258aa2SIlya Dryomov 	/* if we are mapping a snapshot it will be a read-only mapping */
713539258aa2SIlya Dryomov 	if (rbd_dev->opts->read_only ||
713639258aa2SIlya Dryomov 	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
713739258aa2SIlya Dryomov 		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
713839258aa2SIlya Dryomov 
71390d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
71400d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
71410d6d1e9cSMike Christie 		rc = -ENOMEM;
71420d6d1e9cSMike Christie 		goto err_out_rbd_dev;
71430d6d1e9cSMike Christie 	}
71440d6d1e9cSMike Christie 
71456d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
71460e4e1de5SIlya Dryomov 	if (rc < 0)
7147c53d5893SAlex Elder 		goto err_out_rbd_dev;
714805fd6f6fSAlex Elder 
71490c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
71500c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
71510c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
71520c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
71530c93e1b7SIlya Dryomov 	}
71540c93e1b7SIlya Dryomov 
7155b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
7156fd22aef8SIlya Dryomov 	if (rc)
71578b679ec5SIlya Dryomov 		goto err_out_image_probe;
71583abef3b3SAlex Elder 
7159e010dd0aSIlya Dryomov 	rc = rbd_add_acquire_lock(rbd_dev);
7160e010dd0aSIlya Dryomov 	if (rc)
7161637cd060SIlya Dryomov 		goto err_out_image_lock;
7162b536f69aSAlex Elder 
71635769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
71645769ed0cSIlya Dryomov 
71655769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
71665769ed0cSIlya Dryomov 	if (rc)
7167e010dd0aSIlya Dryomov 		goto err_out_image_lock;
71685769ed0cSIlya Dryomov 
716927c97abcSLuis Chamberlain 	rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
717027c97abcSLuis Chamberlain 	if (rc)
717127c97abcSLuis Chamberlain 		goto err_out_cleanup_disk;
71725769ed0cSIlya Dryomov 
71735769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
71745769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
71755769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
71765769ed0cSIlya Dryomov 
71775769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
71785769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
71795769ed0cSIlya Dryomov 		rbd_dev->header.features);
7180dd5ac32dSIlya Dryomov 	rc = count;
7181dd5ac32dSIlya Dryomov out:
7182dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
7183dd5ac32dSIlya Dryomov 	return rc;
7184b536f69aSAlex Elder 
718527c97abcSLuis Chamberlain err_out_cleanup_disk:
718627c97abcSLuis Chamberlain 	rbd_free_disk(rbd_dev);
7187e010dd0aSIlya Dryomov err_out_image_lock:
7188e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
71895769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
71908b679ec5SIlya Dryomov err_out_image_probe:
71918b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
7192c53d5893SAlex Elder err_out_rbd_dev:
7193c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
7194bd4ba655SAlex Elder err_out_client:
71959d3997fdSAlex Elder 	rbd_put_client(rbdc);
71960ddebc0cSAlex Elder err_out_args:
7197859c31dfSAlex Elder 	rbd_spec_put(spec);
7198d147543dSIlya Dryomov 	kfree(rbd_opts);
7199dd5ac32dSIlya Dryomov 	goto out;
7200602adf40SYehuda Sadeh }
7201602adf40SYehuda Sadeh 
add_store(const struct bus_type * bus,const char * buf,size_t count)720275cff725SGreg Kroah-Hartman static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count)
72039b60e70bSIlya Dryomov {
72049b60e70bSIlya Dryomov 	if (single_major)
72059b60e70bSIlya Dryomov 		return -EINVAL;
72069b60e70bSIlya Dryomov 
720775cff725SGreg Kroah-Hartman 	return do_rbd_add(buf, count);
72089b60e70bSIlya Dryomov }
72099b60e70bSIlya Dryomov 
add_single_major_store(const struct bus_type * bus,const char * buf,size_t count)721075cff725SGreg Kroah-Hartman static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
72119b60e70bSIlya Dryomov 				      size_t count)
72129b60e70bSIlya Dryomov {
721375cff725SGreg Kroah-Hartman 	return do_rbd_add(buf, count);
72149b60e70bSIlya Dryomov }
72159b60e70bSIlya Dryomov 
rbd_dev_remove_parent(struct rbd_device * rbd_dev)721605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
721705a46afdSAlex Elder {
7218ad945fc1SAlex Elder 	while (rbd_dev->parent) {
721905a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
722005a46afdSAlex Elder 		struct rbd_device *second = first->parent;
722105a46afdSAlex Elder 		struct rbd_device *third;
722205a46afdSAlex Elder 
722305a46afdSAlex Elder 		/*
722405a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
722505a46afdSAlex Elder 		 * remove it.
722605a46afdSAlex Elder 		 */
722705a46afdSAlex Elder 		while (second && (third = second->parent)) {
722805a46afdSAlex Elder 			first = second;
722905a46afdSAlex Elder 			second = third;
723005a46afdSAlex Elder 		}
7231ad945fc1SAlex Elder 		rbd_assert(second);
72328ad42cd0SAlex Elder 		rbd_dev_image_release(second);
72338b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
7234ad945fc1SAlex Elder 		first->parent = NULL;
7235ad945fc1SAlex Elder 		first->parent_overlap = 0;
7236ad945fc1SAlex Elder 
7237ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
723805a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
723905a46afdSAlex Elder 		first->parent_spec = NULL;
724005a46afdSAlex Elder 	}
724105a46afdSAlex Elder }
724205a46afdSAlex Elder 
do_rbd_remove(const char * buf,size_t count)724375cff725SGreg Kroah-Hartman static ssize_t do_rbd_remove(const char *buf, size_t count)
7244602adf40SYehuda Sadeh {
7245602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
7246751cc0e3SAlex Elder 	int dev_id;
72470276dca6SMike Christie 	char opt_buf[6];
72480276dca6SMike Christie 	bool force = false;
72490d8189e1SAlex Elder 	int ret;
7250602adf40SYehuda Sadeh 
7251f44d04e6SIlya Dryomov 	if (!capable(CAP_SYS_ADMIN))
7252f44d04e6SIlya Dryomov 		return -EPERM;
7253f44d04e6SIlya Dryomov 
72540276dca6SMike Christie 	dev_id = -1;
72550276dca6SMike Christie 	opt_buf[0] = '\0';
72560276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
72570276dca6SMike Christie 	if (dev_id < 0) {
72580276dca6SMike Christie 		pr_err("dev_id out of range\n");
7259602adf40SYehuda Sadeh 		return -EINVAL;
72600276dca6SMike Christie 	}
72610276dca6SMike Christie 	if (opt_buf[0] != '\0') {
72620276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
72630276dca6SMike Christie 			force = true;
72640276dca6SMike Christie 		} else {
72650276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
72660276dca6SMike Christie 			return -EINVAL;
72670276dca6SMike Christie 		}
72680276dca6SMike Christie 	}
7269602adf40SYehuda Sadeh 
7270602adf40SYehuda Sadeh 	ret = -ENOENT;
7271751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
7272cd59cdefSJinjie Ruan 	list_for_each_entry(rbd_dev, &rbd_dev_list, node) {
7273751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
7274751cc0e3SAlex Elder 			ret = 0;
7275751cc0e3SAlex Elder 			break;
7276602adf40SYehuda Sadeh 		}
7277751cc0e3SAlex Elder 	}
7278751cc0e3SAlex Elder 	if (!ret) {
7279a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
72800276dca6SMike Christie 		if (rbd_dev->open_count && !force)
728142382b70SAlex Elder 			ret = -EBUSY;
728285f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
728385f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
728485f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
7285a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
7286751cc0e3SAlex Elder 	}
7287751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
728885f5a4d6SIlya Dryomov 	if (ret)
72891ba0f1e7SAlex Elder 		return ret;
7290751cc0e3SAlex Elder 
72910276dca6SMike Christie 	if (force) {
72920276dca6SMike Christie 		/*
72930276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
72940276dca6SMike Christie 		 * IO to complete/fail.
72950276dca6SMike Christie 		 */
72960276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
72977a5428dcSChristoph Hellwig 		blk_mark_disk_dead(rbd_dev->disk);
72980276dca6SMike Christie 	}
72990276dca6SMike Christie 
73005769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
73015769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
73025769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
73035769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
73045769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
7305fca27065SIlya Dryomov 
7306e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
7307dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
73088ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
73098b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
73101ba0f1e7SAlex Elder 	return count;
7311602adf40SYehuda Sadeh }
7312602adf40SYehuda Sadeh 
remove_store(const struct bus_type * bus,const char * buf,size_t count)731375cff725SGreg Kroah-Hartman static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count)
73149b60e70bSIlya Dryomov {
73159b60e70bSIlya Dryomov 	if (single_major)
73169b60e70bSIlya Dryomov 		return -EINVAL;
73179b60e70bSIlya Dryomov 
731875cff725SGreg Kroah-Hartman 	return do_rbd_remove(buf, count);
73199b60e70bSIlya Dryomov }
73209b60e70bSIlya Dryomov 
remove_single_major_store(const struct bus_type * bus,const char * buf,size_t count)732175cff725SGreg Kroah-Hartman static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
73229b60e70bSIlya Dryomov 					 size_t count)
73239b60e70bSIlya Dryomov {
732475cff725SGreg Kroah-Hartman 	return do_rbd_remove(buf, count);
73259b60e70bSIlya Dryomov }
73269b60e70bSIlya Dryomov 
7327602adf40SYehuda Sadeh /*
7328602adf40SYehuda Sadeh  * create control files in sysfs
7329dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
7330602adf40SYehuda Sadeh  */
rbd_sysfs_init(void)73317d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
7332602adf40SYehuda Sadeh {
7333dfc5606dSYehuda Sadeh 	int ret;
7334602adf40SYehuda Sadeh 
7335fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
73367f21735fSYang Yingliang 	if (ret < 0) {
73377f21735fSYang Yingliang 		put_device(&rbd_root_dev);
7338dfc5606dSYehuda Sadeh 		return ret;
73397f21735fSYang Yingliang 	}
7340602adf40SYehuda Sadeh 
7341fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
7342fed4c143SAlex Elder 	if (ret < 0)
7343fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
7344602adf40SYehuda Sadeh 
7345602adf40SYehuda Sadeh 	return ret;
7346602adf40SYehuda Sadeh }
7347602adf40SYehuda Sadeh 
rbd_sysfs_cleanup(void)73487d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
7349602adf40SYehuda Sadeh {
7350dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
7351fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
7352602adf40SYehuda Sadeh }
7353602adf40SYehuda Sadeh 
rbd_slab_init(void)73547d8dc534SChengguang Xu static int __init rbd_slab_init(void)
73551c2a9dfeSAlex Elder {
73561c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
735703d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7358868311b1SAlex Elder 	if (!rbd_img_request_cache)
7359868311b1SAlex Elder 		return -ENOMEM;
7360868311b1SAlex Elder 
7361868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
736203d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
736378c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
736478c2a44aSAlex Elder 		goto out_err;
736578c2a44aSAlex Elder 
73661c2a9dfeSAlex Elder 	return 0;
73671c2a9dfeSAlex Elder 
73686c696d85SIlya Dryomov out_err:
7369868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
7370868311b1SAlex Elder 	rbd_img_request_cache = NULL;
73711c2a9dfeSAlex Elder 	return -ENOMEM;
73721c2a9dfeSAlex Elder }
73731c2a9dfeSAlex Elder 
rbd_slab_exit(void)73741c2a9dfeSAlex Elder static void rbd_slab_exit(void)
73751c2a9dfeSAlex Elder {
7376868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
7377868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
7378868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
7379868311b1SAlex Elder 
73801c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
73811c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
73821c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
73831c2a9dfeSAlex Elder }
73841c2a9dfeSAlex Elder 
rbd_init(void)7385cc344fa1SAlex Elder static int __init rbd_init(void)
7386602adf40SYehuda Sadeh {
7387602adf40SYehuda Sadeh 	int rc;
7388602adf40SYehuda Sadeh 
73891e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
73901e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
73911e32d34cSAlex Elder 		return -EINVAL;
73921e32d34cSAlex Elder 	}
7393e1b4d96dSIlya Dryomov 
73941c2a9dfeSAlex Elder 	rc = rbd_slab_init();
7395602adf40SYehuda Sadeh 	if (rc)
7396602adf40SYehuda Sadeh 		return rc;
7397e1b4d96dSIlya Dryomov 
7398f5ee37bdSIlya Dryomov 	/*
7399f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
7400f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
7401f5ee37bdSIlya Dryomov 	 */
7402f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7403f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
7404f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
7405f5ee37bdSIlya Dryomov 		goto err_out_slab;
7406f5ee37bdSIlya Dryomov 	}
7407f5ee37bdSIlya Dryomov 
74089b60e70bSIlya Dryomov 	if (single_major) {
74099b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
74109b60e70bSIlya Dryomov 		if (rbd_major < 0) {
74119b60e70bSIlya Dryomov 			rc = rbd_major;
7412f5ee37bdSIlya Dryomov 			goto err_out_wq;
74139b60e70bSIlya Dryomov 		}
74149b60e70bSIlya Dryomov 	}
74159b60e70bSIlya Dryomov 
74161c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
74171c2a9dfeSAlex Elder 	if (rc)
74189b60e70bSIlya Dryomov 		goto err_out_blkdev;
74191c2a9dfeSAlex Elder 
74209b60e70bSIlya Dryomov 	if (single_major)
74219b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
74229b60e70bSIlya Dryomov 	else
7423e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
74249b60e70bSIlya Dryomov 
7425e1b4d96dSIlya Dryomov 	return 0;
7426e1b4d96dSIlya Dryomov 
74279b60e70bSIlya Dryomov err_out_blkdev:
74289b60e70bSIlya Dryomov 	if (single_major)
74299b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7430f5ee37bdSIlya Dryomov err_out_wq:
7431f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
7432e1b4d96dSIlya Dryomov err_out_slab:
7433e1b4d96dSIlya Dryomov 	rbd_slab_exit();
74341c2a9dfeSAlex Elder 	return rc;
7435602adf40SYehuda Sadeh }
7436602adf40SYehuda Sadeh 
rbd_exit(void)7437cc344fa1SAlex Elder static void __exit rbd_exit(void)
7438602adf40SYehuda Sadeh {
7439ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
7440602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
74419b60e70bSIlya Dryomov 	if (single_major)
74429b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7443f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
74441c2a9dfeSAlex Elder 	rbd_slab_exit();
7445602adf40SYehuda Sadeh }
7446602adf40SYehuda Sadeh 
7447602adf40SYehuda Sadeh module_init(rbd_init);
7448602adf40SYehuda Sadeh module_exit(rbd_exit);
7449602adf40SYehuda Sadeh 
7450d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7451602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7452602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7453602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
7454602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7455602adf40SYehuda Sadeh 
745690da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7457602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
7458