xref: /openbmc/linux/drivers/block/rbd.c (revision 3a482501)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
3543df3d35SIlya Dryomov #include <linux/ceph/striper.h>
36602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3759c2be1eSYehuda Sadeh #include <linux/parser.h>
3830d1cff8SAlex Elder #include <linux/bsearch.h>
39602adf40SYehuda Sadeh 
40602adf40SYehuda Sadeh #include <linux/kernel.h>
41602adf40SYehuda Sadeh #include <linux/device.h>
42602adf40SYehuda Sadeh #include <linux/module.h>
437ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
44602adf40SYehuda Sadeh #include <linux/fs.h>
45602adf40SYehuda Sadeh #include <linux/blkdev.h>
461c2a9dfeSAlex Elder #include <linux/slab.h>
47f8a22fc2SIlya Dryomov #include <linux/idr.h>
48bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
49602adf40SYehuda Sadeh 
50602adf40SYehuda Sadeh #include "rbd_types.h"
51602adf40SYehuda Sadeh 
52aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* Snapshot names are limited to NAME_MAX less the "snap_" prefix length */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits (one bit per image feature) */

#define RBD_FEATURE_LAYERING		(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2		(1ULL << 1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL << 2)
#define RBD_FEATURE_DATA_POOL		(1ULL << 7)
#define RBD_FEATURE_OPERATIONS		(1ULL << 8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL				\
	(RBD_FEATURE_LAYERING |				\
	 RBD_FEATURE_STRIPINGV2 |			\
	 RBD_FEATURE_EXCLUSIVE_LOCK |			\
	 RBD_FEATURE_DATA_POOL |			\
	 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * DEV_NAME_LEN is the size of the buffer holding that name.
 */
#define DEV_NAME_LEN		32
136602adf40SYehuda Sadeh 
137602adf40SYehuda Sadeh /*
138602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
139602adf40SYehuda Sadeh  */
140602adf40SYehuda Sadeh struct rbd_image_header {
141f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
142849b4260SAlex Elder 	char *object_prefix;
143602adf40SYehuda Sadeh 	__u8 obj_order;
144f35a4deeSAlex Elder 	u64 stripe_unit;
145f35a4deeSAlex Elder 	u64 stripe_count;
1467e97332eSIlya Dryomov 	s64 data_pool_id;
147f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
148602adf40SYehuda Sadeh 
149f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
150f84344f3SAlex Elder 	u64 image_size;
151f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
152f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
153f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15459c2be1eSYehuda Sadeh };
15559c2be1eSYehuda Sadeh 
1560d7dbfceSAlex Elder /*
1570d7dbfceSAlex Elder  * An rbd image specification.
1580d7dbfceSAlex Elder  *
1590d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
160c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
161c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
162c66c6e0cSAlex Elder  *
163c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
164c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
165c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
166c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
167c66c6e0cSAlex Elder  *
168c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
169c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
170c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
171c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
172c66c6e0cSAlex Elder  * is shared between the parent and child).
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
175c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
176c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
177c66c6e0cSAlex Elder  *
178c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
179c66c6e0cSAlex Elder  * could be a null pointer).
1800d7dbfceSAlex Elder  */
1810d7dbfceSAlex Elder struct rbd_spec {
1820d7dbfceSAlex Elder 	u64		pool_id;
183ecb4dc22SAlex Elder 	const char	*pool_name;
184b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1850d7dbfceSAlex Elder 
186ecb4dc22SAlex Elder 	const char	*image_id;
187ecb4dc22SAlex Elder 	const char	*image_name;
1880d7dbfceSAlex Elder 
1890d7dbfceSAlex Elder 	u64		snap_id;
190ecb4dc22SAlex Elder 	const char	*snap_name;
1910d7dbfceSAlex Elder 
1920d7dbfceSAlex Elder 	struct kref	kref;
1930d7dbfceSAlex Elder };
1940d7dbfceSAlex Elder 
195602adf40SYehuda Sadeh /*
196f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
197602adf40SYehuda Sadeh  */
198602adf40SYehuda Sadeh struct rbd_client {
199602adf40SYehuda Sadeh 	struct ceph_client	*client;
200602adf40SYehuda Sadeh 	struct kref		kref;
201602adf40SYehuda Sadeh 	struct list_head	node;
202602adf40SYehuda Sadeh };
203602adf40SYehuda Sadeh 
204bf0d5f50SAlex Elder struct rbd_img_request;
205bf0d5f50SAlex Elder 
/* How the data of an object request is supplied; numbering starts at 1 */
enum obj_request_type {
	OBJ_REQUEST_NODATA	= 1,
	OBJ_REQUEST_BIO		= 2,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS	= 3,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS	= 4,	/* private bio_vec array, doesn't own pages */
};
212bf0d5f50SAlex Elder 
/* Operation carried by a request; numbering starts at 1 */
enum obj_operation_type {
	OBJ_OP_READ	= 1,
	OBJ_OP_WRITE	= 2,
	OBJ_OP_DISCARD	= 3,
	OBJ_OP_ZEROOUT	= 4,
};
2196d2940c8SGuangliang Zhao 
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 *
 * NOTE(review): RBD_OBJ_WRITE_COPYUP in the diagram is not a member of
 * enum rbd_obj_write_state; the enum has RBD_OBJ_WRITE_READ_FROM_PARENT
 * and RBD_OBJ_WRITE_COPYUP_OPS instead.  Presumably the copyup step is
 * now those two states in sequence -- confirm against the write state
 * handler and redraw the diagram.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT		= 1,
	RBD_OBJ_WRITE_GUARD		= 2,
	RBD_OBJ_WRITE_READ_FROM_PARENT	= 3,
	RBD_OBJ_WRITE_COPYUP_OPS	= 4,
};
242926f9b3fSAlex Elder 
243bf0d5f50SAlex Elder struct rbd_obj_request {
24443df3d35SIlya Dryomov 	struct ceph_object_extent ex;
245c5b5ef6cSAlex Elder 	union {
2463da691bfSIlya Dryomov 		bool			tried_parent;	/* for reads */
2473da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2483da691bfSIlya Dryomov 	};
249bf0d5f50SAlex Elder 
250bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
25186bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
25286bd7998SIlya Dryomov 	u32			num_img_extents;
253bf0d5f50SAlex Elder 
254788e2df3SAlex Elder 	union {
2555359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
256788e2df3SAlex Elder 		struct {
2577e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
2587e07efb1SIlya Dryomov 			u32			bvec_count;
259afb97888SIlya Dryomov 			u32			bvec_idx;
260788e2df3SAlex Elder 		};
261788e2df3SAlex Elder 	};
2627e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
2637e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2681b83bef2SSage Weil 	int			result;
269bf0d5f50SAlex Elder 
270bf0d5f50SAlex Elder 	struct kref		kref;
271bf0d5f50SAlex Elder };
272bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request.flags */
enum img_req_flags {
	IMG_REQ_CHILD	= 0,	/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED	= 1,	/* ENOENT handling: normal = 0, layered = 1 */
};
2770c425248SAlex Elder 
278bf0d5f50SAlex Elder struct rbd_img_request {
279bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
2809bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
281ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
2820c425248SAlex Elder 	unsigned long		flags;
283bf0d5f50SAlex Elder 	union {
284bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2859849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2869849e986SAlex Elder 	};
2879849e986SAlex Elder 	union {
2889849e986SAlex Elder 		struct request		*rq;		/* block request */
2899849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
290bf0d5f50SAlex Elder 	};
29115961b44SIlya Dryomov 	spinlock_t		completion_lock;
29255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
293a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
294bf0d5f50SAlex Elder 
29543df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
2967114edacSIlya Dryomov 	u32			pending_count;
297bf0d5f50SAlex Elder 
298bf0d5f50SAlex Elder 	struct kref		kref;
299bf0d5f50SAlex Elder };
300bf0d5f50SAlex Elder 
/* Walk the object requests linked on an image request's object_extents */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)

/* Same walk using list_for_each_entry_safe(); n is the lookahead cursor */
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
305bf0d5f50SAlex Elder 
/* Values of rbd_device.watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED	= 0,
	RBD_WATCH_STATE_REGISTERED	= 1,
	RBD_WATCH_STATE_ERROR		= 2,
};
31199d16943SIlya Dryomov 
/* Values of rbd_device.lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED		= 0,
	RBD_LOCK_STATE_LOCKED		= 1,
	RBD_LOCK_STATE_RELEASING	= 2,
};
317ed95b21aSIlya Dryomov 
318ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
319ed95b21aSIlya Dryomov struct rbd_client_id {
320ed95b21aSIlya Dryomov 	u64 gid;
321ed95b21aSIlya Dryomov 	u64 handle;
322ed95b21aSIlya Dryomov };
323ed95b21aSIlya Dryomov 
324f84344f3SAlex Elder struct rbd_mapping {
32599c1f08fSAlex Elder 	u64                     size;
32634b13184SAlex Elder 	u64                     features;
327f84344f3SAlex Elder };
328f84344f3SAlex Elder 
329602adf40SYehuda Sadeh /*
330602adf40SYehuda Sadeh  * a single device
331602adf40SYehuda Sadeh  */
332602adf40SYehuda Sadeh struct rbd_device {
333de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
334602adf40SYehuda Sadeh 
335602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
336dd82fff1SIlya Dryomov 	int			minor;
337602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
338602adf40SYehuda Sadeh 
339a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
340602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
341602adf40SYehuda Sadeh 
342602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
343602adf40SYehuda Sadeh 
344b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
345602adf40SYehuda Sadeh 
346602adf40SYehuda Sadeh 	struct rbd_image_header	header;
347b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3480d7dbfceSAlex Elder 	struct rbd_spec		*spec;
349d147543dSIlya Dryomov 	struct rbd_options	*opts;
3500d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
351602adf40SYehuda Sadeh 
352c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
353922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
354971f839aSAlex Elder 
3551643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3560903e875SAlex Elder 
35799d16943SIlya Dryomov 	struct mutex		watch_mutex;
35899d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
359922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
36099d16943SIlya Dryomov 	u64			watch_cookie;
36199d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
36259c2be1eSYehuda Sadeh 
363ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
364ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
365cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
366ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
367ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
368ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
369ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
370ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
371ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
372ed95b21aSIlya Dryomov 
3731643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
374602adf40SYehuda Sadeh 
37586b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
37686b00e0dSAlex Elder 	u64			parent_overlap;
377a2acd00eSAlex Elder 	atomic_t		parent_ref;
3782f82ee54SAlex Elder 	struct rbd_device	*parent;
37986b00e0dSAlex Elder 
3807ad18afaSChristoph Hellwig 	/* Block layer tags. */
3817ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
3827ad18afaSChristoph Hellwig 
383c666601aSJosh Durgin 	/* protects updating the header */
384c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
385f84344f3SAlex Elder 
386f84344f3SAlex Elder 	struct rbd_mapping	mapping;
387602adf40SYehuda Sadeh 
388602adf40SYehuda Sadeh 	struct list_head	node;
389dfc5606dSYehuda Sadeh 
390dfc5606dSYehuda Sadeh 	/* sysfs related */
391dfc5606dSYehuda Sadeh 	struct device		dev;
392b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
393dfc5606dSYehuda Sadeh };
394dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Locking:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS		= 0,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING		= 1,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED	= 2,	/* our ceph_client is blacklisted */
};
4066d292906SAlex Elder 
407cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
408e124a82fSAlex Elder 
409602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
410e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
411e124a82fSAlex Elder 
412602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
413432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
414602adf40SYehuda Sadeh 
41578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
41678c2a44aSAlex Elder 
4171c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
418868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4191c2a9dfeSAlex Elder 
4209b60e70bSIlya Dryomov static int rbd_major;
421f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
422f8a22fc2SIlya Dryomov 
423f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
424f5ee37bdSIlya Dryomov 
4259b60e70bSIlya Dryomov /*
4263cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4279b60e70bSIlya Dryomov  */
4283cfa3b16SIlya Dryomov static bool single_major = true;
4295657a819SJoe Perches module_param(single_major, bool, 0444);
4303cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4319b60e70bSIlya Dryomov 
432f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
433f0f8cef5SAlex Elder 		       size_t count);
434f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
435f0f8cef5SAlex Elder 			  size_t count);
4369b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4379b60e70bSIlya Dryomov 				    size_t count);
4389b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4399b60e70bSIlya Dryomov 				       size_t count);
4406d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
441f0f8cef5SAlex Elder 
4429b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4439b60e70bSIlya Dryomov {
4447e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4459b60e70bSIlya Dryomov }
4469b60e70bSIlya Dryomov 
4479b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4489b60e70bSIlya Dryomov {
4497e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4509b60e70bSIlya Dryomov }
4519b60e70bSIlya Dryomov 
452ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
453ed95b21aSIlya Dryomov {
454ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
455ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
456ed95b21aSIlya Dryomov }
457ed95b21aSIlya Dryomov 
458ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
459ed95b21aSIlya Dryomov {
460ed95b21aSIlya Dryomov 	bool is_lock_owner;
461ed95b21aSIlya Dryomov 
462ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
463ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
464ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
465ed95b21aSIlya Dryomov 	return is_lock_owner;
466ed95b21aSIlya Dryomov }
467ed95b21aSIlya Dryomov 
4688767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
4698767b293SIlya Dryomov {
4708767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
4718767b293SIlya Dryomov }
4728767b293SIlya Dryomov 
4735657a819SJoe Perches static BUS_ATTR(add, 0200, NULL, rbd_add);
4745657a819SJoe Perches static BUS_ATTR(remove, 0200, NULL, rbd_remove);
4755657a819SJoe Perches static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
4765657a819SJoe Perches static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
4775657a819SJoe Perches static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
478b15a21ddSGreg Kroah-Hartman 
479b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
480b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
481b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4829b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4839b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
4848767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
485b15a21ddSGreg Kroah-Hartman 	NULL,
486f0f8cef5SAlex Elder };
48792c76dc0SIlya Dryomov 
48892c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
48992c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
49092c76dc0SIlya Dryomov {
4919b60e70bSIlya Dryomov 	if (!single_major &&
4929b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4939b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4949b60e70bSIlya Dryomov 		return 0;
4959b60e70bSIlya Dryomov 
49692c76dc0SIlya Dryomov 	return attr->mode;
49792c76dc0SIlya Dryomov }
49892c76dc0SIlya Dryomov 
49992c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
50092c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
50192c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
50292c76dc0SIlya Dryomov };
50392c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
504f0f8cef5SAlex Elder 
505f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
506f0f8cef5SAlex Elder 	.name		= "rbd",
507b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
508f0f8cef5SAlex Elder };
509f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
	/* empty on purpose */
}
513f0f8cef5SAlex Elder 
514f0f8cef5SAlex Elder static struct device rbd_root_dev = {
515f0f8cef5SAlex Elder 	.init_name =    "rbd",
516f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
517f0f8cef5SAlex Elder };
518f0f8cef5SAlex Elder 
51906ecc6cbSAlex Elder static __printf(2, 3)
52006ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
52106ecc6cbSAlex Elder {
52206ecc6cbSAlex Elder 	struct va_format vaf;
52306ecc6cbSAlex Elder 	va_list args;
52406ecc6cbSAlex Elder 
52506ecc6cbSAlex Elder 	va_start(args, fmt);
52606ecc6cbSAlex Elder 	vaf.fmt = fmt;
52706ecc6cbSAlex Elder 	vaf.va = &args;
52806ecc6cbSAlex Elder 
52906ecc6cbSAlex Elder 	if (!rbd_dev)
53006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
53106ecc6cbSAlex Elder 	else if (rbd_dev->disk)
53206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
53306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
53406ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
53506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
53606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
53706ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
53806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
53906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
54006ecc6cbSAlex Elder 	else	/* punt */
54106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
54206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
54306ecc6cbSAlex Elder 	va_end(args);
54406ecc6cbSAlex Elder }
54506ecc6cbSAlex Elder 
#ifdef RBD_DEBUG
/*
 * rbd_assert(expr): if expr is false, log the enclosing function, the
 * line number and the expression text at KERN_ERR, then BUG().
 *
 * The body is wrapped in do { } while (0) so the macro expands to one
 * statement: a bare "if { }" would capture a following "else" (or make
 * it a syntax error) when used as "if (x) rbd_assert(y); else ...".
 * Callers must terminate the invocation with a semicolon.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
/* Assertions compiled out: expr is not evaluated */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
558dfc5606dSYehuda Sadeh 
55905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5608b3e1a56SAlex Elder 
561cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5622df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
563a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
564e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
56554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
56654cac61fSAlex Elder 					u64 snap_id);
5672ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5682ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5692ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5702ad3d716SAlex Elder 		u64 *snap_features);
57159c2be1eSYehuda Sadeh 
572602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
573602adf40SYehuda Sadeh {
574f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
575b82d167bSAlex Elder 	bool removing = false;
576602adf40SYehuda Sadeh 
577a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
578b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
579b82d167bSAlex Elder 		removing = true;
580b82d167bSAlex Elder 	else
581b82d167bSAlex Elder 		rbd_dev->open_count++;
582a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
583b82d167bSAlex Elder 	if (removing)
584b82d167bSAlex Elder 		return -ENOENT;
585b82d167bSAlex Elder 
586c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
587340c7a2bSAlex Elder 
588602adf40SYehuda Sadeh 	return 0;
589602adf40SYehuda Sadeh }
590602adf40SYehuda Sadeh 
591db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
592dfc5606dSYehuda Sadeh {
593dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
594b82d167bSAlex Elder 	unsigned long open_count_before;
595b82d167bSAlex Elder 
596a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
597b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
598a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
599b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
600dfc5606dSYehuda Sadeh 
601c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
602dfc5606dSYehuda Sadeh }
603dfc5606dSYehuda Sadeh 
604131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
605131fd9f6SGuangliang Zhao {
6061de797bbSIlya Dryomov 	int ro;
607131fd9f6SGuangliang Zhao 
6081de797bbSIlya Dryomov 	if (get_user(ro, (int __user *)arg))
609131fd9f6SGuangliang Zhao 		return -EFAULT;
610131fd9f6SGuangliang Zhao 
6111de797bbSIlya Dryomov 	/* Snapshots can't be marked read-write */
612131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
613131fd9f6SGuangliang Zhao 		return -EROFS;
614131fd9f6SGuangliang Zhao 
6151de797bbSIlya Dryomov 	/* Let blkdev_roset() handle it */
6161de797bbSIlya Dryomov 	return -ENOTTY;
617131fd9f6SGuangliang Zhao }
618131fd9f6SGuangliang Zhao 
619131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
620131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
621131fd9f6SGuangliang Zhao {
622131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
6231de797bbSIlya Dryomov 	int ret;
624131fd9f6SGuangliang Zhao 
625131fd9f6SGuangliang Zhao 	switch (cmd) {
626131fd9f6SGuangliang Zhao 	case BLKROSET:
627131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
628131fd9f6SGuangliang Zhao 		break;
629131fd9f6SGuangliang Zhao 	default:
630131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
631131fd9f6SGuangliang Zhao 	}
632131fd9f6SGuangliang Zhao 
633131fd9f6SGuangliang Zhao 	return ret;
634131fd9f6SGuangliang Zhao }
635131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards unchanged to rbd_ioctl() */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
643131fd9f6SGuangliang Zhao 
644602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
645602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
646602adf40SYehuda Sadeh 	.open			= rbd_open,
647dfc5606dSYehuda Sadeh 	.release		= rbd_release,
648131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
649131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
650131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
651131fd9f6SGuangliang Zhao #endif
652602adf40SYehuda Sadeh };
653602adf40SYehuda Sadeh 
654602adf40SYehuda Sadeh /*
6557262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
656cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
657602adf40SYehuda Sadeh  */
658f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
659602adf40SYehuda Sadeh {
660602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
661602adf40SYehuda Sadeh 	int ret = -ENOMEM;
662602adf40SYehuda Sadeh 
66337206ee5SAlex Elder 	dout("%s:\n", __func__);
664602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
665602adf40SYehuda Sadeh 	if (!rbdc)
666602adf40SYehuda Sadeh 		goto out_opt;
667602adf40SYehuda Sadeh 
668602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
669602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
670602adf40SYehuda Sadeh 
67174da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
672602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
67308f75463SAlex Elder 		goto out_rbdc;
67443ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
675602adf40SYehuda Sadeh 
676602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
677602adf40SYehuda Sadeh 	if (ret < 0)
67808f75463SAlex Elder 		goto out_client;
679602adf40SYehuda Sadeh 
680432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
681602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
682432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
683602adf40SYehuda Sadeh 
68437206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
685bc534d86SAlex Elder 
686602adf40SYehuda Sadeh 	return rbdc;
68708f75463SAlex Elder out_client:
688602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
68908f75463SAlex Elder out_rbdc:
690602adf40SYehuda Sadeh 	kfree(rbdc);
691602adf40SYehuda Sadeh out_opt:
69243ae4701SAlex Elder 	if (ceph_opts)
69343ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
69437206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
69537206ee5SAlex Elder 
69628f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
697602adf40SYehuda Sadeh }
698602adf40SYehuda Sadeh 
6992f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7002f82ee54SAlex Elder {
7012f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7022f82ee54SAlex Elder 
7032f82ee54SAlex Elder 	return rbdc;
7042f82ee54SAlex Elder }
7052f82ee54SAlex Elder 
706602adf40SYehuda Sadeh /*
7071f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7081f7ba331SAlex Elder  * found, bump its reference count.
709602adf40SYehuda Sadeh  */
7101f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
711602adf40SYehuda Sadeh {
712602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7131f7ba331SAlex Elder 	bool found = false;
714602adf40SYehuda Sadeh 
71543ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
716602adf40SYehuda Sadeh 		return NULL;
717602adf40SYehuda Sadeh 
7181f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7191f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7201f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7212f82ee54SAlex Elder 			__rbd_get_client(client_node);
7222f82ee54SAlex Elder 
7231f7ba331SAlex Elder 			found = true;
7241f7ba331SAlex Elder 			break;
7251f7ba331SAlex Elder 		}
7261f7ba331SAlex Elder 	}
7271f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7281f7ba331SAlex Elder 
7291f7ba331SAlex Elder 	return found ? client_node : NULL;
730602adf40SYehuda Sadeh }
731602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Token ids are grouped by the kind of argument they take.
 * Opt_last_int and Opt_last_string are boundary markers that
 * parse_rbd_opts_token() compares against to tell the groups apart,
 * so the relative order of the entries matters.
 */
enum {
	/* int args */
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,

	/* string args */
	Opt_pool_ns,
	Opt_last_string,

	/* no args */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,

	Opt_err
};
75159c2be1eSYehuda Sadeh 
75243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
753b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
7540c93e1b7SIlya Dryomov 	{Opt_alloc_size, "alloc_size=%d"},
75534f55d0bSDongsheng Yang 	{Opt_lock_timeout, "lock_timeout=%d"},
75659c2be1eSYehuda Sadeh 	/* int args above */
757b26c047bSIlya Dryomov 	{Opt_pool_ns, "_pool_ns=%s"},
75859c2be1eSYehuda Sadeh 	/* string args above */
759be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
760cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
761cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
762cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
76380de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
764e010dd0aSIlya Dryomov 	{Opt_exclusive, "exclusive"},
765d9360540SIlya Dryomov 	{Opt_notrim, "notrim"},
766210c104cSIlya Dryomov 	{Opt_err, NULL}
76759c2be1eSYehuda Sadeh };
76859c2be1eSYehuda Sadeh 
76998571b5aSAlex Elder struct rbd_options {
770b5584180SIlya Dryomov 	int	queue_depth;
7710c93e1b7SIlya Dryomov 	int	alloc_size;
77234f55d0bSDongsheng Yang 	unsigned long	lock_timeout;
77398571b5aSAlex Elder 	bool	read_only;
77480de1912SIlya Dryomov 	bool	lock_on_read;
775e010dd0aSIlya Dryomov 	bool	exclusive;
776d9360540SIlya Dryomov 	bool	trim;
77798571b5aSAlex Elder };
77898571b5aSAlex Elder 
/* Defaults for the fields of struct rbd_options */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0	/* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
78698571b5aSAlex Elder 
/*
 * Context handed to parse_rbd_opts_token() through its void *private
 * argument: the two objects that parsed option values are stored in.
 */
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;	/* receives pool_ns */
	struct rbd_options	*opts;	/* receives everything else */
};
791c300156bSIlya Dryomov 
79259c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
79359c2be1eSYehuda Sadeh {
794c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx *pctx = private;
79559c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
79659c2be1eSYehuda Sadeh 	int token, intval, ret;
79759c2be1eSYehuda Sadeh 
79843ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
79959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
80059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
80159c2be1eSYehuda Sadeh 		if (ret < 0) {
8022f56b6baSIlya Dryomov 			pr_err("bad option arg (not int) at '%s'\n", c);
80359c2be1eSYehuda Sadeh 			return ret;
80459c2be1eSYehuda Sadeh 		}
80559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
80659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
807210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
80859c2be1eSYehuda Sadeh 	} else {
80959c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
81059c2be1eSYehuda Sadeh 	}
81159c2be1eSYehuda Sadeh 
81259c2be1eSYehuda Sadeh 	switch (token) {
813b5584180SIlya Dryomov 	case Opt_queue_depth:
814b5584180SIlya Dryomov 		if (intval < 1) {
815b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
816b5584180SIlya Dryomov 			return -EINVAL;
817b5584180SIlya Dryomov 		}
818c300156bSIlya Dryomov 		pctx->opts->queue_depth = intval;
819b5584180SIlya Dryomov 		break;
8200c93e1b7SIlya Dryomov 	case Opt_alloc_size:
8210c93e1b7SIlya Dryomov 		if (intval < 1) {
8220c93e1b7SIlya Dryomov 			pr_err("alloc_size out of range\n");
8230c93e1b7SIlya Dryomov 			return -EINVAL;
8240c93e1b7SIlya Dryomov 		}
8250c93e1b7SIlya Dryomov 		if (!is_power_of_2(intval)) {
8260c93e1b7SIlya Dryomov 			pr_err("alloc_size must be a power of 2\n");
8270c93e1b7SIlya Dryomov 			return -EINVAL;
8280c93e1b7SIlya Dryomov 		}
8290c93e1b7SIlya Dryomov 		pctx->opts->alloc_size = intval;
8300c93e1b7SIlya Dryomov 		break;
83134f55d0bSDongsheng Yang 	case Opt_lock_timeout:
83234f55d0bSDongsheng Yang 		/* 0 is "wait forever" (i.e. infinite timeout) */
83334f55d0bSDongsheng Yang 		if (intval < 0 || intval > INT_MAX / 1000) {
83434f55d0bSDongsheng Yang 			pr_err("lock_timeout out of range\n");
83534f55d0bSDongsheng Yang 			return -EINVAL;
83634f55d0bSDongsheng Yang 		}
837c300156bSIlya Dryomov 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
83834f55d0bSDongsheng Yang 		break;
839b26c047bSIlya Dryomov 	case Opt_pool_ns:
840b26c047bSIlya Dryomov 		kfree(pctx->spec->pool_ns);
841b26c047bSIlya Dryomov 		pctx->spec->pool_ns = match_strdup(argstr);
842b26c047bSIlya Dryomov 		if (!pctx->spec->pool_ns)
843b26c047bSIlya Dryomov 			return -ENOMEM;
84459c2be1eSYehuda Sadeh 		break;
845cc0538b6SAlex Elder 	case Opt_read_only:
846c300156bSIlya Dryomov 		pctx->opts->read_only = true;
847cc0538b6SAlex Elder 		break;
848cc0538b6SAlex Elder 	case Opt_read_write:
849c300156bSIlya Dryomov 		pctx->opts->read_only = false;
850cc0538b6SAlex Elder 		break;
85180de1912SIlya Dryomov 	case Opt_lock_on_read:
852c300156bSIlya Dryomov 		pctx->opts->lock_on_read = true;
85380de1912SIlya Dryomov 		break;
854e010dd0aSIlya Dryomov 	case Opt_exclusive:
855c300156bSIlya Dryomov 		pctx->opts->exclusive = true;
856e010dd0aSIlya Dryomov 		break;
857d9360540SIlya Dryomov 	case Opt_notrim:
858c300156bSIlya Dryomov 		pctx->opts->trim = false;
859d9360540SIlya Dryomov 		break;
86059c2be1eSYehuda Sadeh 	default:
861210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
862210c104cSIlya Dryomov 		return -EINVAL;
86359c2be1eSYehuda Sadeh 	}
864210c104cSIlya Dryomov 
86559c2be1eSYehuda Sadeh 	return 0;
86659c2be1eSYehuda Sadeh }
86759c2be1eSYehuda Sadeh 
8686d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8696d2940c8SGuangliang Zhao {
8706d2940c8SGuangliang Zhao 	switch (op_type) {
8716d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8726d2940c8SGuangliang Zhao 		return "read";
8736d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8746d2940c8SGuangliang Zhao 		return "write";
87590e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
87690e98c52SGuangliang Zhao 		return "discard";
8776484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
8786484cbe9SIlya Dryomov 		return "zeroout";
8796d2940c8SGuangliang Zhao 	default:
8806d2940c8SGuangliang Zhao 		return "???";
8816d2940c8SGuangliang Zhao 	}
8826d2940c8SGuangliang Zhao }
8836d2940c8SGuangliang Zhao 
88459c2be1eSYehuda Sadeh /*
885602adf40SYehuda Sadeh  * Destroy ceph client
886d23a4b3fSAlex Elder  *
887432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
888602adf40SYehuda Sadeh  */
889602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
890602adf40SYehuda Sadeh {
891602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
892602adf40SYehuda Sadeh 
89337206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
894cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
895602adf40SYehuda Sadeh 	list_del(&rbdc->node);
896cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
897602adf40SYehuda Sadeh 
898602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
899602adf40SYehuda Sadeh 	kfree(rbdc);
900602adf40SYehuda Sadeh }
901602adf40SYehuda Sadeh 
902602adf40SYehuda Sadeh /*
903602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
904602adf40SYehuda Sadeh  * it.
905602adf40SYehuda Sadeh  */
9069d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
907602adf40SYehuda Sadeh {
908c53d5893SAlex Elder 	if (rbdc)
9099d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
910602adf40SYehuda Sadeh }
911602adf40SYehuda Sadeh 
912dd435855SIlya Dryomov static int wait_for_latest_osdmap(struct ceph_client *client)
913dd435855SIlya Dryomov {
914dd435855SIlya Dryomov 	u64 newest_epoch;
915dd435855SIlya Dryomov 	int ret;
916dd435855SIlya Dryomov 
917dd435855SIlya Dryomov 	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
918dd435855SIlya Dryomov 	if (ret)
919dd435855SIlya Dryomov 		return ret;
920dd435855SIlya Dryomov 
921dd435855SIlya Dryomov 	if (client->osdc.osdmap->epoch >= newest_epoch)
922dd435855SIlya Dryomov 		return 0;
923dd435855SIlya Dryomov 
924dd435855SIlya Dryomov 	ceph_osdc_maybe_request_map(&client->osdc);
925dd435855SIlya Dryomov 	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
926dd435855SIlya Dryomov 				     client->options->mount_timeout);
927dd435855SIlya Dryomov }
928dd435855SIlya Dryomov 
9295feb0d8dSIlya Dryomov /*
9305feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
9315feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
9325feb0d8dSIlya Dryomov  * function.
9335feb0d8dSIlya Dryomov  */
9345feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9355feb0d8dSIlya Dryomov {
9365feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
937dd435855SIlya Dryomov 	int ret;
9385feb0d8dSIlya Dryomov 
9395feb0d8dSIlya Dryomov 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
9405feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
941dd435855SIlya Dryomov 	if (rbdc) {
9425feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
943dd435855SIlya Dryomov 
944dd435855SIlya Dryomov 		/*
945dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
946dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
947dd435855SIlya Dryomov 		 */
948dd435855SIlya Dryomov 		ret = wait_for_latest_osdmap(rbdc->client);
949dd435855SIlya Dryomov 		if (ret) {
950dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
951dd435855SIlya Dryomov 			rbd_put_client(rbdc);
952dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
953dd435855SIlya Dryomov 		}
954dd435855SIlya Dryomov 	} else {
9555feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
956dd435855SIlya Dryomov 	}
9575feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
9585feb0d8dSIlya Dryomov 
9595feb0d8dSIlya Dryomov 	return rbdc;
9605feb0d8dSIlya Dryomov }
9615feb0d8dSIlya Dryomov 
962a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
963a30b71b9SAlex Elder {
964a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
965a30b71b9SAlex Elder }
966a30b71b9SAlex Elder 
9678e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9688e94af8eSAlex Elder {
969103a150fSAlex Elder 	size_t size;
970103a150fSAlex Elder 	u32 snap_count;
971103a150fSAlex Elder 
972103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
973103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
974103a150fSAlex Elder 		return false;
975103a150fSAlex Elder 
976db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
977db2388b6SAlex Elder 
978db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
979db2388b6SAlex Elder 		return false;
980db2388b6SAlex Elder 
981db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
982db2388b6SAlex Elder 
983db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
984db2388b6SAlex Elder 		return false;
985db2388b6SAlex Elder 
986103a150fSAlex Elder 	/*
987103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
988103a150fSAlex Elder 	 * that limits the number of snapshots.
989103a150fSAlex Elder 	 */
990103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
991103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
992103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
993103a150fSAlex Elder 		return false;
994103a150fSAlex Elder 
995103a150fSAlex Elder 	/*
996103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
997103a150fSAlex Elder 	 * header must also be representable in a size_t.
998103a150fSAlex Elder 	 */
999103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
1000103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1001103a150fSAlex Elder 		return false;
1002103a150fSAlex Elder 
1003103a150fSAlex Elder 	return true;
10048e94af8eSAlex Elder }
10058e94af8eSAlex Elder 
1006602adf40SYehuda Sadeh /*
10075bc3fb17SIlya Dryomov  * returns the size of an object in the image
10085bc3fb17SIlya Dryomov  */
10095bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
10105bc3fb17SIlya Dryomov {
10115bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
10125bc3fb17SIlya Dryomov }
10135bc3fb17SIlya Dryomov 
1014263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
1015263423f8SIlya Dryomov {
1016263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
1017263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
1018263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1019263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
1020263423f8SIlya Dryomov 	}
1021263423f8SIlya Dryomov 
1022263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1023263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1024263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
10257e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
10267e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1027263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1028263423f8SIlya Dryomov }
1029263423f8SIlya Dryomov 
10305bc3fb17SIlya Dryomov /*
1031bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1032bb23e37aSAlex Elder  * on-disk header.
1033602adf40SYehuda Sadeh  */
1034662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10354156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1036602adf40SYehuda Sadeh {
1037662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1038bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1039bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1040bb23e37aSAlex Elder 	char *object_prefix = NULL;
1041bb23e37aSAlex Elder 	char *snap_names = NULL;
1042bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1043ccece235SAlex Elder 	u32 snap_count;
1044bb23e37aSAlex Elder 	int ret = -ENOMEM;
1045621901d6SAlex Elder 	u32 i;
1046602adf40SYehuda Sadeh 
1047bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1048103a150fSAlex Elder 
1049bb23e37aSAlex Elder 	if (first_time) {
1050848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1051848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1052848d796cSIlya Dryomov 					 GFP_KERNEL);
1053bb23e37aSAlex Elder 		if (!object_prefix)
1054602adf40SYehuda Sadeh 			return -ENOMEM;
1055bb23e37aSAlex Elder 	}
105600f1f36fSAlex Elder 
1057bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1058d2bb24e5SAlex Elder 
1059602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1060bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1061bb23e37aSAlex Elder 	if (!snapc)
1062bb23e37aSAlex Elder 		goto out_err;
1063bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1064602adf40SYehuda Sadeh 	if (snap_count) {
1065bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1066f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1067f785cc1dSAlex Elder 
1068bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1069621901d6SAlex Elder 
1070f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1071bb23e37aSAlex Elder 			goto out_2big;
1072bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1073bb23e37aSAlex Elder 		if (!snap_names)
1074602adf40SYehuda Sadeh 			goto out_err;
1075bb23e37aSAlex Elder 
1076bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
107788a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
107888a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
107988a25a5fSMarkus Elfring 					   GFP_KERNEL);
1080bb23e37aSAlex Elder 		if (!snap_sizes)
1081bb23e37aSAlex Elder 			goto out_err;
1082bb23e37aSAlex Elder 
1083f785cc1dSAlex Elder 		/*
1084bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1085bb23e37aSAlex Elder 		 * and size.
1086bb23e37aSAlex Elder 		 *
108799a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1088bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1089f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1090f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1091f785cc1dSAlex Elder 		 */
1092bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1093bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1094bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1095bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1096bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1097bb23e37aSAlex Elder 		}
1098602adf40SYehuda Sadeh 	}
1099849b4260SAlex Elder 
1100bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1101bb23e37aSAlex Elder 
1102bb23e37aSAlex Elder 	if (first_time) {
1103bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1104602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1105263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1106662518b1SAlex Elder 	} else {
1107662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1108662518b1SAlex Elder 		kfree(header->snap_names);
1109662518b1SAlex Elder 		kfree(header->snap_sizes);
1110bb23e37aSAlex Elder 	}
11116a52325fSAlex Elder 
1112bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1113621901d6SAlex Elder 
1114f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1115bb23e37aSAlex Elder 	header->snapc = snapc;
1116bb23e37aSAlex Elder 	header->snap_names = snap_names;
1117bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1118468521c1SAlex Elder 
1119602adf40SYehuda Sadeh 	return 0;
1120bb23e37aSAlex Elder out_2big:
1121bb23e37aSAlex Elder 	ret = -EIO;
11226a52325fSAlex Elder out_err:
1123bb23e37aSAlex Elder 	kfree(snap_sizes);
1124bb23e37aSAlex Elder 	kfree(snap_names);
1125bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1126bb23e37aSAlex Elder 	kfree(object_prefix);
1127ccece235SAlex Elder 
1128bb23e37aSAlex Elder 	return ret;
1129602adf40SYehuda Sadeh }
1130602adf40SYehuda Sadeh 
11319682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11329682fc6dSAlex Elder {
11339682fc6dSAlex Elder 	const char *snap_name;
11349682fc6dSAlex Elder 
11359682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11369682fc6dSAlex Elder 
11379682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11389682fc6dSAlex Elder 
11399682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11409682fc6dSAlex Elder 	while (which--)
11419682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11429682fc6dSAlex Elder 
11439682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11449682fc6dSAlex Elder }
11459682fc6dSAlex Elder 
114630d1cff8SAlex Elder /*
114730d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
114830d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
114930d1cff8SAlex Elder  */
115030d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
115130d1cff8SAlex Elder {
115230d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
115330d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
115430d1cff8SAlex Elder 
115530d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
115630d1cff8SAlex Elder 		return 1;
115730d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
115830d1cff8SAlex Elder }
115930d1cff8SAlex Elder 
116030d1cff8SAlex Elder /*
116130d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
116230d1cff8SAlex Elder  * present.
116330d1cff8SAlex Elder  *
116430d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
116530d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
116630d1cff8SAlex Elder  *
116730d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
116830d1cff8SAlex Elder  * reverse order, highest snapshot id first.
116930d1cff8SAlex Elder  */
11709682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11719682fc6dSAlex Elder {
11729682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
117330d1cff8SAlex Elder 	u64 *found;
11749682fc6dSAlex Elder 
117530d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
117630d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11779682fc6dSAlex Elder 
117830d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11799682fc6dSAlex Elder }
11809682fc6dSAlex Elder 
11812ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11822ad3d716SAlex Elder 					u64 snap_id)
118354cac61fSAlex Elder {
118454cac61fSAlex Elder 	u32 which;
1185da6a6b63SJosh Durgin 	const char *snap_name;
118654cac61fSAlex Elder 
118754cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
118854cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1189da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
119054cac61fSAlex Elder 
1191da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1192da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
119354cac61fSAlex Elder }
119454cac61fSAlex Elder 
11959e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11969e15b77dSAlex Elder {
11979e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11989e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11999e15b77dSAlex Elder 
120054cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
120154cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
120254cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
12039e15b77dSAlex Elder 
120454cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
12059e15b77dSAlex Elder }
12069e15b77dSAlex Elder 
12072ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
12082ad3d716SAlex Elder 				u64 *snap_size)
1209602adf40SYehuda Sadeh {
12102ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12112ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12122ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
12132ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12142ad3d716SAlex Elder 		u32 which;
121500f1f36fSAlex Elder 
12162ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
12172ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
12182ad3d716SAlex Elder 			return -ENOENT;
121900f1f36fSAlex Elder 
12202ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
12212ad3d716SAlex Elder 	} else {
12222ad3d716SAlex Elder 		u64 size = 0;
12232ad3d716SAlex Elder 		int ret;
12242ad3d716SAlex Elder 
12252ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
12262ad3d716SAlex Elder 		if (ret)
12272ad3d716SAlex Elder 			return ret;
12282ad3d716SAlex Elder 
12292ad3d716SAlex Elder 		*snap_size = size;
12302ad3d716SAlex Elder 	}
12312ad3d716SAlex Elder 	return 0;
12322ad3d716SAlex Elder }
12332ad3d716SAlex Elder 
12342ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12352ad3d716SAlex Elder 			u64 *snap_features)
12362ad3d716SAlex Elder {
12372ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12382ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12392ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12402ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12412ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12422ad3d716SAlex Elder 	} else {
12432ad3d716SAlex Elder 		u64 features = 0;
12442ad3d716SAlex Elder 		int ret;
12452ad3d716SAlex Elder 
12462ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12472ad3d716SAlex Elder 		if (ret)
12482ad3d716SAlex Elder 			return ret;
12492ad3d716SAlex Elder 
12502ad3d716SAlex Elder 		*snap_features = features;
12512ad3d716SAlex Elder 	}
12522ad3d716SAlex Elder 	return 0;
125300f1f36fSAlex Elder }
1254602adf40SYehuda Sadeh 
1255d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1256602adf40SYehuda Sadeh {
12578f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12582ad3d716SAlex Elder 	u64 size = 0;
12592ad3d716SAlex Elder 	u64 features = 0;
12602ad3d716SAlex Elder 	int ret;
12618b0241f8SAlex Elder 
12622ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12632ad3d716SAlex Elder 	if (ret)
12642ad3d716SAlex Elder 		return ret;
12652ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12662ad3d716SAlex Elder 	if (ret)
12672ad3d716SAlex Elder 		return ret;
12682ad3d716SAlex Elder 
12692ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12702ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12712ad3d716SAlex Elder 
12728b0241f8SAlex Elder 	return 0;
1273602adf40SYehuda Sadeh }
1274602adf40SYehuda Sadeh 
1275d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1276d1cf5788SAlex Elder {
1277d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1278d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1279200a6a8bSAlex Elder }
1280200a6a8bSAlex Elder 
12815359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv)
128265ccfe21SAlex Elder {
1283602adf40SYehuda Sadeh 	void *buf;
12845359a17dSIlya Dryomov 	unsigned long flags;
1285602adf40SYehuda Sadeh 
12865359a17dSIlya Dryomov 	buf = bvec_kmap_irq(bv, &flags);
12875359a17dSIlya Dryomov 	memset(buf, 0, bv->bv_len);
12885359a17dSIlya Dryomov 	flush_dcache_page(bv->bv_page);
128985b5aaa6SDan Carpenter 	bvec_kunmap_irq(buf, &flags);
1290602adf40SYehuda Sadeh }
1291602adf40SYehuda Sadeh 
12925359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1293b9434c5bSAlex Elder {
12945359a17dSIlya Dryomov 	struct ceph_bio_iter it = *bio_pos;
1295b9434c5bSAlex Elder 
12965359a17dSIlya Dryomov 	ceph_bio_iter_advance(&it, off);
12975359a17dSIlya Dryomov 	ceph_bio_iter_advance_step(&it, bytes, ({
12985359a17dSIlya Dryomov 		zero_bvec(&bv);
12995359a17dSIlya Dryomov 	}));
1300b9434c5bSAlex Elder }
1301b9434c5bSAlex Elder 
13027e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1303602adf40SYehuda Sadeh {
13047e07efb1SIlya Dryomov 	struct ceph_bvec_iter it = *bvec_pos;
1305602adf40SYehuda Sadeh 
13067e07efb1SIlya Dryomov 	ceph_bvec_iter_advance(&it, off);
13077e07efb1SIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
13087e07efb1SIlya Dryomov 		zero_bvec(&bv);
13097e07efb1SIlya Dryomov 	}));
1310602adf40SYehuda Sadeh }
1311602adf40SYehuda Sadeh 
1312f7760dadSAlex Elder /*
13133da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1314afb97888SIlya Dryomov  * (private) bio_vec array.
1315f7760dadSAlex Elder  *
13163da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1317f7760dadSAlex Elder  */
13183da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
13193da691bfSIlya Dryomov 			       u32 bytes)
1320f7760dadSAlex Elder {
1321ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
13223da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
13233da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
13243da691bfSIlya Dryomov 		break;
13253da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1326afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
13273da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
13283da691bfSIlya Dryomov 		break;
13293da691bfSIlya Dryomov 	default:
13303da691bfSIlya Dryomov 		rbd_assert(0);
1331f5400b7aSAlex Elder 	}
1332bf0d5f50SAlex Elder }
1333bf0d5f50SAlex Elder 
1334bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1335bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1336bf0d5f50SAlex Elder {
1337bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
133837206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
13392c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1340bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1341bf0d5f50SAlex Elder }
1342bf0d5f50SAlex Elder 
13430f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
13440f2d5be7SAlex Elder {
13450f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13462c935bc5SPeter Zijlstra 	     kref_read(&img_request->kref));
13470f2d5be7SAlex Elder 	kref_get(&img_request->kref);
13480f2d5be7SAlex Elder }
13490f2d5be7SAlex Elder 
1350bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1351bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1352bf0d5f50SAlex Elder {
1353bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
135437206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13552c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1356bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1357bf0d5f50SAlex Elder }
1358bf0d5f50SAlex Elder 
1359bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1360bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1361bf0d5f50SAlex Elder {
136225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
136325dcf954SAlex Elder 
1364b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1365bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
13667114edacSIlya Dryomov 	img_request->pending_count++;
136715961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1368bf0d5f50SAlex Elder }
1369bf0d5f50SAlex Elder 
1370bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1371bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1372bf0d5f50SAlex Elder {
137315961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
137443df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1375bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1376bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1377bf0d5f50SAlex Elder }
1378bf0d5f50SAlex Elder 
1379980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1380bf0d5f50SAlex Elder {
1381980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1382980917fcSIlya Dryomov 
1383a90bb0c1SIlya Dryomov 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
138443df3d35SIlya Dryomov 	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
138543df3d35SIlya Dryomov 	     obj_request->ex.oe_len, osd_req);
1386980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1387bf0d5f50SAlex Elder }
1388bf0d5f50SAlex Elder 
13890c425248SAlex Elder /*
13900c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
13910c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
13920c425248SAlex Elder  * and currently never change thereafter.
13930c425248SAlex Elder  */
1394d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1395d0b2e944SAlex Elder {
1396d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1397d0b2e944SAlex Elder 	smp_mb();
1398d0b2e944SAlex Elder }
1399d0b2e944SAlex Elder 
1400a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1401a2acd00eSAlex Elder {
1402a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1403a2acd00eSAlex Elder 	smp_mb();
1404a2acd00eSAlex Elder }
1405a2acd00eSAlex Elder 
1406d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1407d0b2e944SAlex Elder {
1408d0b2e944SAlex Elder 	smp_mb();
1409d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1410d0b2e944SAlex Elder }
1411d0b2e944SAlex Elder 
14123da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
14133b434a2aSJosh Durgin {
14143da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
14153da691bfSIlya Dryomov 
141643df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
141743df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
14183b434a2aSJosh Durgin }
14193b434a2aSJosh Durgin 
14203da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
14216e2a4505SAlex Elder {
14223da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1423b9434c5bSAlex Elder 
142443df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
14253da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
14266e2a4505SAlex Elder }
14276e2a4505SAlex Elder 
142813488d53SIlya Dryomov /*
142913488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
143013488d53SIlya Dryomov  */
143113488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
143213488d53SIlya Dryomov {
143313488d53SIlya Dryomov 	if (!obj_req->num_img_extents ||
143413488d53SIlya Dryomov 	    rbd_obj_is_entire(obj_req))
143513488d53SIlya Dryomov 		return false;
143613488d53SIlya Dryomov 
143713488d53SIlya Dryomov 	return true;
143813488d53SIlya Dryomov }
143913488d53SIlya Dryomov 
144086bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1441bf0d5f50SAlex Elder {
144286bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
144386bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1444bf0d5f50SAlex Elder }
1445bf0d5f50SAlex Elder 
14463da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
14470dcc685eSIlya Dryomov {
14489bb0248dSIlya Dryomov 	switch (img_req->op_type) {
14493da691bfSIlya Dryomov 	case OBJ_OP_READ:
14503da691bfSIlya Dryomov 		return false;
14513da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
14523da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
14536484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
14543da691bfSIlya Dryomov 		return true;
14553da691bfSIlya Dryomov 	default:
1456c6244b3bSArnd Bergmann 		BUG();
14570dcc685eSIlya Dryomov 	}
14580dcc685eSIlya Dryomov }
14590dcc685eSIlya Dryomov 
14603da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
14612761713dSIlya Dryomov 
146285e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1463bf0d5f50SAlex Elder {
14643da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1465bf0d5f50SAlex Elder 
14663da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
14673da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
14683da691bfSIlya Dryomov 	rbd_assert(osd_req == obj_req->osd_req);
1469bf0d5f50SAlex Elder 
14703da691bfSIlya Dryomov 	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
14713da691bfSIlya Dryomov 	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
14723da691bfSIlya Dryomov 		obj_req->xferred = osd_req->r_result;
14733da691bfSIlya Dryomov 	else
1474c47f9371SAlex Elder 		/*
14753da691bfSIlya Dryomov 		 * Writes aren't allowed to return a data payload.  In some
14763da691bfSIlya Dryomov 		 * guarded write cases (e.g. stat + zero on an empty object)
14773da691bfSIlya Dryomov 		 * a stat response makes it through, but we don't care.
1478c47f9371SAlex Elder 		 */
14793da691bfSIlya Dryomov 		obj_req->xferred = 0;
14800ccd5926SIlya Dryomov 
14813da691bfSIlya Dryomov 	rbd_obj_handle_request(obj_req);
1482bf0d5f50SAlex Elder }
1483bf0d5f50SAlex Elder 
14849d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1485430c28c3SAlex Elder {
14868c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1487430c28c3SAlex Elder 
1488a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
14897c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
14909d4df01fSAlex Elder }
14919d4df01fSAlex Elder 
14929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
14939d4df01fSAlex Elder {
14949d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
14959d4df01fSAlex Elder 
1496a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1497fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
149843df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1499430c28c3SAlex Elder }
1500430c28c3SAlex Elder 
1501bc81207eSIlya Dryomov static struct ceph_osd_request *
1502e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req,
1503e28eded5SIlya Dryomov 		     struct ceph_snap_context *snapc, unsigned int num_ops)
1504bc81207eSIlya Dryomov {
1505e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1506bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1507bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1508a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1509a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1510bc81207eSIlya Dryomov 
1511e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1512bc81207eSIlya Dryomov 	if (!req)
1513bc81207eSIlya Dryomov 		return NULL;
1514bc81207eSIlya Dryomov 
1515bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1516a162b308SIlya Dryomov 	req->r_priv = obj_req;
1517bc81207eSIlya Dryomov 
1518b26c047bSIlya Dryomov 	/*
1519b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1520b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1521b26c047bSIlya Dryomov 	 */
1522b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1523bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1524b26c047bSIlya Dryomov 
1525a90bb0c1SIlya Dryomov 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
152643df3d35SIlya Dryomov 			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
1527bc81207eSIlya Dryomov 		goto err_req;
1528bc81207eSIlya Dryomov 
1529bc81207eSIlya Dryomov 	return req;
1530bc81207eSIlya Dryomov 
1531bc81207eSIlya Dryomov err_req:
1532bc81207eSIlya Dryomov 	ceph_osdc_put_request(req);
1533bc81207eSIlya Dryomov 	return NULL;
1534bc81207eSIlya Dryomov }
1535bc81207eSIlya Dryomov 
1536e28eded5SIlya Dryomov static struct ceph_osd_request *
1537e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
1538e28eded5SIlya Dryomov {
1539e28eded5SIlya Dryomov 	return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
1540e28eded5SIlya Dryomov 				    num_ops);
1541e28eded5SIlya Dryomov }
1542e28eded5SIlya Dryomov 
/* Release @osd_req by dropping a reference via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1547bf0d5f50SAlex Elder 
1548ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1549bf0d5f50SAlex Elder {
1550bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1551bf0d5f50SAlex Elder 
15525a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
15536c696d85SIlya Dryomov 	if (!obj_request)
1554f907ad55SAlex Elder 		return NULL;
1555f907ad55SAlex Elder 
155643df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1557bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1558bf0d5f50SAlex Elder 
155967e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1560bf0d5f50SAlex Elder 	return obj_request;
1561bf0d5f50SAlex Elder }
1562bf0d5f50SAlex Elder 
1563bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1564bf0d5f50SAlex Elder {
1565bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
15667e07efb1SIlya Dryomov 	u32 i;
1567bf0d5f50SAlex Elder 
1568bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1569bf0d5f50SAlex Elder 
157037206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
157137206ee5SAlex Elder 
1572bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1573bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1574bf0d5f50SAlex Elder 
1575ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
15769969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1577bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
15787e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
15795359a17dSIlya Dryomov 		break;		/* Nothing to do */
1580afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1581afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1582bf0d5f50SAlex Elder 		break;
15837e07efb1SIlya Dryomov 	default:
15847e07efb1SIlya Dryomov 		rbd_assert(0);
1585bf0d5f50SAlex Elder 	}
1586bf0d5f50SAlex Elder 
158786bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
15887e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
15897e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
15907e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
15917e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
15927e07efb1SIlya Dryomov 		}
15937e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1594bf0d5f50SAlex Elder 	}
1595bf0d5f50SAlex Elder 
1596868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1597bf0d5f50SAlex Elder }
1598bf0d5f50SAlex Elder 
1599fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1600fb65d228SAlex Elder 
1601fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1602fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1603fb65d228SAlex Elder {
1604fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1605fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1606fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1607fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1608fb65d228SAlex Elder }
1609fb65d228SAlex Elder 
1610bf0d5f50SAlex Elder /*
1611a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1612a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1613a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1614a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1615a2acd00eSAlex Elder  */
1616a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1617a2acd00eSAlex Elder {
1618a2acd00eSAlex Elder 	int counter;
1619a2acd00eSAlex Elder 
1620a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1621a2acd00eSAlex Elder 		return;
1622a2acd00eSAlex Elder 
1623a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1624a2acd00eSAlex Elder 	if (counter > 0)
1625a2acd00eSAlex Elder 		return;
1626a2acd00eSAlex Elder 
1627a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1628a2acd00eSAlex Elder 
1629a2acd00eSAlex Elder 	if (!counter)
1630a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1631a2acd00eSAlex Elder 	else
16329584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1633a2acd00eSAlex Elder }
1634a2acd00eSAlex Elder 
1635a2acd00eSAlex Elder /*
1636a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1637a2acd00eSAlex Elder  * parent.
1638a2acd00eSAlex Elder  *
1639a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1640a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1641a2acd00eSAlex Elder  * false otherwise.
1642a2acd00eSAlex Elder  */
1643a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1644a2acd00eSAlex Elder {
1645ae43e9d0SIlya Dryomov 	int counter = 0;
1646a2acd00eSAlex Elder 
1647a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1648a2acd00eSAlex Elder 		return false;
1649a2acd00eSAlex Elder 
1650ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
1651ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1652a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1653ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
1654a2acd00eSAlex Elder 
1655a2acd00eSAlex Elder 	if (counter < 0)
16569584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1657a2acd00eSAlex Elder 
1658ae43e9d0SIlya Dryomov 	return counter > 0;
1659a2acd00eSAlex Elder }
1660a2acd00eSAlex Elder 
1661bf0d5f50SAlex Elder /*
1662bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1663bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1664bf0d5f50SAlex Elder  * (if there is one).
1665bf0d5f50SAlex Elder  */
1666cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1667cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
16686d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
16694e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
1670bf0d5f50SAlex Elder {
1671bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1672bf0d5f50SAlex Elder 
1673a0c5895bSIlya Dryomov 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1674bf0d5f50SAlex Elder 	if (!img_request)
1675bf0d5f50SAlex Elder 		return NULL;
1676bf0d5f50SAlex Elder 
1677bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
16789bb0248dSIlya Dryomov 	img_request->op_type = op_type;
16799bb0248dSIlya Dryomov 	if (!rbd_img_is_write(img_request))
1680bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
16819bb0248dSIlya Dryomov 	else
16829bb0248dSIlya Dryomov 		img_request->snapc = snapc;
16839bb0248dSIlya Dryomov 
1684a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1685d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1686a0c5895bSIlya Dryomov 
1687bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
168843df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
1689bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1690bf0d5f50SAlex Elder 
1691dfd9875fSIlya Dryomov 	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1692dfd9875fSIlya Dryomov 	     obj_op_name(op_type), img_request);
1693bf0d5f50SAlex Elder 	return img_request;
1694bf0d5f50SAlex Elder }
1695bf0d5f50SAlex Elder 
1696bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1697bf0d5f50SAlex Elder {
1698bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1699bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1700bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1701bf0d5f50SAlex Elder 
1702bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1703bf0d5f50SAlex Elder 
170437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
170537206ee5SAlex Elder 
1706bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1707bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1708bf0d5f50SAlex Elder 
1709a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
1710a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
1711a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1712a2acd00eSAlex Elder 	}
1713a2acd00eSAlex Elder 
17149bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1715812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1716bf0d5f50SAlex Elder 
17171c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1718bf0d5f50SAlex Elder }
1719bf0d5f50SAlex Elder 
172086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents,
172186bd7998SIlya Dryomov 			  u32 *num_img_extents, u64 overlap)
1722e93f3152SAlex Elder {
172386bd7998SIlya Dryomov 	u32 cnt = *num_img_extents;
1724e93f3152SAlex Elder 
172586bd7998SIlya Dryomov 	/* drop extents completely beyond the overlap */
172686bd7998SIlya Dryomov 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
172786bd7998SIlya Dryomov 		cnt--;
1728e93f3152SAlex Elder 
172986bd7998SIlya Dryomov 	if (cnt) {
173086bd7998SIlya Dryomov 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
1731e93f3152SAlex Elder 
173286bd7998SIlya Dryomov 		/* trim final overlapping extent */
173386bd7998SIlya Dryomov 		if (ex->fe_off + ex->fe_len > overlap)
173486bd7998SIlya Dryomov 			ex->fe_len = overlap - ex->fe_off;
1735e93f3152SAlex Elder 	}
1736e93f3152SAlex Elder 
173786bd7998SIlya Dryomov 	*num_img_extents = cnt;
173886bd7998SIlya Dryomov }
173986bd7998SIlya Dryomov 
174086bd7998SIlya Dryomov /*
174186bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
174286bd7998SIlya Dryomov  * or the entire object in the parent image.
174386bd7998SIlya Dryomov  */
174486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
174586bd7998SIlya Dryomov 				    bool entire)
1746e93f3152SAlex Elder {
174786bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1748c5b5ef6cSAlex Elder 	int ret;
1749c5b5ef6cSAlex Elder 
175086bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
175186bd7998SIlya Dryomov 		return 0;
175286bd7998SIlya Dryomov 
175386bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
175486bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
175586bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
175686bd7998SIlya Dryomov 							obj_req->ex.oe_len,
175786bd7998SIlya Dryomov 				  &obj_req->img_extents,
175886bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
175986bd7998SIlya Dryomov 	if (ret)
176086bd7998SIlya Dryomov 		return ret;
176186bd7998SIlya Dryomov 
176286bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
176386bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
176486bd7998SIlya Dryomov 	return 0;
176586bd7998SIlya Dryomov }
176686bd7998SIlya Dryomov 
17673da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
17683da691bfSIlya Dryomov {
1769ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
17703da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
17713da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
17723da691bfSIlya Dryomov 					       &obj_req->bio_pos,
177343df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
17743da691bfSIlya Dryomov 		break;
17753da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1776afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
17773da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
177843df3d35SIlya Dryomov 							obj_req->ex.oe_len);
1779afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
17803da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
17813da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
17823da691bfSIlya Dryomov 		break;
17833da691bfSIlya Dryomov 	default:
17843da691bfSIlya Dryomov 		rbd_assert(0);
17853da691bfSIlya Dryomov 	}
17863da691bfSIlya Dryomov }
17873da691bfSIlya Dryomov 
17883da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
17893da691bfSIlya Dryomov {
1790e28eded5SIlya Dryomov 	obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
17913da691bfSIlya Dryomov 	if (!obj_req->osd_req)
1792710214e3SIlya Dryomov 		return -ENOMEM;
1793710214e3SIlya Dryomov 
17943da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
179543df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
17963da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, 0);
1797a90bb0c1SIlya Dryomov 
17983da691bfSIlya Dryomov 	rbd_osd_req_format_read(obj_req);
17993da691bfSIlya Dryomov 	return 0;
1800710214e3SIlya Dryomov }
1801710214e3SIlya Dryomov 
18023da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
18033da691bfSIlya Dryomov 				unsigned int which)
18043da691bfSIlya Dryomov {
18053da691bfSIlya Dryomov 	struct page **pages;
18063da691bfSIlya Dryomov 
1807c5b5ef6cSAlex Elder 	/*
1808c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
1809c5b5ef6cSAlex Elder 	 *     le64 length;
1810c5b5ef6cSAlex Elder 	 *     struct {
1811c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
1812c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
1813c5b5ef6cSAlex Elder 	 *     } mtime;
1814c5b5ef6cSAlex Elder 	 */
18153da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
18163da691bfSIlya Dryomov 	if (IS_ERR(pages))
18173da691bfSIlya Dryomov 		return PTR_ERR(pages);
18183da691bfSIlya Dryomov 
18193da691bfSIlya Dryomov 	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
18203da691bfSIlya Dryomov 	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
18213da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
18223da691bfSIlya Dryomov 				     0, false, true);
18233da691bfSIlya Dryomov 	return 0;
1824710214e3SIlya Dryomov }
1825c5b5ef6cSAlex Elder 
/*
 * Number of OSD ops that __rbd_obj_setup_write() fills in.  @obj_req
 * is not consulted; the count is always the same.
 */
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	int num_osd_ops = 2; /* setallochint + write/writefull */

	return num_osd_ops;
}
183013488d53SIlya Dryomov 
18313da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
18323da691bfSIlya Dryomov 				  unsigned int which)
18333da691bfSIlya Dryomov {
18343da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
18353da691bfSIlya Dryomov 	u16 opcode;
1836c5b5ef6cSAlex Elder 
18373da691bfSIlya Dryomov 	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
18383da691bfSIlya Dryomov 				   rbd_dev->layout.object_size,
18393da691bfSIlya Dryomov 				   rbd_dev->layout.object_size);
1840c5b5ef6cSAlex Elder 
18413da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
18423da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
18433da691bfSIlya Dryomov 	else
18443da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
1845c5b5ef6cSAlex Elder 
18463da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
184743df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
18483da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, which++);
18493da691bfSIlya Dryomov 
18503da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
18513da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
18523da691bfSIlya Dryomov }
18533da691bfSIlya Dryomov 
18543da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
18553da691bfSIlya Dryomov {
18563da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
185713488d53SIlya Dryomov 	bool need_guard;
18583da691bfSIlya Dryomov 	int ret;
18593da691bfSIlya Dryomov 
186086bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
186186bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
186286bd7998SIlya Dryomov 	if (ret)
186386bd7998SIlya Dryomov 		return ret;
186486bd7998SIlya Dryomov 
186513488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
186613488d53SIlya Dryomov 	num_osd_ops = need_guard + count_write_ops(obj_req);
18673da691bfSIlya Dryomov 
1868a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
18693da691bfSIlya Dryomov 	if (!obj_req->osd_req)
18703da691bfSIlya Dryomov 		return -ENOMEM;
18713da691bfSIlya Dryomov 
187213488d53SIlya Dryomov 	if (need_guard) {
18733da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
18743da691bfSIlya Dryomov 		if (ret)
1875c5b5ef6cSAlex Elder 			return ret;
187613488d53SIlya Dryomov 
187713488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
187813488d53SIlya Dryomov 	} else {
187913488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1880c5b5ef6cSAlex Elder 	}
1881c5b5ef6cSAlex Elder 
18823da691bfSIlya Dryomov 	__rbd_obj_setup_write(obj_req, which);
18833da691bfSIlya Dryomov 	return 0;
188470d045f6SIlya Dryomov }
188570d045f6SIlya Dryomov 
18866484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
18876484cbe9SIlya Dryomov {
18886484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
18896484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
18906484cbe9SIlya Dryomov }
18916484cbe9SIlya Dryomov 
18926484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
18936484cbe9SIlya Dryomov {
18940c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
18950c93e1b7SIlya Dryomov 	u64 off = obj_req->ex.oe_off;
18960c93e1b7SIlya Dryomov 	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
18976484cbe9SIlya Dryomov 	int ret;
18986484cbe9SIlya Dryomov 
18990c93e1b7SIlya Dryomov 	/*
19000c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
19010c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
19020c93e1b7SIlya Dryomov 	 *
19030c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
19040c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
19050c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
19060c93e1b7SIlya Dryomov 	 */
19070c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
19080c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
19090c93e1b7SIlya Dryomov 		off = round_up(off, rbd_dev->opts->alloc_size);
19100c93e1b7SIlya Dryomov 		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
19110c93e1b7SIlya Dryomov 		if (off >= next_off)
19120c93e1b7SIlya Dryomov 			return 1;
19130c93e1b7SIlya Dryomov 	}
19140c93e1b7SIlya Dryomov 
19156484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
19166484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
19176484cbe9SIlya Dryomov 	if (ret)
19186484cbe9SIlya Dryomov 		return ret;
19196484cbe9SIlya Dryomov 
19206484cbe9SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
19216484cbe9SIlya Dryomov 	if (!obj_req->osd_req)
19226484cbe9SIlya Dryomov 		return -ENOMEM;
19236484cbe9SIlya Dryomov 
19246484cbe9SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
19256484cbe9SIlya Dryomov 		osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
19266484cbe9SIlya Dryomov 	} else {
19270c93e1b7SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
19280c93e1b7SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
19290c93e1b7SIlya Dryomov 		     off, next_off - off);
19306484cbe9SIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, 0,
19316484cbe9SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
19320c93e1b7SIlya Dryomov 				       off, next_off - off, 0, 0);
19336484cbe9SIlya Dryomov 	}
19346484cbe9SIlya Dryomov 
19356484cbe9SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_FLAT;
19366484cbe9SIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19376484cbe9SIlya Dryomov 	return 0;
19386484cbe9SIlya Dryomov }
19396484cbe9SIlya Dryomov 
194013488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req)
194113488d53SIlya Dryomov {
194213488d53SIlya Dryomov 	int num_osd_ops;
194313488d53SIlya Dryomov 
194413488d53SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents)
194513488d53SIlya Dryomov 		num_osd_ops = 2; /* create + truncate */
194613488d53SIlya Dryomov 	else
194713488d53SIlya Dryomov 		num_osd_ops = 1; /* delete/truncate/zero */
194813488d53SIlya Dryomov 
194913488d53SIlya Dryomov 	return num_osd_ops;
195013488d53SIlya Dryomov }
195113488d53SIlya Dryomov 
19526484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
19533da691bfSIlya Dryomov 				    unsigned int which)
195470d045f6SIlya Dryomov {
19553da691bfSIlya Dryomov 	u16 opcode;
1956058aa991SIlya Dryomov 
19573da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
195886bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
19592bb1e56eSIlya Dryomov 			osd_req_op_init(obj_req->osd_req, which++,
19602bb1e56eSIlya Dryomov 					CEPH_OSD_OP_CREATE, 0);
19613da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
19623da691bfSIlya Dryomov 		} else {
19633da691bfSIlya Dryomov 			osd_req_op_init(obj_req->osd_req, which++,
19643da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
19653da691bfSIlya Dryomov 			opcode = 0;
19663da691bfSIlya Dryomov 		}
19673da691bfSIlya Dryomov 	} else {
19686484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
19693da691bfSIlya Dryomov 	}
19703da691bfSIlya Dryomov 
19713da691bfSIlya Dryomov 	if (opcode)
19723da691bfSIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
197343df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
19743da691bfSIlya Dryomov 				       0, 0);
19753da691bfSIlya Dryomov 
19763da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
19773da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19783da691bfSIlya Dryomov }
19793da691bfSIlya Dryomov 
19806484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
19813da691bfSIlya Dryomov {
19823da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
198313488d53SIlya Dryomov 	bool need_guard;
19843da691bfSIlya Dryomov 	int ret;
19853da691bfSIlya Dryomov 
198686bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
198786bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
198886bd7998SIlya Dryomov 	if (ret)
198986bd7998SIlya Dryomov 		return ret;
199086bd7998SIlya Dryomov 
199113488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
199213488d53SIlya Dryomov 	num_osd_ops = need_guard + count_zeroout_ops(obj_req);
19933da691bfSIlya Dryomov 
1994a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
19953da691bfSIlya Dryomov 	if (!obj_req->osd_req)
19963da691bfSIlya Dryomov 		return -ENOMEM;
19973da691bfSIlya Dryomov 
199813488d53SIlya Dryomov 	if (need_guard) {
19993da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
20003da691bfSIlya Dryomov 		if (ret)
20013da691bfSIlya Dryomov 			return ret;
200213488d53SIlya Dryomov 
200313488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
200413488d53SIlya Dryomov 	} else {
200513488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
20063da691bfSIlya Dryomov 	}
20073da691bfSIlya Dryomov 
20086484cbe9SIlya Dryomov 	__rbd_obj_setup_zeroout(obj_req, which);
2009980917fcSIlya Dryomov 	return 0;
2010b454e36dSAlex Elder }
2011b454e36dSAlex Elder 
2012b454e36dSAlex Elder /*
20133da691bfSIlya Dryomov  * For each object request in @img_req, allocate an OSD request, add
20143da691bfSIlya Dryomov  * individual OSD ops and prepare them for submission.  The number of
20153da691bfSIlya Dryomov  * OSD ops depends on op_type and the overlap point (if any).
2016b454e36dSAlex Elder  */
20173da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
20183da691bfSIlya Dryomov {
20190c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
20203da691bfSIlya Dryomov 	int ret;
20213d7efd18SAlex Elder 
20220c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
20239bb0248dSIlya Dryomov 		switch (img_req->op_type) {
20243da691bfSIlya Dryomov 		case OBJ_OP_READ:
20253da691bfSIlya Dryomov 			ret = rbd_obj_setup_read(obj_req);
20263da691bfSIlya Dryomov 			break;
20273da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
20283da691bfSIlya Dryomov 			ret = rbd_obj_setup_write(obj_req);
20293da691bfSIlya Dryomov 			break;
20303da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
20313da691bfSIlya Dryomov 			ret = rbd_obj_setup_discard(obj_req);
20323da691bfSIlya Dryomov 			break;
20336484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
20346484cbe9SIlya Dryomov 			ret = rbd_obj_setup_zeroout(obj_req);
20356484cbe9SIlya Dryomov 			break;
20363da691bfSIlya Dryomov 		default:
20373da691bfSIlya Dryomov 			rbd_assert(0);
20383da691bfSIlya Dryomov 		}
20390c93e1b7SIlya Dryomov 		if (ret < 0)
20403da691bfSIlya Dryomov 			return ret;
20410c93e1b7SIlya Dryomov 		if (ret > 0) {
20420c93e1b7SIlya Dryomov 			img_req->xferred += obj_req->ex.oe_len;
20430c93e1b7SIlya Dryomov 			img_req->pending_count--;
20440c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
20450c93e1b7SIlya Dryomov 			continue;
20460c93e1b7SIlya Dryomov 		}
204726f887e0SIlya Dryomov 
204826f887e0SIlya Dryomov 		ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
204926f887e0SIlya Dryomov 		if (ret)
205026f887e0SIlya Dryomov 			return ret;
2051b454e36dSAlex Elder 	}
2052b454e36dSAlex Elder 
20533da691bfSIlya Dryomov 	return 0;
20543da691bfSIlya Dryomov }
20553da691bfSIlya Dryomov 
20565a237819SIlya Dryomov union rbd_img_fill_iter {
20575a237819SIlya Dryomov 	struct ceph_bio_iter	bio_iter;
20585a237819SIlya Dryomov 	struct ceph_bvec_iter	bvec_iter;
20595a237819SIlya Dryomov };
20605a237819SIlya Dryomov 
20615a237819SIlya Dryomov struct rbd_img_fill_ctx {
20625a237819SIlya Dryomov 	enum obj_request_type	pos_type;
20635a237819SIlya Dryomov 	union rbd_img_fill_iter	*pos;
20645a237819SIlya Dryomov 	union rbd_img_fill_iter	iter;
20655a237819SIlya Dryomov 	ceph_object_extent_fn_t	set_pos_fn;
2066afb97888SIlya Dryomov 	ceph_object_extent_fn_t	count_fn;
2067afb97888SIlya Dryomov 	ceph_object_extent_fn_t	copy_fn;
20685a237819SIlya Dryomov };
20695a237819SIlya Dryomov 
20705a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
20715a237819SIlya Dryomov {
20725a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
20735a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
20745a237819SIlya Dryomov 
20755a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
20765a237819SIlya Dryomov 	if (!obj_req)
20775a237819SIlya Dryomov 		return NULL;
20785a237819SIlya Dryomov 
20795a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
20805a237819SIlya Dryomov 	return &obj_req->ex;
20815a237819SIlya Dryomov }
20825a237819SIlya Dryomov 
20835a237819SIlya Dryomov /*
2084afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2085afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2086afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2087afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2088afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
20895a237819SIlya Dryomov  */
2090afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2091afb97888SIlya Dryomov {
2092afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2093afb97888SIlya Dryomov }
2094afb97888SIlya Dryomov 
2095afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
20965a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
20975a237819SIlya Dryomov 				       u32 num_img_extents,
20985a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
20995a237819SIlya Dryomov {
21005a237819SIlya Dryomov 	u32 i;
21015a237819SIlya Dryomov 	int ret;
21025a237819SIlya Dryomov 
21035a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
21045a237819SIlya Dryomov 
21055a237819SIlya Dryomov 	/*
21065a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
21075a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
21085a237819SIlya Dryomov 	 */
21095a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
21105a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
21115a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
21125a237819SIlya Dryomov 					   img_extents[i].fe_off,
21135a237819SIlya Dryomov 					   img_extents[i].fe_len,
21145a237819SIlya Dryomov 					   &img_req->object_extents,
21155a237819SIlya Dryomov 					   alloc_object_extent, img_req,
21165a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
21175a237819SIlya Dryomov 		if (ret)
21185a237819SIlya Dryomov 			return ret;
21195a237819SIlya Dryomov 	}
21205a237819SIlya Dryomov 
21215a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
21225a237819SIlya Dryomov }
21235a237819SIlya Dryomov 
2124afb97888SIlya Dryomov /*
2125afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2126afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2127afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2128afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2129afb97888SIlya Dryomov  * @fctx->pos data buffer.
2130afb97888SIlya Dryomov  *
2131afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2132afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2133afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2134afb97888SIlya Dryomov  *
2135afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
2136afb97888SIlya Dryomov  */
2137afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2138afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2139afb97888SIlya Dryomov 				u32 num_img_extents,
2140afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2141afb97888SIlya Dryomov {
2142afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2143afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2144afb97888SIlya Dryomov 	u32 i;
2145afb97888SIlya Dryomov 	int ret;
2146afb97888SIlya Dryomov 
2147afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2148afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2149afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2150afb97888SIlya Dryomov 						   num_img_extents, fctx);
2151afb97888SIlya Dryomov 
2152afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2153afb97888SIlya Dryomov 
2154afb97888SIlya Dryomov 	/*
2155afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2156afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2157afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2158afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2159afb97888SIlya Dryomov 	 * stripe unit boundaries.
2160afb97888SIlya Dryomov 	 */
2161afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2162afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2163afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2164afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2165afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2166afb97888SIlya Dryomov 					   &img_req->object_extents,
2167afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2168afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2169afb97888SIlya Dryomov 		if (ret)
2170afb97888SIlya Dryomov 			return ret;
2171afb97888SIlya Dryomov 	}
2172afb97888SIlya Dryomov 
2173afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2174afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2175afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2176afb97888SIlya Dryomov 					      GFP_NOIO);
2177afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2178afb97888SIlya Dryomov 			return -ENOMEM;
2179afb97888SIlya Dryomov 	}
2180afb97888SIlya Dryomov 
2181afb97888SIlya Dryomov 	/*
2182afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2183afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2184afb97888SIlya Dryomov 	 */
2185afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2186afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2187afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2188afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2189afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2190afb97888SIlya Dryomov 					   &img_req->object_extents,
2191afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2192afb97888SIlya Dryomov 		if (ret)
2193afb97888SIlya Dryomov 			return ret;
2194afb97888SIlya Dryomov 	}
2195afb97888SIlya Dryomov 
2196afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2197afb97888SIlya Dryomov }
2198afb97888SIlya Dryomov 
21995a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
22005a237819SIlya Dryomov 			       u64 off, u64 len)
22015a237819SIlya Dryomov {
22025a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22035a237819SIlya Dryomov 	union rbd_img_fill_iter dummy;
22045a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22055a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
22065a237819SIlya Dryomov 		.pos = &dummy,
22075a237819SIlya Dryomov 	};
22085a237819SIlya Dryomov 
22095a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
22105a237819SIlya Dryomov }
22115a237819SIlya Dryomov 
22125a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22135a237819SIlya Dryomov {
22145a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22155a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22165a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
22175a237819SIlya Dryomov 
22185a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
22195a237819SIlya Dryomov 	obj_req->bio_pos = *it;
22205a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
22215a237819SIlya Dryomov }
22225a237819SIlya Dryomov 
2223afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2224afb97888SIlya Dryomov {
2225afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2226afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2227afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2228afb97888SIlya Dryomov 
2229afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2230afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2231afb97888SIlya Dryomov 		obj_req->bvec_count++;
2232afb97888SIlya Dryomov 	}));
2233afb97888SIlya Dryomov 
2234afb97888SIlya Dryomov }
2235afb97888SIlya Dryomov 
2236afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2237afb97888SIlya Dryomov {
2238afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2239afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2240afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2241afb97888SIlya Dryomov 
2242afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2243afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2244afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2245afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2246afb97888SIlya Dryomov 	}));
2247afb97888SIlya Dryomov }
2248afb97888SIlya Dryomov 
22495a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22505a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
22515a237819SIlya Dryomov 				   u32 num_img_extents,
22525a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
22535a237819SIlya Dryomov {
22545a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22555a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
22565a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
22575a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2258afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2259afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
22605a237819SIlya Dryomov 	};
22615a237819SIlya Dryomov 
22625a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
22635a237819SIlya Dryomov 				    &fctx);
22645a237819SIlya Dryomov }
22655a237819SIlya Dryomov 
22665a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22675a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
22685a237819SIlya Dryomov {
22695a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22705a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
22715a237819SIlya Dryomov 
22725a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
22735a237819SIlya Dryomov }
22745a237819SIlya Dryomov 
22755a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22765a237819SIlya Dryomov {
22775a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22785a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22795a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
22805a237819SIlya Dryomov 
22815a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
22825a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
22835a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
22845a237819SIlya Dryomov }
22855a237819SIlya Dryomov 
2286afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2287afb97888SIlya Dryomov {
2288afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2289afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2290afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2291afb97888SIlya Dryomov 
2292afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2293afb97888SIlya Dryomov 		obj_req->bvec_count++;
2294afb97888SIlya Dryomov 	}));
2295afb97888SIlya Dryomov }
2296afb97888SIlya Dryomov 
2297afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2298afb97888SIlya Dryomov {
2299afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2300afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2301afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2302afb97888SIlya Dryomov 
2303afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2304afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2305afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2306afb97888SIlya Dryomov 	}));
2307afb97888SIlya Dryomov }
2308afb97888SIlya Dryomov 
23095a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23105a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
23115a237819SIlya Dryomov 				     u32 num_img_extents,
23125a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
23135a237819SIlya Dryomov {
23145a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
23155a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
23165a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
23175a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2318afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2319afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
23205a237819SIlya Dryomov 	};
23215a237819SIlya Dryomov 
23225a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
23235a237819SIlya Dryomov 				    &fctx);
23245a237819SIlya Dryomov }
23255a237819SIlya Dryomov 
23265a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23275a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
23285a237819SIlya Dryomov 				   u32 num_img_extents,
23295a237819SIlya Dryomov 				   struct bio_vec *bvecs)
23305a237819SIlya Dryomov {
23315a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
23325a237819SIlya Dryomov 		.bvecs = bvecs,
23335a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
23345a237819SIlya Dryomov 							     num_img_extents) },
23355a237819SIlya Dryomov 	};
23365a237819SIlya Dryomov 
23375a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
23385a237819SIlya Dryomov 					 &it);
23395a237819SIlya Dryomov }
23405a237819SIlya Dryomov 
2341efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request)
2342bf0d5f50SAlex Elder {
2343bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2344bf0d5f50SAlex Elder 
234537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2346bf0d5f50SAlex Elder 
2347663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2348efbd1a11SIlya Dryomov 	for_each_obj_request(img_request, obj_request)
23493da691bfSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2350bf0d5f50SAlex Elder 
2351663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2352bf0d5f50SAlex Elder }
2353bf0d5f50SAlex Elder 
235486bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
23553da691bfSIlya Dryomov {
23563da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
23573da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
23583da691bfSIlya Dryomov 	int ret;
23593da691bfSIlya Dryomov 
2360e93aca0aSIlya Dryomov 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2361e93aca0aSIlya Dryomov 					       OBJ_OP_READ, NULL);
23623da691bfSIlya Dryomov 	if (!child_img_req)
23633da691bfSIlya Dryomov 		return -ENOMEM;
23643da691bfSIlya Dryomov 
2365e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2366e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2367e93aca0aSIlya Dryomov 
23683da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2369ecc633caSIlya Dryomov 		switch (img_req->data_type) {
23703da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
23715a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
23725a237819SIlya Dryomov 						      obj_req->img_extents,
23735a237819SIlya Dryomov 						      obj_req->num_img_extents,
23743da691bfSIlya Dryomov 						      &obj_req->bio_pos);
23753da691bfSIlya Dryomov 			break;
23763da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2377afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
23785a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
23795a237819SIlya Dryomov 						      obj_req->img_extents,
23805a237819SIlya Dryomov 						      obj_req->num_img_extents,
23813da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
23823da691bfSIlya Dryomov 			break;
23833da691bfSIlya Dryomov 		default:
23843da691bfSIlya Dryomov 			rbd_assert(0);
23853da691bfSIlya Dryomov 		}
23863da691bfSIlya Dryomov 	} else {
23875a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
23885a237819SIlya Dryomov 					      obj_req->img_extents,
23895a237819SIlya Dryomov 					      obj_req->num_img_extents,
23905a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
23913da691bfSIlya Dryomov 	}
23923da691bfSIlya Dryomov 	if (ret) {
23933da691bfSIlya Dryomov 		rbd_img_request_put(child_img_req);
2394663ae2ccSIlya Dryomov 		return ret;
2395bf0d5f50SAlex Elder 	}
2396bf0d5f50SAlex Elder 
23973da691bfSIlya Dryomov 	rbd_img_request_submit(child_img_req);
23983da691bfSIlya Dryomov 	return 0;
23993da691bfSIlya Dryomov }
24003da691bfSIlya Dryomov 
24013da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
24028b3e1a56SAlex Elder {
24033da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
24043da691bfSIlya Dryomov 	int ret;
24058b3e1a56SAlex Elder 
24063da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT &&
240786bd7998SIlya Dryomov 	    rbd_dev->parent_overlap && !obj_req->tried_parent) {
240886bd7998SIlya Dryomov 		/* reverse map this object extent onto the parent */
240986bd7998SIlya Dryomov 		ret = rbd_obj_calc_img_extents(obj_req, false);
241086bd7998SIlya Dryomov 		if (ret) {
241186bd7998SIlya Dryomov 			obj_req->result = ret;
241286bd7998SIlya Dryomov 			return true;
241386bd7998SIlya Dryomov 		}
24148b3e1a56SAlex Elder 
241586bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
24163da691bfSIlya Dryomov 			obj_req->tried_parent = true;
241786bd7998SIlya Dryomov 			ret = rbd_obj_read_from_parent(obj_req);
24183da691bfSIlya Dryomov 			if (ret) {
24193da691bfSIlya Dryomov 				obj_req->result = ret;
24203da691bfSIlya Dryomov 				return true;
24213da691bfSIlya Dryomov 			}
24223da691bfSIlya Dryomov 			return false;
24233da691bfSIlya Dryomov 		}
242486bd7998SIlya Dryomov 	}
242502c74fbaSAlex Elder 
242602c74fbaSAlex Elder 	/*
24273da691bfSIlya Dryomov 	 * -ENOENT means a hole in the image -- zero-fill the entire
24283da691bfSIlya Dryomov 	 * length of the request.  A short read also implies zero-fill
24293da691bfSIlya Dryomov 	 * to the end of the request.  In both cases we update xferred
24303da691bfSIlya Dryomov 	 * count to indicate the whole request was satisfied.
243102c74fbaSAlex Elder 	 */
24323da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT ||
243343df3d35SIlya Dryomov 	    (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
24343da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred || !obj_req->result);
24353da691bfSIlya Dryomov 		rbd_obj_zero_range(obj_req, obj_req->xferred,
243643df3d35SIlya Dryomov 				   obj_req->ex.oe_len - obj_req->xferred);
24373da691bfSIlya Dryomov 		obj_req->result = 0;
243843df3d35SIlya Dryomov 		obj_req->xferred = obj_req->ex.oe_len;
24393da691bfSIlya Dryomov 	}
24403da691bfSIlya Dryomov 
24413da691bfSIlya Dryomov 	return true;
24423da691bfSIlya Dryomov }
24433da691bfSIlya Dryomov 
24443da691bfSIlya Dryomov /*
24453da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
24463da691bfSIlya Dryomov  */
24473da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
24483da691bfSIlya Dryomov {
24493da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
24503da691bfSIlya Dryomov 		.bvecs = bvecs,
24513da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
24523da691bfSIlya Dryomov 	};
24533da691bfSIlya Dryomov 
24543da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
24553da691bfSIlya Dryomov 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
24563da691bfSIlya Dryomov 			       bv.bv_len))
24573da691bfSIlya Dryomov 			return false;
24583da691bfSIlya Dryomov 	}));
24593da691bfSIlya Dryomov 	return true;
24603da691bfSIlya Dryomov }
24613da691bfSIlya Dryomov 
24623a482501SIlya Dryomov #define MODS_ONLY	U32_MAX
24633a482501SIlya Dryomov 
24643a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
24653da691bfSIlya Dryomov {
246613488d53SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
24673a482501SIlya Dryomov 	unsigned int num_osd_ops = (bytes != MODS_ONLY);
24683a482501SIlya Dryomov 	unsigned int which = 0;
2469fe943d50SChengguang Xu 	int ret;
24703da691bfSIlya Dryomov 
24713da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
24723da691bfSIlya Dryomov 	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
24733da691bfSIlya Dryomov 	rbd_osd_req_destroy(obj_req->osd_req);
24743da691bfSIlya Dryomov 
247513488d53SIlya Dryomov 	switch (img_req->op_type) {
247613488d53SIlya Dryomov 	case OBJ_OP_WRITE:
247713488d53SIlya Dryomov 		num_osd_ops += count_write_ops(obj_req);
247813488d53SIlya Dryomov 		break;
247913488d53SIlya Dryomov 	case OBJ_OP_ZEROOUT:
248013488d53SIlya Dryomov 		num_osd_ops += count_zeroout_ops(obj_req);
248113488d53SIlya Dryomov 		break;
248213488d53SIlya Dryomov 	default:
248313488d53SIlya Dryomov 		rbd_assert(0);
248413488d53SIlya Dryomov 	}
248513488d53SIlya Dryomov 
2486a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
24873da691bfSIlya Dryomov 	if (!obj_req->osd_req)
24883da691bfSIlya Dryomov 		return -ENOMEM;
24893da691bfSIlya Dryomov 
24903a482501SIlya Dryomov 	if (bytes != MODS_ONLY) {
24913a482501SIlya Dryomov 		ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd",
24923a482501SIlya Dryomov 					  "copyup");
2493fe943d50SChengguang Xu 		if (ret)
2494fe943d50SChengguang Xu 			return ret;
2495fe943d50SChengguang Xu 
24963a482501SIlya Dryomov 		osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
24970010f705SIlya Dryomov 						  obj_req->copyup_bvecs,
24980010f705SIlya Dryomov 						  obj_req->copyup_bvec_count,
24990010f705SIlya Dryomov 						  bytes);
25003a482501SIlya Dryomov 	}
25013da691bfSIlya Dryomov 
250213488d53SIlya Dryomov 	switch (img_req->op_type) {
25033da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
25043a482501SIlya Dryomov 		__rbd_obj_setup_write(obj_req, which);
25053da691bfSIlya Dryomov 		break;
25066484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
25073da691bfSIlya Dryomov 		rbd_assert(!rbd_obj_is_entire(obj_req));
25083a482501SIlya Dryomov 		__rbd_obj_setup_zeroout(obj_req, which);
25093da691bfSIlya Dryomov 		break;
25103da691bfSIlya Dryomov 	default:
25113da691bfSIlya Dryomov 		rbd_assert(0);
25123da691bfSIlya Dryomov 	}
25133da691bfSIlya Dryomov 
251426f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
251526f887e0SIlya Dryomov 	if (ret)
251626f887e0SIlya Dryomov 		return ret;
251726f887e0SIlya Dryomov 
25183da691bfSIlya Dryomov 	rbd_obj_request_submit(obj_req);
25193da691bfSIlya Dryomov 	return 0;
25203da691bfSIlya Dryomov }
25213da691bfSIlya Dryomov 
25223a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
25233a482501SIlya Dryomov {
25243a482501SIlya Dryomov 	/*
25253a482501SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
25263a482501SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
25273a482501SIlya Dryomov 	 * existing.
25283a482501SIlya Dryomov 	 */
25293a482501SIlya Dryomov 	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
25303a482501SIlya Dryomov 		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
25313a482501SIlya Dryomov 		bytes = 0;
25323a482501SIlya Dryomov 	}
25333a482501SIlya Dryomov 
25343a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
25353a482501SIlya Dryomov 	return rbd_obj_issue_copyup_ops(obj_req, bytes);
25363a482501SIlya Dryomov }
25373a482501SIlya Dryomov 
25387e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
25397e07efb1SIlya Dryomov {
25407e07efb1SIlya Dryomov 	u32 i;
25417e07efb1SIlya Dryomov 
25427e07efb1SIlya Dryomov 	rbd_assert(!obj_req->copyup_bvecs);
25437e07efb1SIlya Dryomov 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
25447e07efb1SIlya Dryomov 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
25457e07efb1SIlya Dryomov 					sizeof(*obj_req->copyup_bvecs),
25467e07efb1SIlya Dryomov 					GFP_NOIO);
25477e07efb1SIlya Dryomov 	if (!obj_req->copyup_bvecs)
25487e07efb1SIlya Dryomov 		return -ENOMEM;
25497e07efb1SIlya Dryomov 
25507e07efb1SIlya Dryomov 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
25517e07efb1SIlya Dryomov 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
25527e07efb1SIlya Dryomov 
25537e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
25547e07efb1SIlya Dryomov 		if (!obj_req->copyup_bvecs[i].bv_page)
25557e07efb1SIlya Dryomov 			return -ENOMEM;
25567e07efb1SIlya Dryomov 
25577e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_offset = 0;
25587e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_len = len;
25597e07efb1SIlya Dryomov 		obj_overlap -= len;
25607e07efb1SIlya Dryomov 	}
25617e07efb1SIlya Dryomov 
25627e07efb1SIlya Dryomov 	rbd_assert(!obj_overlap);
25637e07efb1SIlya Dryomov 	return 0;
25647e07efb1SIlya Dryomov }
25657e07efb1SIlya Dryomov 
25663da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
25673da691bfSIlya Dryomov {
25683da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
25693da691bfSIlya Dryomov 	int ret;
25703da691bfSIlya Dryomov 
257186bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
257286bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
257386bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
257486bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
25753da691bfSIlya Dryomov 		/*
25763da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
25773a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
25783a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
25793a482501SIlya Dryomov 		 * anymore.
25803da691bfSIlya Dryomov 		 */
25813a482501SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
25823a482501SIlya Dryomov 		return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
25833da691bfSIlya Dryomov 	}
25843da691bfSIlya Dryomov 
258586bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
25863da691bfSIlya Dryomov 	if (ret)
25873da691bfSIlya Dryomov 		return ret;
25883da691bfSIlya Dryomov 
25893a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
259086bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
25913da691bfSIlya Dryomov }
25923da691bfSIlya Dryomov 
25933da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
25943da691bfSIlya Dryomov {
25953da691bfSIlya Dryomov 	int ret;
25963da691bfSIlya Dryomov 
25973da691bfSIlya Dryomov 	switch (obj_req->write_state) {
25983da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_GUARD:
25993da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred);
26003da691bfSIlya Dryomov 		if (obj_req->result == -ENOENT) {
26013da691bfSIlya Dryomov 			/*
26023da691bfSIlya Dryomov 			 * The target object doesn't exist.  Read the data for
26033da691bfSIlya Dryomov 			 * the entire target object up to the overlap point (if
26043da691bfSIlya Dryomov 			 * any) from the parent, so we can use it for a copyup.
26053da691bfSIlya Dryomov 			 */
26063da691bfSIlya Dryomov 			ret = rbd_obj_handle_write_guard(obj_req);
26073da691bfSIlya Dryomov 			if (ret) {
26083da691bfSIlya Dryomov 				obj_req->result = ret;
26093da691bfSIlya Dryomov 				return true;
26103da691bfSIlya Dryomov 			}
26113da691bfSIlya Dryomov 			return false;
26123da691bfSIlya Dryomov 		}
26133da691bfSIlya Dryomov 		/* fall through */
26143da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_FLAT:
26153a482501SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP_OPS:
26163da691bfSIlya Dryomov 		if (!obj_req->result)
26173da691bfSIlya Dryomov 			/*
26183da691bfSIlya Dryomov 			 * There is no such thing as a successful short
26193da691bfSIlya Dryomov 			 * write -- indicate the whole request was satisfied.
26203da691bfSIlya Dryomov 			 */
262143df3d35SIlya Dryomov 			obj_req->xferred = obj_req->ex.oe_len;
26223da691bfSIlya Dryomov 		return true;
26233a482501SIlya Dryomov 	case RBD_OBJ_WRITE_READ_FROM_PARENT:
26243da691bfSIlya Dryomov 		if (obj_req->result)
26253a482501SIlya Dryomov 			return true;
26263da691bfSIlya Dryomov 
26273da691bfSIlya Dryomov 		rbd_assert(obj_req->xferred);
26283da691bfSIlya Dryomov 		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
26293da691bfSIlya Dryomov 		if (ret) {
26303da691bfSIlya Dryomov 			obj_req->result = ret;
2631356889c4SIlya Dryomov 			obj_req->xferred = 0;
26323da691bfSIlya Dryomov 			return true;
26333da691bfSIlya Dryomov 		}
26343da691bfSIlya Dryomov 		return false;
26353da691bfSIlya Dryomov 	default:
2636c6244b3bSArnd Bergmann 		BUG();
26373da691bfSIlya Dryomov 	}
26383da691bfSIlya Dryomov }
26393da691bfSIlya Dryomov 
26403da691bfSIlya Dryomov /*
26413da691bfSIlya Dryomov  * Returns true if @obj_req is completed, or false otherwise.
26423da691bfSIlya Dryomov  */
26433da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
26443da691bfSIlya Dryomov {
26459bb0248dSIlya Dryomov 	switch (obj_req->img_request->op_type) {
26463da691bfSIlya Dryomov 	case OBJ_OP_READ:
26473da691bfSIlya Dryomov 		return rbd_obj_handle_read(obj_req);
26483da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
26493da691bfSIlya Dryomov 		return rbd_obj_handle_write(obj_req);
26503da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
26516484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
26523da691bfSIlya Dryomov 		if (rbd_obj_handle_write(obj_req)) {
26533da691bfSIlya Dryomov 			/*
26543da691bfSIlya Dryomov 			 * Hide -ENOENT from delete/truncate/zero -- discarding
26553da691bfSIlya Dryomov 			 * a non-existent object is not a problem.
26563da691bfSIlya Dryomov 			 */
26573da691bfSIlya Dryomov 			if (obj_req->result == -ENOENT) {
26583da691bfSIlya Dryomov 				obj_req->result = 0;
265943df3d35SIlya Dryomov 				obj_req->xferred = obj_req->ex.oe_len;
26603da691bfSIlya Dryomov 			}
26613da691bfSIlya Dryomov 			return true;
26623da691bfSIlya Dryomov 		}
26633da691bfSIlya Dryomov 		return false;
26643da691bfSIlya Dryomov 	default:
2665c6244b3bSArnd Bergmann 		BUG();
26663da691bfSIlya Dryomov 	}
26673da691bfSIlya Dryomov }
26683da691bfSIlya Dryomov 
26697114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
26707114edacSIlya Dryomov {
26717114edacSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
26727114edacSIlya Dryomov 
26737114edacSIlya Dryomov 	rbd_assert((!obj_req->result &&
267443df3d35SIlya Dryomov 		    obj_req->xferred == obj_req->ex.oe_len) ||
26757114edacSIlya Dryomov 		   (obj_req->result < 0 && !obj_req->xferred));
26767114edacSIlya Dryomov 	if (!obj_req->result) {
26777114edacSIlya Dryomov 		img_req->xferred += obj_req->xferred;
267802c74fbaSAlex Elder 		return;
267902c74fbaSAlex Elder 	}
268002c74fbaSAlex Elder 
26817114edacSIlya Dryomov 	rbd_warn(img_req->rbd_dev,
26827114edacSIlya Dryomov 		 "%s at objno %llu %llu~%llu result %d xferred %llu",
268343df3d35SIlya Dryomov 		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
268443df3d35SIlya Dryomov 		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
26857114edacSIlya Dryomov 		 obj_req->xferred);
26867114edacSIlya Dryomov 	if (!img_req->result) {
26877114edacSIlya Dryomov 		img_req->result = obj_req->result;
26887114edacSIlya Dryomov 		img_req->xferred = 0;
2689a9e8ba2cSAlex Elder 	}
26908b3e1a56SAlex Elder }
26918b3e1a56SAlex Elder 
26923da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req)
26938b3e1a56SAlex Elder {
26943da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = img_req->obj_request;
26958b3e1a56SAlex Elder 
26963da691bfSIlya Dryomov 	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
269786bd7998SIlya Dryomov 	rbd_assert((!img_req->result &&
269886bd7998SIlya Dryomov 		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
269986bd7998SIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27008b3e1a56SAlex Elder 
27013da691bfSIlya Dryomov 	obj_req->result = img_req->result;
27023da691bfSIlya Dryomov 	obj_req->xferred = img_req->xferred;
27033da691bfSIlya Dryomov 	rbd_img_request_put(img_req);
27047114edacSIlya Dryomov }
27058b3e1a56SAlex Elder 
27067114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req)
27077114edacSIlya Dryomov {
27087114edacSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
27097114edacSIlya Dryomov 	rbd_assert((!img_req->result &&
27107114edacSIlya Dryomov 		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
27117114edacSIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27128b3e1a56SAlex Elder 
27137114edacSIlya Dryomov 	blk_mq_end_request(img_req->rq,
27147114edacSIlya Dryomov 			   errno_to_blk_status(img_req->result));
27157114edacSIlya Dryomov 	rbd_img_request_put(img_req);
27163da691bfSIlya Dryomov }
27178b3e1a56SAlex Elder 
27183da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
27193da691bfSIlya Dryomov {
27207114edacSIlya Dryomov 	struct rbd_img_request *img_req;
27217114edacSIlya Dryomov 
27227114edacSIlya Dryomov again:
27233da691bfSIlya Dryomov 	if (!__rbd_obj_handle_request(obj_req))
27248b3e1a56SAlex Elder 		return;
27253da691bfSIlya Dryomov 
27267114edacSIlya Dryomov 	img_req = obj_req->img_request;
27277114edacSIlya Dryomov 	spin_lock(&img_req->completion_lock);
27287114edacSIlya Dryomov 	rbd_obj_end_request(obj_req);
27297114edacSIlya Dryomov 	rbd_assert(img_req->pending_count);
27307114edacSIlya Dryomov 	if (--img_req->pending_count) {
27317114edacSIlya Dryomov 		spin_unlock(&img_req->completion_lock);
27327114edacSIlya Dryomov 		return;
27337114edacSIlya Dryomov 	}
27347114edacSIlya Dryomov 
27357114edacSIlya Dryomov 	spin_unlock(&img_req->completion_lock);
27367114edacSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
27377114edacSIlya Dryomov 		obj_req = img_req->obj_request;
27387114edacSIlya Dryomov 		rbd_img_end_child_request(img_req);
27397114edacSIlya Dryomov 		goto again;
27407114edacSIlya Dryomov 	}
27417114edacSIlya Dryomov 	rbd_img_end_request(img_req);
27428b3e1a56SAlex Elder }
27438b3e1a56SAlex Elder 
/* all-zero client id, installed as the owner cid when the lock is released */
static const struct rbd_client_id rbd_empty_cid;
2745ed95b21aSIlya Dryomov 
2746ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2747ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
2748ed95b21aSIlya Dryomov {
2749ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2750ed95b21aSIlya Dryomov }
2751ed95b21aSIlya Dryomov 
2752ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2753ed95b21aSIlya Dryomov {
2754ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
2755ed95b21aSIlya Dryomov 
2756ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2757ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2758ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
2759ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2760ed95b21aSIlya Dryomov 	return cid;
2761ed95b21aSIlya Dryomov }
2762ed95b21aSIlya Dryomov 
2763ed95b21aSIlya Dryomov /*
2764ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2765ed95b21aSIlya Dryomov  */
2766ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2767ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
2768ed95b21aSIlya Dryomov {
2769ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2770ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2771ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
2772ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
2773ed95b21aSIlya Dryomov }
2774ed95b21aSIlya Dryomov 
2775ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2776ed95b21aSIlya Dryomov {
2777ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2778ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2779ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2780ed95b21aSIlya Dryomov }
2781ed95b21aSIlya Dryomov 
2782edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2783edd8ca80SFlorian Margaine {
2784edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2785edd8ca80SFlorian Margaine 
2786edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
2787edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
2788edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2789edd8ca80SFlorian Margaine }
2790edd8ca80SFlorian Margaine 
2791ed95b21aSIlya Dryomov /*
2792ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2793ed95b21aSIlya Dryomov  */
2794ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
2795ed95b21aSIlya Dryomov {
2796ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2797ed95b21aSIlya Dryomov 	char cookie[32];
2798ed95b21aSIlya Dryomov 	int ret;
2799ed95b21aSIlya Dryomov 
2800cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2801cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
2802ed95b21aSIlya Dryomov 
2803ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
2804ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2805ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2806ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
2807ed95b21aSIlya Dryomov 	if (ret)
2808ed95b21aSIlya Dryomov 		return ret;
2809ed95b21aSIlya Dryomov 
2810ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2811edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
2812ed95b21aSIlya Dryomov 	return 0;
2813ed95b21aSIlya Dryomov }
2814ed95b21aSIlya Dryomov 
2815ed95b21aSIlya Dryomov /*
2816ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2817ed95b21aSIlya Dryomov  */
2818bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
2819ed95b21aSIlya Dryomov {
2820ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2821ed95b21aSIlya Dryomov 	int ret;
2822ed95b21aSIlya Dryomov 
2823cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2824cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
2825ed95b21aSIlya Dryomov 
2826ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2827cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
2828bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
2829bbead745SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock: %d", ret);
2830ed95b21aSIlya Dryomov 
2831bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
2832bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
2833cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
2834ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2835ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
2836ed95b21aSIlya Dryomov }
2837ed95b21aSIlya Dryomov 
2838ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2839ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
2840ed95b21aSIlya Dryomov 				struct page ***preply_pages,
2841ed95b21aSIlya Dryomov 				size_t *preply_len)
2842ed95b21aSIlya Dryomov {
2843ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2844ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
284508a79102SKyle Spiers 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
284608a79102SKyle Spiers 	int buf_size = sizeof(buf);
2847ed95b21aSIlya Dryomov 	void *p = buf;
2848ed95b21aSIlya Dryomov 
2849ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2850ed95b21aSIlya Dryomov 
2851ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
2852ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2853ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
2854ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
2855ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
2856ed95b21aSIlya Dryomov 
2857ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2858ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
2859ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2860ed95b21aSIlya Dryomov }
2861ed95b21aSIlya Dryomov 
2862ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2863ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
2864ed95b21aSIlya Dryomov {
2865ed95b21aSIlya Dryomov 	struct page **reply_pages;
2866ed95b21aSIlya Dryomov 	size_t reply_len;
2867ed95b21aSIlya Dryomov 
2868ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2869ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2870ed95b21aSIlya Dryomov }
2871ed95b21aSIlya Dryomov 
2872ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
2873ed95b21aSIlya Dryomov {
2874ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2875ed95b21aSIlya Dryomov 						  acquired_lock_work);
2876ed95b21aSIlya Dryomov 
2877ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2878ed95b21aSIlya Dryomov }
2879ed95b21aSIlya Dryomov 
2880ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
2881ed95b21aSIlya Dryomov {
2882ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2883ed95b21aSIlya Dryomov 						  released_lock_work);
2884ed95b21aSIlya Dryomov 
2885ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2886ed95b21aSIlya Dryomov }
2887ed95b21aSIlya Dryomov 
2888ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
2889ed95b21aSIlya Dryomov {
2890ed95b21aSIlya Dryomov 	struct page **reply_pages;
2891ed95b21aSIlya Dryomov 	size_t reply_len;
2892ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
2893ed95b21aSIlya Dryomov 	int ret;
2894ed95b21aSIlya Dryomov 
2895ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
2896ed95b21aSIlya Dryomov 
2897ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2898ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
2899ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
2900ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2901ed95b21aSIlya Dryomov 		goto out;
2902ed95b21aSIlya Dryomov 	}
2903ed95b21aSIlya Dryomov 
2904ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2905ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
2906ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
2907ed95b21aSIlya Dryomov 		u32 n;
2908ed95b21aSIlya Dryomov 
2909ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2910ed95b21aSIlya Dryomov 		while (n--) {
2911ed95b21aSIlya Dryomov 			u8 struct_v;
2912ed95b21aSIlya Dryomov 			u32 len;
2913ed95b21aSIlya Dryomov 
2914ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
2915ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
2916ed95b21aSIlya Dryomov 
2917ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
2918ed95b21aSIlya Dryomov 			if (!len)
2919ed95b21aSIlya Dryomov 				continue;
2920ed95b21aSIlya Dryomov 
2921ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
2922ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
2923ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
2924ed95b21aSIlya Dryomov 				ret = -EIO;
2925ed95b21aSIlya Dryomov 				goto out;
2926ed95b21aSIlya Dryomov 			}
2927ed95b21aSIlya Dryomov 
2928ed95b21aSIlya Dryomov 			lock_owner_responded = true;
2929ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2930ed95b21aSIlya Dryomov 						  &struct_v, &len);
2931ed95b21aSIlya Dryomov 			if (ret) {
2932ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
2933ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
2934ed95b21aSIlya Dryomov 					 ret);
2935ed95b21aSIlya Dryomov 				goto e_inval;
2936ed95b21aSIlya Dryomov 			}
2937ed95b21aSIlya Dryomov 
2938ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
2939ed95b21aSIlya Dryomov 		}
2940ed95b21aSIlya Dryomov 	}
2941ed95b21aSIlya Dryomov 
2942ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
2943ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
2944ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
2945ed95b21aSIlya Dryomov 	}
2946ed95b21aSIlya Dryomov 
2947ed95b21aSIlya Dryomov out:
2948ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2949ed95b21aSIlya Dryomov 	return ret;
2950ed95b21aSIlya Dryomov 
2951ed95b21aSIlya Dryomov e_inval:
2952ed95b21aSIlya Dryomov 	ret = -EINVAL;
2953ed95b21aSIlya Dryomov 	goto out;
2954ed95b21aSIlya Dryomov }
2955ed95b21aSIlya Dryomov 
2956ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2957ed95b21aSIlya Dryomov {
2958ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2959ed95b21aSIlya Dryomov 
2960ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
2961ed95b21aSIlya Dryomov 	if (wake_all)
2962ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
2963ed95b21aSIlya Dryomov 	else
2964ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
2965ed95b21aSIlya Dryomov }
2966ed95b21aSIlya Dryomov 
2967ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
2968ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
2969ed95b21aSIlya Dryomov {
2970ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2971ed95b21aSIlya Dryomov 	u8 lock_type;
2972ed95b21aSIlya Dryomov 	char *lock_tag;
2973ed95b21aSIlya Dryomov 	int ret;
2974ed95b21aSIlya Dryomov 
2975ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
2976ed95b21aSIlya Dryomov 
2977ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2978ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2979ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
2980ed95b21aSIlya Dryomov 	if (ret)
2981ed95b21aSIlya Dryomov 		return ret;
2982ed95b21aSIlya Dryomov 
2983ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
2984ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2985ed95b21aSIlya Dryomov 		goto out;
2986ed95b21aSIlya Dryomov 	}
2987ed95b21aSIlya Dryomov 
2988ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2989ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2990ed95b21aSIlya Dryomov 			 lock_tag);
2991ed95b21aSIlya Dryomov 		ret = -EBUSY;
2992ed95b21aSIlya Dryomov 		goto out;
2993ed95b21aSIlya Dryomov 	}
2994ed95b21aSIlya Dryomov 
2995ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
2996ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
2997ed95b21aSIlya Dryomov 		ret = -EBUSY;
2998ed95b21aSIlya Dryomov 		goto out;
2999ed95b21aSIlya Dryomov 	}
3000ed95b21aSIlya Dryomov 
3001ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3002ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3003ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3004ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3005ed95b21aSIlya Dryomov 		ret = -EBUSY;
3006ed95b21aSIlya Dryomov 		goto out;
3007ed95b21aSIlya Dryomov 	}
3008ed95b21aSIlya Dryomov 
3009ed95b21aSIlya Dryomov out:
3010ed95b21aSIlya Dryomov 	kfree(lock_tag);
3011ed95b21aSIlya Dryomov 	return ret;
3012ed95b21aSIlya Dryomov }
3013ed95b21aSIlya Dryomov 
3014ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3015ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3016ed95b21aSIlya Dryomov {
3017ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3018ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3019ed95b21aSIlya Dryomov 	u32 num_watchers;
3020ed95b21aSIlya Dryomov 	u64 cookie;
3021ed95b21aSIlya Dryomov 	int i;
3022ed95b21aSIlya Dryomov 	int ret;
3023ed95b21aSIlya Dryomov 
3024ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3025ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3026ed95b21aSIlya Dryomov 				      &num_watchers);
3027ed95b21aSIlya Dryomov 	if (ret)
3028ed95b21aSIlya Dryomov 		return ret;
3029ed95b21aSIlya Dryomov 
3030ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3031ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3032ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3033ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3034ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3035ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3036ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3037ed95b21aSIlya Dryomov 				.handle = cookie,
3038ed95b21aSIlya Dryomov 			};
3039ed95b21aSIlya Dryomov 
3040ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3041ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3042ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3043ed95b21aSIlya Dryomov 			ret = 1;
3044ed95b21aSIlya Dryomov 			goto out;
3045ed95b21aSIlya Dryomov 		}
3046ed95b21aSIlya Dryomov 	}
3047ed95b21aSIlya Dryomov 
3048ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3049ed95b21aSIlya Dryomov 	ret = 0;
3050ed95b21aSIlya Dryomov out:
3051ed95b21aSIlya Dryomov 	kfree(watchers);
3052ed95b21aSIlya Dryomov 	return ret;
3053ed95b21aSIlya Dryomov }
3054ed95b21aSIlya Dryomov 
3055ed95b21aSIlya Dryomov /*
3056ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3057ed95b21aSIlya Dryomov  */
3058ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3059ed95b21aSIlya Dryomov {
3060ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3061ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3062ed95b21aSIlya Dryomov 	u32 num_lockers;
3063ed95b21aSIlya Dryomov 	int ret;
3064ed95b21aSIlya Dryomov 
3065ed95b21aSIlya Dryomov 	for (;;) {
3066ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3067ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3068ed95b21aSIlya Dryomov 			return ret;
3069ed95b21aSIlya Dryomov 
3070ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3071ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3072ed95b21aSIlya Dryomov 		if (ret)
3073ed95b21aSIlya Dryomov 			return ret;
3074ed95b21aSIlya Dryomov 
3075ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3076ed95b21aSIlya Dryomov 			goto again;
3077ed95b21aSIlya Dryomov 
3078ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3079ed95b21aSIlya Dryomov 		if (ret) {
3080ed95b21aSIlya Dryomov 			if (ret > 0)
3081ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3082ed95b21aSIlya Dryomov 			goto out;
3083ed95b21aSIlya Dryomov 		}
3084ed95b21aSIlya Dryomov 
3085ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3086ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3087ed95b21aSIlya Dryomov 
3088ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3089ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3090ed95b21aSIlya Dryomov 		if (ret) {
3091ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3092ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3093ed95b21aSIlya Dryomov 			goto out;
3094ed95b21aSIlya Dryomov 		}
3095ed95b21aSIlya Dryomov 
3096ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3097ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3098ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3099ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3100ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3101ed95b21aSIlya Dryomov 			goto out;
3102ed95b21aSIlya Dryomov 
3103ed95b21aSIlya Dryomov again:
3104ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3105ed95b21aSIlya Dryomov 	}
3106ed95b21aSIlya Dryomov 
3107ed95b21aSIlya Dryomov out:
3108ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3109ed95b21aSIlya Dryomov 	return ret;
3110ed95b21aSIlya Dryomov }
3111ed95b21aSIlya Dryomov 
3112ed95b21aSIlya Dryomov /*
3113ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3114ed95b21aSIlya Dryomov  */
3115ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3116ed95b21aSIlya Dryomov 						int *pret)
3117ed95b21aSIlya Dryomov {
3118ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3119ed95b21aSIlya Dryomov 
3120ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3121ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3122ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3123ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3124ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3125ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3126ed95b21aSIlya Dryomov 		return lock_state;
3127ed95b21aSIlya Dryomov 	}
3128ed95b21aSIlya Dryomov 
3129ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3130ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3131ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3132ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3133ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3134ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3135ed95b21aSIlya Dryomov 		if (*pret)
3136ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3137ed95b21aSIlya Dryomov 	}
3138ed95b21aSIlya Dryomov 
3139ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3140ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3141ed95b21aSIlya Dryomov 	return lock_state;
3142ed95b21aSIlya Dryomov }
3143ed95b21aSIlya Dryomov 
3144ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3145ed95b21aSIlya Dryomov {
3146ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3147ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3148ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
314937f13252SKefeng Wang 	int ret = 0;
3150ed95b21aSIlya Dryomov 
3151ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3152ed95b21aSIlya Dryomov again:
3153ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3154ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3155ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3156ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3157ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3158ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3159ed95b21aSIlya Dryomov 		return;
3160ed95b21aSIlya Dryomov 	}
3161ed95b21aSIlya Dryomov 
3162ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3163ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3164ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3165e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
3166e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
3167e010dd0aSIlya Dryomov 		/*
3168e010dd0aSIlya Dryomov 		 * If this is rbd_add_acquire_lock(), we want to fail
3169e010dd0aSIlya Dryomov 		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
3170e010dd0aSIlya Dryomov 		 * want to block.
3171e010dd0aSIlya Dryomov 		 */
3172e010dd0aSIlya Dryomov 		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3173e010dd0aSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3174e010dd0aSIlya Dryomov 			/* wake "rbd map --exclusive" process */
3175e010dd0aSIlya Dryomov 			wake_requests(rbd_dev, false);
3176e010dd0aSIlya Dryomov 		}
3177ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3178ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3179ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3180ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3181ed95b21aSIlya Dryomov 	} else {
3182ed95b21aSIlya Dryomov 		/*
3183ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3184ed95b21aSIlya Dryomov 		 * release the lock
3185ed95b21aSIlya Dryomov 		 */
3186ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3187ed95b21aSIlya Dryomov 		     rbd_dev);
3188ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3189ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3190ed95b21aSIlya Dryomov 	}
3191ed95b21aSIlya Dryomov }
3192ed95b21aSIlya Dryomov 
3193ed95b21aSIlya Dryomov /*
3194ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3195ed95b21aSIlya Dryomov  */
3196ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3197ed95b21aSIlya Dryomov {
3198ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3199ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3200ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3201ed95b21aSIlya Dryomov 		return false;
3202ed95b21aSIlya Dryomov 
3203ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3204ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3205ed95b21aSIlya Dryomov 	/*
3206ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3207ed95b21aSIlya Dryomov 	 *
3208ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3209ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3210ed95b21aSIlya Dryomov 	 */
3211ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3212ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3213ed95b21aSIlya Dryomov 
3214ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3215ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3216ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3217ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3218ed95b21aSIlya Dryomov 		return false;
3219ed95b21aSIlya Dryomov 
3220bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
3221ed95b21aSIlya Dryomov 	/*
3222ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
3223ed95b21aSIlya Dryomov 	 * almost immediately if we got new IO during ceph_osdc_sync()
3224ed95b21aSIlya Dryomov 	 * otherwise.  We need to ack our own notifications, so this
3225ed95b21aSIlya Dryomov 	 * lock_dwork will be requeued from rbd_wait_state_locked()
3226ed95b21aSIlya Dryomov 	 * after wake_requests() in rbd_handle_released_lock().
3227ed95b21aSIlya Dryomov 	 */
3228ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3229ed95b21aSIlya Dryomov 	return true;
3230ed95b21aSIlya Dryomov }
3231ed95b21aSIlya Dryomov 
3232ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3233ed95b21aSIlya Dryomov {
3234ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3235ed95b21aSIlya Dryomov 						  unlock_work);
3236ed95b21aSIlya Dryomov 
3237ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3238ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3239ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3240ed95b21aSIlya Dryomov }
3241ed95b21aSIlya Dryomov 
3242ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3243ed95b21aSIlya Dryomov 				     void **p)
3244ed95b21aSIlya Dryomov {
3245ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3246ed95b21aSIlya Dryomov 
3247ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3248ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3249ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3250ed95b21aSIlya Dryomov 	}
3251ed95b21aSIlya Dryomov 
3252ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3253ed95b21aSIlya Dryomov 	     cid.handle);
3254ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3255ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3256ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3257ed95b21aSIlya Dryomov 			/*
3258ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3259ed95b21aSIlya Dryomov 			 * the owner
3260ed95b21aSIlya Dryomov 			 */
3261ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3262ed95b21aSIlya Dryomov 			return;
3263ed95b21aSIlya Dryomov 		}
3264ed95b21aSIlya Dryomov 
3265ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3266ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3267ed95b21aSIlya Dryomov 	} else {
3268ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3269ed95b21aSIlya Dryomov 	}
3270ed95b21aSIlya Dryomov 
3271ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3272ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3273ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3274ed95b21aSIlya Dryomov }
3275ed95b21aSIlya Dryomov 
3276ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3277ed95b21aSIlya Dryomov 				     void **p)
3278ed95b21aSIlya Dryomov {
3279ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3280ed95b21aSIlya Dryomov 
3281ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3282ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3283ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3284ed95b21aSIlya Dryomov 	}
3285ed95b21aSIlya Dryomov 
3286ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3287ed95b21aSIlya Dryomov 	     cid.handle);
3288ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3289ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3290ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3291ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3292ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3293ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3294ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3295ed95b21aSIlya Dryomov 			return;
3296ed95b21aSIlya Dryomov 		}
3297ed95b21aSIlya Dryomov 
3298ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3299ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3300ed95b21aSIlya Dryomov 	} else {
3301ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3302ed95b21aSIlya Dryomov 	}
3303ed95b21aSIlya Dryomov 
3304ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3305ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3306ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3307ed95b21aSIlya Dryomov }
3308ed95b21aSIlya Dryomov 
33093b77faa0SIlya Dryomov /*
33103b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
33113b77faa0SIlya Dryomov  * ResponseMessage is needed.
33123b77faa0SIlya Dryomov  */
33133b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3314ed95b21aSIlya Dryomov 				   void **p)
3315ed95b21aSIlya Dryomov {
3316ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3317ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
33183b77faa0SIlya Dryomov 	int result = 1;
3319ed95b21aSIlya Dryomov 
3320ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3321ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3322ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3323ed95b21aSIlya Dryomov 	}
3324ed95b21aSIlya Dryomov 
3325ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3326ed95b21aSIlya Dryomov 	     cid.handle);
3327ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
33283b77faa0SIlya Dryomov 		return result;
3329ed95b21aSIlya Dryomov 
3330ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
33313b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
33323b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
33333b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
33343b77faa0SIlya Dryomov 			goto out_unlock;
33353b77faa0SIlya Dryomov 
33363b77faa0SIlya Dryomov 		/*
33373b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
33383b77faa0SIlya Dryomov 		 * a missing owner
33393b77faa0SIlya Dryomov 		 */
33403b77faa0SIlya Dryomov 		result = 0;
33413b77faa0SIlya Dryomov 
3342ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3343e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
3344e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
3345e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
3346e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
3347e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
3348e010dd0aSIlya Dryomov 			} else {
3349e010dd0aSIlya Dryomov 				/* refuse to release the lock */
3350e010dd0aSIlya Dryomov 				result = -EROFS;
3351ed95b21aSIlya Dryomov 			}
3352ed95b21aSIlya Dryomov 		}
3353ed95b21aSIlya Dryomov 	}
33543b77faa0SIlya Dryomov 
33553b77faa0SIlya Dryomov out_unlock:
3356ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
33573b77faa0SIlya Dryomov 	return result;
3358ed95b21aSIlya Dryomov }
3359ed95b21aSIlya Dryomov 
3360ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3361ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3362ed95b21aSIlya Dryomov {
3363ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
336408a79102SKyle Spiers 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
336508a79102SKyle Spiers 	int buf_size = sizeof(buf);
3366ed95b21aSIlya Dryomov 	int ret;
3367ed95b21aSIlya Dryomov 
3368ed95b21aSIlya Dryomov 	if (result) {
3369ed95b21aSIlya Dryomov 		void *p = buf;
3370ed95b21aSIlya Dryomov 
3371ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3372ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3373ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3374ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3375ed95b21aSIlya Dryomov 	} else {
3376ed95b21aSIlya Dryomov 		buf_size = 0;
3377ed95b21aSIlya Dryomov 	}
3378ed95b21aSIlya Dryomov 
3379ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3380ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3381ed95b21aSIlya Dryomov 				   buf, buf_size);
3382ed95b21aSIlya Dryomov 	if (ret)
3383ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3384ed95b21aSIlya Dryomov }
3385ed95b21aSIlya Dryomov 
3386ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3387ed95b21aSIlya Dryomov 				   u64 cookie)
3388ed95b21aSIlya Dryomov {
3389ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3390ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3391ed95b21aSIlya Dryomov }
3392ed95b21aSIlya Dryomov 
3393ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3394ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3395ed95b21aSIlya Dryomov {
3396ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3397ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3398ed95b21aSIlya Dryomov }
3399922dab61SIlya Dryomov 
3400922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3401922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3402b8d70035SAlex Elder {
3403922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3404ed95b21aSIlya Dryomov 	void *p = data;
3405ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3406d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3407ed95b21aSIlya Dryomov 	u32 len;
3408ed95b21aSIlya Dryomov 	u32 notify_op;
3409b8d70035SAlex Elder 	int ret;
3410b8d70035SAlex Elder 
3411ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3412ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3413ed95b21aSIlya Dryomov 	if (data_len) {
3414ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3415ed95b21aSIlya Dryomov 					  &struct_v, &len);
3416ed95b21aSIlya Dryomov 		if (ret) {
3417ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3418ed95b21aSIlya Dryomov 				 ret);
3419ed95b21aSIlya Dryomov 			return;
3420ed95b21aSIlya Dryomov 		}
342152bb1f9bSIlya Dryomov 
3422ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3423ed95b21aSIlya Dryomov 	} else {
3424ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3425ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3426ed95b21aSIlya Dryomov 		len = 0;
3427ed95b21aSIlya Dryomov 	}
3428ed95b21aSIlya Dryomov 
3429ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3430ed95b21aSIlya Dryomov 	switch (notify_op) {
3431ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3432ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3433ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3434ed95b21aSIlya Dryomov 		break;
3435ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3436ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3437ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3438ed95b21aSIlya Dryomov 		break;
3439ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
34403b77faa0SIlya Dryomov 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
34413b77faa0SIlya Dryomov 		if (ret <= 0)
3442ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
34433b77faa0SIlya Dryomov 						      cookie, ret);
3444ed95b21aSIlya Dryomov 		else
3445ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3446ed95b21aSIlya Dryomov 		break;
3447ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3448e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3449e627db08SAlex Elder 		if (ret)
34509584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3451b8d70035SAlex Elder 
3452ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3453ed95b21aSIlya Dryomov 		break;
3454ed95b21aSIlya Dryomov 	default:
3455ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3456ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3457ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3458ed95b21aSIlya Dryomov 		else
3459ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3460ed95b21aSIlya Dryomov 		break;
3461b8d70035SAlex Elder 	}
3462b8d70035SAlex Elder }
3463b8d70035SAlex Elder 
346499d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
34659969ebc5SAlex Elder 
3466922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3467bb040aa0SIlya Dryomov {
3468922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3469bb040aa0SIlya Dryomov 
3470922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3471bb040aa0SIlya Dryomov 
3472ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3473ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3474ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3475bb040aa0SIlya Dryomov 
347699d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
347799d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
347899d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
347999d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3480bb040aa0SIlya Dryomov 
348199d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3482bb040aa0SIlya Dryomov 	}
348399d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3484bb040aa0SIlya Dryomov }
3485bb040aa0SIlya Dryomov 
3486bb040aa0SIlya Dryomov /*
348799d16943SIlya Dryomov  * watch_mutex must be locked
34889969ebc5SAlex Elder  */
348999d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
34909969ebc5SAlex Elder {
34919969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3492922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
34939969ebc5SAlex Elder 
3494922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
349599d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
34969969ebc5SAlex Elder 
3497922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3498922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3499922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3500922dab61SIlya Dryomov 	if (IS_ERR(handle))
3501922dab61SIlya Dryomov 		return PTR_ERR(handle);
35029969ebc5SAlex Elder 
3503922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
35048eb87565SAlex Elder 	return 0;
35059969ebc5SAlex Elder }
35069969ebc5SAlex Elder 
350799d16943SIlya Dryomov /*
350899d16943SIlya Dryomov  * watch_mutex must be locked
350999d16943SIlya Dryomov  */
351099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3511fca27065SIlya Dryomov {
3512922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3513922dab61SIlya Dryomov 	int ret;
3514b30a01f2SIlya Dryomov 
351599d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
351699d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3517b30a01f2SIlya Dryomov 
3518922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3519922dab61SIlya Dryomov 	if (ret)
3520922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3521b30a01f2SIlya Dryomov 
3522922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3523c525f036SIlya Dryomov }
3524c525f036SIlya Dryomov 
352599d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3526c525f036SIlya Dryomov {
352799d16943SIlya Dryomov 	int ret;
3528811c6688SIlya Dryomov 
352999d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
353099d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
353199d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
353299d16943SIlya Dryomov 	if (ret)
353399d16943SIlya Dryomov 		goto out;
353499d16943SIlya Dryomov 
353599d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
353699d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
353799d16943SIlya Dryomov 
353899d16943SIlya Dryomov out:
353999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
354099d16943SIlya Dryomov 	return ret;
354199d16943SIlya Dryomov }
354299d16943SIlya Dryomov 
354399d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
354499d16943SIlya Dryomov {
354599d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
354699d16943SIlya Dryomov 
3547ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3548ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3549ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3550ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
355199d16943SIlya Dryomov }
355299d16943SIlya Dryomov 
355399d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
355499d16943SIlya Dryomov {
3555ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
355699d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
355799d16943SIlya Dryomov 
355899d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
355999d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
356099d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
356199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
356299d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
356399d16943SIlya Dryomov 
356423edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3565811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3566fca27065SIlya Dryomov }
3567fca27065SIlya Dryomov 
356814bb211dSIlya Dryomov /*
356914bb211dSIlya Dryomov  * lock_rwsem must be held for write
357014bb211dSIlya Dryomov  */
357114bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
357214bb211dSIlya Dryomov {
357314bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
357414bb211dSIlya Dryomov 	char cookie[32];
357514bb211dSIlya Dryomov 	int ret;
357614bb211dSIlya Dryomov 
357714bb211dSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
357814bb211dSIlya Dryomov 
357914bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
358014bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
358114bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
358214bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
358314bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
358414bb211dSIlya Dryomov 	if (ret) {
358514bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
358614bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
358714bb211dSIlya Dryomov 				 ret);
358814bb211dSIlya Dryomov 
358914bb211dSIlya Dryomov 		/*
359014bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
359114bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
359214bb211dSIlya Dryomov 		 */
359314bb211dSIlya Dryomov 		if (rbd_release_lock(rbd_dev))
359414bb211dSIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
359514bb211dSIlya Dryomov 					   &rbd_dev->lock_dwork, 0);
359614bb211dSIlya Dryomov 	} else {
3597edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
359814bb211dSIlya Dryomov 	}
359914bb211dSIlya Dryomov }
360014bb211dSIlya Dryomov 
360199d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
360299d16943SIlya Dryomov {
360399d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
360499d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
360599d16943SIlya Dryomov 	int ret;
360699d16943SIlya Dryomov 
360799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
360899d16943SIlya Dryomov 
360999d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
361087c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
361187c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
361214bb211dSIlya Dryomov 		return;
361387c0fdedSIlya Dryomov 	}
361499d16943SIlya Dryomov 
361599d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
361699d16943SIlya Dryomov 	if (ret) {
361799d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
36184d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
361987c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
362014bb211dSIlya Dryomov 			wake_requests(rbd_dev, true);
362187c0fdedSIlya Dryomov 		} else {
362299d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
362399d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
362499d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
362587c0fdedSIlya Dryomov 		}
362687c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
362714bb211dSIlya Dryomov 		return;
362899d16943SIlya Dryomov 	}
362999d16943SIlya Dryomov 
363099d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
363199d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
363299d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
363399d16943SIlya Dryomov 
363414bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
363514bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
363614bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
363714bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
363814bb211dSIlya Dryomov 
363999d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
364099d16943SIlya Dryomov 	if (ret)
3641f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
364299d16943SIlya Dryomov }
364399d16943SIlya Dryomov 
364436be9a76SAlex Elder /*
3645f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3646f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
364736be9a76SAlex Elder  */
364836be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3649ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3650ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
365136be9a76SAlex Elder 			     const char *method_name,
36524157976bSAlex Elder 			     const void *outbound,
365336be9a76SAlex Elder 			     size_t outbound_size,
36544157976bSAlex Elder 			     void *inbound,
3655e2a58ee5SAlex Elder 			     size_t inbound_size)
365636be9a76SAlex Elder {
3657ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3658ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3659ecd4a68aSIlya Dryomov 	struct page *reply_page;
366036be9a76SAlex Elder 	int ret;
366136be9a76SAlex Elder 
366236be9a76SAlex Elder 	/*
36636010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
36646010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
36656010a451SAlex Elder 	 * also supply outbound data--parameters for the object
36666010a451SAlex Elder 	 * method.  Currently if this is present it will be a
36676010a451SAlex Elder 	 * snapshot id.
366836be9a76SAlex Elder 	 */
3669ecd4a68aSIlya Dryomov 	if (outbound) {
3670ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3671ecd4a68aSIlya Dryomov 			return -E2BIG;
367236be9a76SAlex Elder 
3673ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3674ecd4a68aSIlya Dryomov 		if (!req_page)
3675ecd4a68aSIlya Dryomov 			return -ENOMEM;
367636be9a76SAlex Elder 
3677ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
367804017e29SAlex Elder 	}
3679430c28c3SAlex Elder 
3680ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3681ecd4a68aSIlya Dryomov 	if (!reply_page) {
3682ecd4a68aSIlya Dryomov 		if (req_page)
3683ecd4a68aSIlya Dryomov 			__free_page(req_page);
3684ecd4a68aSIlya Dryomov 		return -ENOMEM;
3685ecd4a68aSIlya Dryomov 	}
368636be9a76SAlex Elder 
3687ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3688ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3689ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3690ecd4a68aSIlya Dryomov 	if (!ret) {
3691ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3692ecd4a68aSIlya Dryomov 		ret = inbound_size;
3693ecd4a68aSIlya Dryomov 	}
369457385b51SAlex Elder 
3695ecd4a68aSIlya Dryomov 	if (req_page)
3696ecd4a68aSIlya Dryomov 		__free_page(req_page);
3697ecd4a68aSIlya Dryomov 	__free_page(reply_page);
369836be9a76SAlex Elder 	return ret;
369936be9a76SAlex Elder }
370036be9a76SAlex Elder 
3701ed95b21aSIlya Dryomov /*
3702ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
3703ed95b21aSIlya Dryomov  */
37042f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
3705ed95b21aSIlya Dryomov {
3706ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
370734f55d0bSDongsheng Yang 	unsigned long timeout;
37082f18d466SIlya Dryomov 	int ret = 0;
37092f18d466SIlya Dryomov 
37102f18d466SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
37112f18d466SIlya Dryomov 		return -EBLACKLISTED;
37122f18d466SIlya Dryomov 
37132f18d466SIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
37142f18d466SIlya Dryomov 		return 0;
37152f18d466SIlya Dryomov 
37162f18d466SIlya Dryomov 	if (!may_acquire) {
37172f18d466SIlya Dryomov 		rbd_warn(rbd_dev, "exclusive lock required");
37182f18d466SIlya Dryomov 		return -EROFS;
37192f18d466SIlya Dryomov 	}
3720ed95b21aSIlya Dryomov 
3721ed95b21aSIlya Dryomov 	do {
3722ed95b21aSIlya Dryomov 		/*
3723ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3724ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
3725ed95b21aSIlya Dryomov 		 */
3726ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3727ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3728ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3729ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
3730ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
373134f55d0bSDongsheng Yang 		timeout = schedule_timeout(ceph_timeout_jiffies(
373234f55d0bSDongsheng Yang 						rbd_dev->opts->lock_timeout));
3733ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
37342f18d466SIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
37352f18d466SIlya Dryomov 			ret = -EBLACKLISTED;
37362f18d466SIlya Dryomov 			break;
37372f18d466SIlya Dryomov 		}
373834f55d0bSDongsheng Yang 		if (!timeout) {
373934f55d0bSDongsheng Yang 			rbd_warn(rbd_dev, "timed out waiting for lock");
374034f55d0bSDongsheng Yang 			ret = -ETIMEDOUT;
374134f55d0bSDongsheng Yang 			break;
374234f55d0bSDongsheng Yang 		}
37432f18d466SIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
374487c0fdedSIlya Dryomov 
3745ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
37462f18d466SIlya Dryomov 	return ret;
3747ed95b21aSIlya Dryomov }
3748ed95b21aSIlya Dryomov 
37497ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3750bc1ecc65SIlya Dryomov {
37517ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
37527ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3753bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
37544e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3755bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3756bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
37576d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
37584e752f0aSJosh Durgin 	u64 mapping_size;
375980de1912SIlya Dryomov 	bool must_be_locked;
3760bc1ecc65SIlya Dryomov 	int result;
3761bc1ecc65SIlya Dryomov 
3762aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
3763aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
3764aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
3765aebf526bSChristoph Hellwig 		break;
37666484cbe9SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
37676484cbe9SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
37686484cbe9SIlya Dryomov 		break;
3769aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
3770aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
3771aebf526bSChristoph Hellwig 		break;
3772aebf526bSChristoph Hellwig 	case REQ_OP_READ:
3773aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
3774aebf526bSChristoph Hellwig 		break;
3775aebf526bSChristoph Hellwig 	default:
3776aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
37777ad18afaSChristoph Hellwig 		result = -EIO;
37787ad18afaSChristoph Hellwig 		goto err;
37797ad18afaSChristoph Hellwig 	}
37807ad18afaSChristoph Hellwig 
3781bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3782bc1ecc65SIlya Dryomov 
3783bc1ecc65SIlya Dryomov 	if (!length) {
3784bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3785bc1ecc65SIlya Dryomov 		result = 0;
3786bc1ecc65SIlya Dryomov 		goto err_rq;
3787bc1ecc65SIlya Dryomov 	}
3788bc1ecc65SIlya Dryomov 
37899568c93eSIlya Dryomov 	rbd_assert(op_type == OBJ_OP_READ ||
37909568c93eSIlya Dryomov 		   rbd_dev->spec->snap_id == CEPH_NOSNAP);
3791bc1ecc65SIlya Dryomov 
3792bc1ecc65SIlya Dryomov 	/*
3793bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3794bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3795bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3796bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3797bc1ecc65SIlya Dryomov 	 */
3798bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3799bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3800bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3801bc1ecc65SIlya Dryomov 		result = -ENXIO;
3802bc1ecc65SIlya Dryomov 		goto err_rq;
3803bc1ecc65SIlya Dryomov 	}
3804bc1ecc65SIlya Dryomov 
3805bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3806bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3807bc1ecc65SIlya Dryomov 			 length);
3808bc1ecc65SIlya Dryomov 		result = -EINVAL;
3809bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3810bc1ecc65SIlya Dryomov 	}
3811bc1ecc65SIlya Dryomov 
38127ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
38137ad18afaSChristoph Hellwig 
38144e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
38154e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
38166d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
38174e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
38184e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
38194e752f0aSJosh Durgin 	}
38204e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
38214e752f0aSJosh Durgin 
38224e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3823bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
38244e752f0aSJosh Durgin 			 length, mapping_size);
3825bc1ecc65SIlya Dryomov 		result = -EIO;
3826bc1ecc65SIlya Dryomov 		goto err_rq;
3827bc1ecc65SIlya Dryomov 	}
3828bc1ecc65SIlya Dryomov 
3829f9bebd58SIlya Dryomov 	must_be_locked =
3830f9bebd58SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3831f9bebd58SIlya Dryomov 	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
3832ed95b21aSIlya Dryomov 	if (must_be_locked) {
3833ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
38342f18d466SIlya Dryomov 		result = rbd_wait_state_locked(rbd_dev,
38352f18d466SIlya Dryomov 					       !rbd_dev->opts->exclusive);
38362f18d466SIlya Dryomov 		if (result)
3837e010dd0aSIlya Dryomov 			goto err_unlock;
3838e010dd0aSIlya Dryomov 	}
3839ed95b21aSIlya Dryomov 
3840dfd9875fSIlya Dryomov 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
3841bc1ecc65SIlya Dryomov 	if (!img_request) {
3842bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3843ed95b21aSIlya Dryomov 		goto err_unlock;
3844bc1ecc65SIlya Dryomov 	}
3845bc1ecc65SIlya Dryomov 	img_request->rq = rq;
384670b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
3847bc1ecc65SIlya Dryomov 
38486484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
38495a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
385090e98c52SGuangliang Zhao 	else
38515a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
385290e98c52SGuangliang Zhao 					       rq->bio);
38530c93e1b7SIlya Dryomov 	if (result || !img_request->pending_count)
3854bc1ecc65SIlya Dryomov 		goto err_img_request;
3855bc1ecc65SIlya Dryomov 
3856efbd1a11SIlya Dryomov 	rbd_img_request_submit(img_request);
3857ed95b21aSIlya Dryomov 	if (must_be_locked)
3858ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3859bc1ecc65SIlya Dryomov 	return;
3860bc1ecc65SIlya Dryomov 
3861bc1ecc65SIlya Dryomov err_img_request:
3862bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3863ed95b21aSIlya Dryomov err_unlock:
3864ed95b21aSIlya Dryomov 	if (must_be_locked)
3865ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3866bc1ecc65SIlya Dryomov err_rq:
3867bc1ecc65SIlya Dryomov 	if (result)
3868bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
38696d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
38704e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
38717ad18afaSChristoph Hellwig err:
38722a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
3873bc1ecc65SIlya Dryomov }
3874bc1ecc65SIlya Dryomov 
3875fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
38767ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3877bc1ecc65SIlya Dryomov {
38787ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
38797ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3880bc1ecc65SIlya Dryomov 
38817ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
3882fc17b653SChristoph Hellwig 	return BLK_STS_OK;
3883bf0d5f50SAlex Elder }
3884bf0d5f50SAlex Elder 
3885602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3886602adf40SYehuda Sadeh {
38875769ed0cSIlya Dryomov 	blk_cleanup_queue(rbd_dev->disk->queue);
38887ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
38895769ed0cSIlya Dryomov 	put_disk(rbd_dev->disk);
38905769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
3891602adf40SYehuda Sadeh }
3892602adf40SYehuda Sadeh 
3893788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3894fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
3895fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
3896fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
3897788e2df3SAlex Elder 
3898788e2df3SAlex Elder {
3899fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3900fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
3901fe5478e0SIlya Dryomov 	struct page **pages;
3902fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
3903788e2df3SAlex Elder 	int ret;
3904788e2df3SAlex Elder 
3905fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3906fe5478e0SIlya Dryomov 	if (!req)
3907fe5478e0SIlya Dryomov 		return -ENOMEM;
3908788e2df3SAlex Elder 
3909fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
3910fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
3911fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
3912788e2df3SAlex Elder 
3913fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3914fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
3915fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
3916fe5478e0SIlya Dryomov 		goto out_req;
3917fe5478e0SIlya Dryomov 	}
39181ceae7efSAlex Elder 
3919fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3920fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3921fe5478e0SIlya Dryomov 					 true);
3922788e2df3SAlex Elder 
392326f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
392426f887e0SIlya Dryomov 	if (ret)
392526f887e0SIlya Dryomov 		goto out_req;
392626f887e0SIlya Dryomov 
3927fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
3928fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
3929fe5478e0SIlya Dryomov 	if (ret >= 0)
3930fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
3931fe5478e0SIlya Dryomov 
3932fe5478e0SIlya Dryomov out_req:
3933fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
3934788e2df3SAlex Elder 	return ret;
3935788e2df3SAlex Elder }
3936788e2df3SAlex Elder 
3937602adf40SYehuda Sadeh /*
3938662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3939662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3940662518b1SAlex Elder  * information about the image.
39414156d998SAlex Elder  */
394299a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
39434156d998SAlex Elder {
39444156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
39454156d998SAlex Elder 	u32 snap_count = 0;
39464156d998SAlex Elder 	u64 names_size = 0;
39474156d998SAlex Elder 	u32 want_count;
39484156d998SAlex Elder 	int ret;
39494156d998SAlex Elder 
39504156d998SAlex Elder 	/*
39514156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
39524156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
39534156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
39544156d998SAlex Elder 	 * the number of snapshots could change by the time we read
39554156d998SAlex Elder 	 * it in, in which case we re-read it.
39564156d998SAlex Elder 	 */
39574156d998SAlex Elder 	do {
39584156d998SAlex Elder 		size_t size;
39594156d998SAlex Elder 
39604156d998SAlex Elder 		kfree(ondisk);
39614156d998SAlex Elder 
39624156d998SAlex Elder 		size = sizeof (*ondisk);
39634156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
39644156d998SAlex Elder 		size += names_size;
39654156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
39664156d998SAlex Elder 		if (!ondisk)
3967662518b1SAlex Elder 			return -ENOMEM;
39684156d998SAlex Elder 
3969fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3970fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
39714156d998SAlex Elder 		if (ret < 0)
3972662518b1SAlex Elder 			goto out;
3973c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
39744156d998SAlex Elder 			ret = -ENXIO;
397506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
397606ecc6cbSAlex Elder 				size, ret);
3977662518b1SAlex Elder 			goto out;
39784156d998SAlex Elder 		}
39794156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
39804156d998SAlex Elder 			ret = -ENXIO;
398106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3982662518b1SAlex Elder 			goto out;
39834156d998SAlex Elder 		}
39844156d998SAlex Elder 
39854156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
39864156d998SAlex Elder 		want_count = snap_count;
39874156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
39884156d998SAlex Elder 	} while (snap_count != want_count);
39894156d998SAlex Elder 
3990662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3991662518b1SAlex Elder out:
39924156d998SAlex Elder 	kfree(ondisk);
39934156d998SAlex Elder 
3994dfc5606dSYehuda Sadeh 	return ret;
3995602adf40SYehuda Sadeh }
3996602adf40SYehuda Sadeh 
399715228edeSAlex Elder /*
399815228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
399915228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
400015228edeSAlex Elder  */
400115228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
400215228edeSAlex Elder {
400315228edeSAlex Elder 	u64 snap_id;
400415228edeSAlex Elder 
400515228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
400615228edeSAlex Elder 		return;
400715228edeSAlex Elder 
400815228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
400915228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
401015228edeSAlex Elder 		return;
401115228edeSAlex Elder 
401215228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
401315228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
401415228edeSAlex Elder }
401515228edeSAlex Elder 
40169875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
40179875201eSJosh Durgin {
40189875201eSJosh Durgin 	sector_t size;
40199875201eSJosh Durgin 
40209875201eSJosh Durgin 	/*
4021811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4022811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4023811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
40249875201eSJosh Durgin 	 */
4025811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4026811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
40279875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
40289875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
40299875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
40309875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
40319875201eSJosh Durgin 	}
40329875201eSJosh Durgin }
40339875201eSJosh Durgin 
4034cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
40351fe5e993SAlex Elder {
4036e627db08SAlex Elder 	u64 mapping_size;
40371fe5e993SAlex Elder 	int ret;
40381fe5e993SAlex Elder 
4039cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
40403b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4041a720ae09SIlya Dryomov 
4042a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
404352bb1f9bSIlya Dryomov 	if (ret)
404473e39e4dSIlya Dryomov 		goto out;
404515228edeSAlex Elder 
4046e8f59b59SIlya Dryomov 	/*
4047e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4048e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4049e8f59b59SIlya Dryomov 	 */
4050e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4051e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4052e8f59b59SIlya Dryomov 		if (ret)
405373e39e4dSIlya Dryomov 			goto out;
4054e8f59b59SIlya Dryomov 	}
4055e8f59b59SIlya Dryomov 
40565ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
40575ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
40585ff1108cSIlya Dryomov 	} else {
40595ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
406015228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
40615ff1108cSIlya Dryomov 	}
40625ff1108cSIlya Dryomov 
406373e39e4dSIlya Dryomov out:
4064cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
406573e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
40669875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
40671fe5e993SAlex Elder 
406873e39e4dSIlya Dryomov 	return ret;
40691fe5e993SAlex Elder }
40701fe5e993SAlex Elder 
4071d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4072d6296d39SChristoph Hellwig 		unsigned int hctx_idx, unsigned int numa_node)
40737ad18afaSChristoph Hellwig {
40747ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
40757ad18afaSChristoph Hellwig 
40767ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
40777ad18afaSChristoph Hellwig 	return 0;
40787ad18afaSChristoph Hellwig }
40797ad18afaSChristoph Hellwig 
4080f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
40817ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
40827ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
40837ad18afaSChristoph Hellwig };
40847ad18afaSChristoph Hellwig 
4085602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4086602adf40SYehuda Sadeh {
4087602adf40SYehuda Sadeh 	struct gendisk *disk;
4088602adf40SYehuda Sadeh 	struct request_queue *q;
4089420efbdfSIlya Dryomov 	unsigned int objset_bytes =
4090420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
40917ad18afaSChristoph Hellwig 	int err;
4092602adf40SYehuda Sadeh 
4093602adf40SYehuda Sadeh 	/* create gendisk info */
40947e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
40957e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
40967e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4097602adf40SYehuda Sadeh 	if (!disk)
40981fcdb8aaSAlex Elder 		return -ENOMEM;
4099602adf40SYehuda Sadeh 
4100f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4101de71a297SAlex Elder 		 rbd_dev->dev_id);
4102602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4103dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
41047e513d43SIlya Dryomov 	if (single_major)
41057e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4106602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4107602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4108602adf40SYehuda Sadeh 
41097ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
41107ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4111b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
41127ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4113b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
41147ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
41157ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
41167ad18afaSChristoph Hellwig 
41177ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
41187ad18afaSChristoph Hellwig 	if (err)
4119602adf40SYehuda Sadeh 		goto out_disk;
4120029bcbd8SJosh Durgin 
41217ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
41227ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
41237ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
41247ad18afaSChristoph Hellwig 		goto out_tag_set;
41257ad18afaSChristoph Hellwig 	}
41267ad18afaSChristoph Hellwig 
41278b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4128d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4129593a9e7bSAlex Elder 
4130420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
41310d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
413221acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
413324f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
4134420efbdfSIlya Dryomov 	blk_queue_io_min(q, objset_bytes);
4135420efbdfSIlya Dryomov 	blk_queue_io_opt(q, objset_bytes);
4136029bcbd8SJosh Durgin 
4137d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
41388b904b5bSBart Van Assche 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4139420efbdfSIlya Dryomov 		q->limits.discard_granularity = objset_bytes;
4140420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4141420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4142d9360540SIlya Dryomov 	}
414390e98c52SGuangliang Zhao 
4144bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4145dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
4146bae818eeSRonny Hegewald 
41475769ed0cSIlya Dryomov 	/*
41485769ed0cSIlya Dryomov 	 * disk_release() expects a queue ref from add_disk() and will
41495769ed0cSIlya Dryomov 	 * put it.  Hold an extra ref until add_disk() is called.
41505769ed0cSIlya Dryomov 	 */
41515769ed0cSIlya Dryomov 	WARN_ON(!blk_get_queue(q));
4152602adf40SYehuda Sadeh 	disk->queue = q;
4153602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4154602adf40SYehuda Sadeh 
4155602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4156602adf40SYehuda Sadeh 
4157602adf40SYehuda Sadeh 	return 0;
41587ad18afaSChristoph Hellwig out_tag_set:
41597ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4160602adf40SYehuda Sadeh out_disk:
4161602adf40SYehuda Sadeh 	put_disk(disk);
41627ad18afaSChristoph Hellwig 	return err;
4163602adf40SYehuda Sadeh }
4164602adf40SYehuda Sadeh 
4165dfc5606dSYehuda Sadeh /*
4166dfc5606dSYehuda Sadeh   sysfs
4167dfc5606dSYehuda Sadeh */
4168602adf40SYehuda Sadeh 
4169593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4170593a9e7bSAlex Elder {
4171593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4172593a9e7bSAlex Elder }
4173593a9e7bSAlex Elder 
4174dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4175dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4176602adf40SYehuda Sadeh {
4177593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4178dfc5606dSYehuda Sadeh 
4179fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4180fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4181602adf40SYehuda Sadeh }
4182602adf40SYehuda Sadeh 
418334b13184SAlex Elder /*
418434b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
418534b13184SAlex Elder  * necessarily the base image.
418634b13184SAlex Elder  */
418734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
418834b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
418934b13184SAlex Elder {
419034b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
419134b13184SAlex Elder 
419234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
419334b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
419434b13184SAlex Elder }
419534b13184SAlex Elder 
4196dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4197dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4198602adf40SYehuda Sadeh {
4199593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4200dfc5606dSYehuda Sadeh 
4201fc71d833SAlex Elder 	if (rbd_dev->major)
4202dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4203fc71d833SAlex Elder 
4204fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4205dd82fff1SIlya Dryomov }
4206fc71d833SAlex Elder 
4207dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4208dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4209dd82fff1SIlya Dryomov {
4210dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4211dd82fff1SIlya Dryomov 
4212dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4213dfc5606dSYehuda Sadeh }
4214dfc5606dSYehuda Sadeh 
4215005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4216005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4217005a07bfSIlya Dryomov {
4218005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4219005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4220005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4221005a07bfSIlya Dryomov 
4222005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4223005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4224005a07bfSIlya Dryomov }
4225005a07bfSIlya Dryomov 
4226dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4227dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4228dfc5606dSYehuda Sadeh {
4229593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4230dfc5606dSYehuda Sadeh 
42311dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4232033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4233dfc5606dSYehuda Sadeh }
4234dfc5606dSYehuda Sadeh 
4235267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4236267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4237267fb90bSMike Christie {
4238267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4239267fb90bSMike Christie 
4240267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4241267fb90bSMike Christie }
4242267fb90bSMike Christie 
42430d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
42440d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
42450d6d1e9cSMike Christie {
42460d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
42470d6d1e9cSMike Christie 
42480d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4249dfc5606dSYehuda Sadeh }
4250dfc5606dSYehuda Sadeh 
4251dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4252dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4253dfc5606dSYehuda Sadeh {
4254593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4255dfc5606dSYehuda Sadeh 
42560d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4257dfc5606dSYehuda Sadeh }
4258dfc5606dSYehuda Sadeh 
42599bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
42609bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
42619bb2f334SAlex Elder {
42629bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
42639bb2f334SAlex Elder 
42640d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
42650d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
42669bb2f334SAlex Elder }
42679bb2f334SAlex Elder 
4268b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
4269b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
4270b26c047bSIlya Dryomov {
4271b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4272b26c047bSIlya Dryomov 
4273b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4274b26c047bSIlya Dryomov }
4275b26c047bSIlya Dryomov 
4276dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4277dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4278dfc5606dSYehuda Sadeh {
4279593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4280dfc5606dSYehuda Sadeh 
4281a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
42820d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4283a92ffdf8SAlex Elder 
4284a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4285dfc5606dSYehuda Sadeh }
4286dfc5606dSYehuda Sadeh 
4287589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4288589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4289589d30e0SAlex Elder {
4290589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4291589d30e0SAlex Elder 
42920d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4293589d30e0SAlex Elder }
4294589d30e0SAlex Elder 
429534b13184SAlex Elder /*
429634b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
429734b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
429834b13184SAlex Elder  */
4299dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4300dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4301dfc5606dSYehuda Sadeh 			     char *buf)
4302dfc5606dSYehuda Sadeh {
4303593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4304dfc5606dSYehuda Sadeh 
43050d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4306dfc5606dSYehuda Sadeh }
4307dfc5606dSYehuda Sadeh 
430892a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
430992a58671SMike Christie 				struct device_attribute *attr, char *buf)
431092a58671SMike Christie {
431192a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
431292a58671SMike Christie 
431392a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
431492a58671SMike Christie }
431592a58671SMike Christie 
431686b00e0dSAlex Elder /*
4317ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4318ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4319ff96128fSIlya Dryomov  * image)".
432086b00e0dSAlex Elder  */
432186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
432286b00e0dSAlex Elder 			       struct device_attribute *attr,
432386b00e0dSAlex Elder 			       char *buf)
432486b00e0dSAlex Elder {
432586b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4326ff96128fSIlya Dryomov 	ssize_t count = 0;
432786b00e0dSAlex Elder 
4328ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
432986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
433086b00e0dSAlex Elder 
4331ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4332ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
433386b00e0dSAlex Elder 
4334ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4335ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4336e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
4337ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4338ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4339ff96128fSIlya Dryomov 			    "overlap %llu\n",
4340ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4341ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4342e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
4343ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4344ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4345ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4346ff96128fSIlya Dryomov 	}
434786b00e0dSAlex Elder 
434886b00e0dSAlex Elder 	return count;
434986b00e0dSAlex Elder }
435086b00e0dSAlex Elder 
4351dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4352dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4353dfc5606dSYehuda Sadeh 				 const char *buf,
4354dfc5606dSYehuda Sadeh 				 size_t size)
4355dfc5606dSYehuda Sadeh {
4356593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4357b813623aSAlex Elder 	int ret;
4358602adf40SYehuda Sadeh 
4359cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4360e627db08SAlex Elder 	if (ret)
436152bb1f9bSIlya Dryomov 		return ret;
4362b813623aSAlex Elder 
436352bb1f9bSIlya Dryomov 	return size;
4364dfc5606dSYehuda Sadeh }
4365602adf40SYehuda Sadeh 
43665657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
43675657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
43685657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
43695657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
43705657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
43715657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
43725657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
43735657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
43745657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
43755657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
4376b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
43775657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
43785657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
43795657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
43805657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
43815657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
43825657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
4383dfc5606dSYehuda Sadeh 
4384dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4385dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
438634b13184SAlex Elder 	&dev_attr_features.attr,
4387dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4388dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4389005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4390dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4391267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
43920d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4393dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
43949bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4395b26c047bSIlya Dryomov 	&dev_attr_pool_ns.attr,
4396dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4397589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4398dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
439992a58671SMike Christie 	&dev_attr_snap_id.attr,
440086b00e0dSAlex Elder 	&dev_attr_parent.attr,
4401dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4402dfc5606dSYehuda Sadeh 	NULL
4403dfc5606dSYehuda Sadeh };
4404dfc5606dSYehuda Sadeh 
4405dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4406dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4407dfc5606dSYehuda Sadeh };
4408dfc5606dSYehuda Sadeh 
4409dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4410dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4411dfc5606dSYehuda Sadeh 	NULL
4412dfc5606dSYehuda Sadeh };
4413dfc5606dSYehuda Sadeh 
44146cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4415dfc5606dSYehuda Sadeh 
4416b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
4417dfc5606dSYehuda Sadeh 	.name		= "rbd",
4418dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
44196cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4420dfc5606dSYehuda Sadeh };
4421dfc5606dSYehuda Sadeh 
44228b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
44238b8fb99cSAlex Elder {
44248b8fb99cSAlex Elder 	kref_get(&spec->kref);
44258b8fb99cSAlex Elder 
44268b8fb99cSAlex Elder 	return spec;
44278b8fb99cSAlex Elder }
44288b8fb99cSAlex Elder 
44298b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
44308b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
44318b8fb99cSAlex Elder {
44328b8fb99cSAlex Elder 	if (spec)
44338b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
44348b8fb99cSAlex Elder }
44358b8fb99cSAlex Elder 
44368b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
44378b8fb99cSAlex Elder {
44388b8fb99cSAlex Elder 	struct rbd_spec *spec;
44398b8fb99cSAlex Elder 
44408b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
44418b8fb99cSAlex Elder 	if (!spec)
44428b8fb99cSAlex Elder 		return NULL;
444304077599SIlya Dryomov 
444404077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
444504077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
44468b8fb99cSAlex Elder 	kref_init(&spec->kref);
44478b8fb99cSAlex Elder 
44488b8fb99cSAlex Elder 	return spec;
44498b8fb99cSAlex Elder }
44508b8fb99cSAlex Elder 
44518b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
44528b8fb99cSAlex Elder {
44538b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
44548b8fb99cSAlex Elder 
44558b8fb99cSAlex Elder 	kfree(spec->pool_name);
4456b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
44578b8fb99cSAlex Elder 	kfree(spec->image_id);
44588b8fb99cSAlex Elder 	kfree(spec->image_name);
44598b8fb99cSAlex Elder 	kfree(spec->snap_name);
44608b8fb99cSAlex Elder 	kfree(spec);
44618b8fb99cSAlex Elder }
44628b8fb99cSAlex Elder 
44631643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4464dd5ac32dSIlya Dryomov {
446599d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4466ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4467dd5ac32dSIlya Dryomov 
4468c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
44696b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
44700d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4471c41d13a3SIlya Dryomov 
4472dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4473dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4474dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4475dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
44761643dfa4SIlya Dryomov }
44771643dfa4SIlya Dryomov 
44781643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
44791643dfa4SIlya Dryomov {
44801643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
44811643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
44821643dfa4SIlya Dryomov 
44831643dfa4SIlya Dryomov 	if (need_put) {
44841643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
44851643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
44861643dfa4SIlya Dryomov 	}
44871643dfa4SIlya Dryomov 
44881643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4489dd5ac32dSIlya Dryomov 
4490dd5ac32dSIlya Dryomov 	/*
4491dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4492dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4493dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4494dd5ac32dSIlya Dryomov 	 */
4495dd5ac32dSIlya Dryomov 	if (need_put)
4496dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4497dd5ac32dSIlya Dryomov }
4498dd5ac32dSIlya Dryomov 
44991643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
45001643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4501c53d5893SAlex Elder {
4502c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4503c53d5893SAlex Elder 
4504c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4505c53d5893SAlex Elder 	if (!rbd_dev)
4506c53d5893SAlex Elder 		return NULL;
4507c53d5893SAlex Elder 
4508c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4509c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4510c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4511c53d5893SAlex Elder 
45127e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4513c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4514431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4515b26c047bSIlya Dryomov 	if (spec->pool_ns) {
4516b26c047bSIlya Dryomov 		WARN_ON(!*spec->pool_ns);
4517b26c047bSIlya Dryomov 		rbd_dev->header_oloc.pool_ns =
4518b26c047bSIlya Dryomov 		    ceph_find_or_create_string(spec->pool_ns,
4519b26c047bSIlya Dryomov 					       strlen(spec->pool_ns));
4520b26c047bSIlya Dryomov 	}
4521c41d13a3SIlya Dryomov 
452299d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
452399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
452499d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
452599d16943SIlya Dryomov 
4526ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4527ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4528ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4529ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4530ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4531ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4532ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4533ed95b21aSIlya Dryomov 
4534dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4535dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4536dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4537dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4538dd5ac32dSIlya Dryomov 
4539c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4540d147543dSIlya Dryomov 	rbd_dev->spec = spec;
45410903e875SAlex Elder 
45421643dfa4SIlya Dryomov 	return rbd_dev;
45431643dfa4SIlya Dryomov }
45441643dfa4SIlya Dryomov 
4545dd5ac32dSIlya Dryomov /*
45461643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4547dd5ac32dSIlya Dryomov  */
45481643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
45491643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
45501643dfa4SIlya Dryomov 					 struct rbd_options *opts)
45511643dfa4SIlya Dryomov {
45521643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
45531643dfa4SIlya Dryomov 
45541643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
45551643dfa4SIlya Dryomov 	if (!rbd_dev)
45561643dfa4SIlya Dryomov 		return NULL;
45571643dfa4SIlya Dryomov 
45581643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
45591643dfa4SIlya Dryomov 
45601643dfa4SIlya Dryomov 	/* get an id and fill in device name */
45611643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
45621643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
45631643dfa4SIlya Dryomov 					 GFP_KERNEL);
45641643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
45651643dfa4SIlya Dryomov 		goto fail_rbd_dev;
45661643dfa4SIlya Dryomov 
45671643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
45681643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
45691643dfa4SIlya Dryomov 						   rbd_dev->name);
45701643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
45711643dfa4SIlya Dryomov 		goto fail_dev_id;
45721643dfa4SIlya Dryomov 
45731643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4574dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4575dd5ac32dSIlya Dryomov 
45761643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4577c53d5893SAlex Elder 	return rbd_dev;
45781643dfa4SIlya Dryomov 
45791643dfa4SIlya Dryomov fail_dev_id:
45801643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
45811643dfa4SIlya Dryomov fail_rbd_dev:
45821643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
45831643dfa4SIlya Dryomov 	return NULL;
4584c53d5893SAlex Elder }
4585c53d5893SAlex Elder 
4586c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4587c53d5893SAlex Elder {
4588dd5ac32dSIlya Dryomov 	if (rbd_dev)
4589dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4590c53d5893SAlex Elder }
4591c53d5893SAlex Elder 
4592dfc5606dSYehuda Sadeh /*
45939d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
45949d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
45959d475de5SAlex Elder  * image.
45969d475de5SAlex Elder  */
45979d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
45989d475de5SAlex Elder 				u8 *order, u64 *snap_size)
45999d475de5SAlex Elder {
46009d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
46019d475de5SAlex Elder 	int ret;
46029d475de5SAlex Elder 	struct {
46039d475de5SAlex Elder 		u8 order;
46049d475de5SAlex Elder 		__le64 size;
46059d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
46069d475de5SAlex Elder 
4607ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4608ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
46094157976bSAlex Elder 				  &snapid, sizeof(snapid),
4610e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
461136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
46129d475de5SAlex Elder 	if (ret < 0)
46139d475de5SAlex Elder 		return ret;
461457385b51SAlex Elder 	if (ret < sizeof (size_buf))
461557385b51SAlex Elder 		return -ERANGE;
46169d475de5SAlex Elder 
4617c3545579SJosh Durgin 	if (order) {
46189d475de5SAlex Elder 		*order = size_buf.order;
4619c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4620c3545579SJosh Durgin 	}
46219d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
46229d475de5SAlex Elder 
4623c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4624c3545579SJosh Durgin 		(unsigned long long)snap_id,
46259d475de5SAlex Elder 		(unsigned long long)*snap_size);
46269d475de5SAlex Elder 
46279d475de5SAlex Elder 	return 0;
46289d475de5SAlex Elder }
46299d475de5SAlex Elder 
46309d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
46319d475de5SAlex Elder {
46329d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
46339d475de5SAlex Elder 					&rbd_dev->header.obj_order,
46349d475de5SAlex Elder 					&rbd_dev->header.image_size);
46359d475de5SAlex Elder }
46369d475de5SAlex Elder 
46371e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
46381e130199SAlex Elder {
46391e130199SAlex Elder 	void *reply_buf;
46401e130199SAlex Elder 	int ret;
46411e130199SAlex Elder 	void *p;
46421e130199SAlex Elder 
46431e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
46441e130199SAlex Elder 	if (!reply_buf)
46451e130199SAlex Elder 		return -ENOMEM;
46461e130199SAlex Elder 
4647ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4648ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4649ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
465036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
46511e130199SAlex Elder 	if (ret < 0)
46521e130199SAlex Elder 		goto out;
46531e130199SAlex Elder 
46541e130199SAlex Elder 	p = reply_buf;
46551e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
465657385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
465757385b51SAlex Elder 	ret = 0;
46581e130199SAlex Elder 
46591e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
46601e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
46611e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
46621e130199SAlex Elder 	} else {
46631e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
46641e130199SAlex Elder 	}
46651e130199SAlex Elder out:
46661e130199SAlex Elder 	kfree(reply_buf);
46671e130199SAlex Elder 
46681e130199SAlex Elder 	return ret;
46691e130199SAlex Elder }
46701e130199SAlex Elder 
4671b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4672b1b5402aSAlex Elder 		u64 *snap_features)
4673b1b5402aSAlex Elder {
4674b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4675b1b5402aSAlex Elder 	struct {
4676b1b5402aSAlex Elder 		__le64 features;
4677b1b5402aSAlex Elder 		__le64 incompat;
46784157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4679d3767f0fSIlya Dryomov 	u64 unsup;
4680b1b5402aSAlex Elder 	int ret;
4681b1b5402aSAlex Elder 
4682ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4683ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
46844157976bSAlex Elder 				  &snapid, sizeof(snapid),
4685e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
468636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4687b1b5402aSAlex Elder 	if (ret < 0)
4688b1b5402aSAlex Elder 		return ret;
468957385b51SAlex Elder 	if (ret < sizeof (features_buf))
469057385b51SAlex Elder 		return -ERANGE;
4691d889140cSAlex Elder 
4692d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4693d3767f0fSIlya Dryomov 	if (unsup) {
4694d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4695d3767f0fSIlya Dryomov 			 unsup);
4696b8f5c6edSAlex Elder 		return -ENXIO;
4697d3767f0fSIlya Dryomov 	}
4698d889140cSAlex Elder 
4699b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4700b1b5402aSAlex Elder 
4701b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4702b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4703b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4704b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4705b1b5402aSAlex Elder 
4706b1b5402aSAlex Elder 	return 0;
4707b1b5402aSAlex Elder }
4708b1b5402aSAlex Elder 
4709b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4710b1b5402aSAlex Elder {
4711b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4712b1b5402aSAlex Elder 						&rbd_dev->header.features);
4713b1b5402aSAlex Elder }
4714b1b5402aSAlex Elder 
4715eb3b2d6bSIlya Dryomov struct parent_image_info {
4716eb3b2d6bSIlya Dryomov 	u64		pool_id;
4717e92c0eafSIlya Dryomov 	const char	*pool_ns;
4718eb3b2d6bSIlya Dryomov 	const char	*image_id;
4719eb3b2d6bSIlya Dryomov 	u64		snap_id;
4720eb3b2d6bSIlya Dryomov 
4721e92c0eafSIlya Dryomov 	bool		has_overlap;
4722eb3b2d6bSIlya Dryomov 	u64		overlap;
4723eb3b2d6bSIlya Dryomov };
4724eb3b2d6bSIlya Dryomov 
4725eb3b2d6bSIlya Dryomov /*
4726eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
4727eb3b2d6bSIlya Dryomov  */
4728e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
4729e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
4730e92c0eafSIlya Dryomov {
4731e92c0eafSIlya Dryomov 	u8 struct_v;
4732e92c0eafSIlya Dryomov 	u32 struct_len;
4733e92c0eafSIlya Dryomov 	int ret;
4734e92c0eafSIlya Dryomov 
4735e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4736e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
4737e92c0eafSIlya Dryomov 	if (ret)
4738e92c0eafSIlya Dryomov 		return ret;
4739e92c0eafSIlya Dryomov 
4740e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4741e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4742e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
4743e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
4744e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
4745e92c0eafSIlya Dryomov 		return ret;
4746e92c0eafSIlya Dryomov 	}
4747e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4748e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4749e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4750e92c0eafSIlya Dryomov 		pii->image_id = NULL;
4751e92c0eafSIlya Dryomov 		return ret;
4752e92c0eafSIlya Dryomov 	}
4753e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4754e92c0eafSIlya Dryomov 	return 0;
4755e92c0eafSIlya Dryomov 
4756e92c0eafSIlya Dryomov e_inval:
4757e92c0eafSIlya Dryomov 	return -EINVAL;
4758e92c0eafSIlya Dryomov }
4759e92c0eafSIlya Dryomov 
4760e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
4761e92c0eafSIlya Dryomov 			     struct page *req_page,
4762e92c0eafSIlya Dryomov 			     struct page *reply_page,
4763e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
4764e92c0eafSIlya Dryomov {
4765e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4766e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4767e92c0eafSIlya Dryomov 	void *p, *end;
4768e92c0eafSIlya Dryomov 	int ret;
4769e92c0eafSIlya Dryomov 
4770e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4771e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4772e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4773e92c0eafSIlya Dryomov 	if (ret)
4774e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
4775e92c0eafSIlya Dryomov 
4776e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4777e92c0eafSIlya Dryomov 	end = p + reply_len;
4778e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
4779e92c0eafSIlya Dryomov 	if (ret)
4780e92c0eafSIlya Dryomov 		return ret;
4781e92c0eafSIlya Dryomov 
4782e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4783e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4784e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4785e92c0eafSIlya Dryomov 	if (ret)
4786e92c0eafSIlya Dryomov 		return ret;
4787e92c0eafSIlya Dryomov 
4788e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4789e92c0eafSIlya Dryomov 	end = p + reply_len;
4790e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4791e92c0eafSIlya Dryomov 	if (pii->has_overlap)
4792e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4793e92c0eafSIlya Dryomov 
4794e92c0eafSIlya Dryomov 	return 0;
4795e92c0eafSIlya Dryomov 
4796e92c0eafSIlya Dryomov e_inval:
4797e92c0eafSIlya Dryomov 	return -EINVAL;
4798e92c0eafSIlya Dryomov }
4799e92c0eafSIlya Dryomov 
4800e92c0eafSIlya Dryomov /*
4801e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
4802e92c0eafSIlya Dryomov  */
4803eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4804eb3b2d6bSIlya Dryomov 				    struct page *req_page,
4805eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
4806eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
4807eb3b2d6bSIlya Dryomov {
4808eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4809eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4810eb3b2d6bSIlya Dryomov 	void *p, *end;
4811eb3b2d6bSIlya Dryomov 	int ret;
4812eb3b2d6bSIlya Dryomov 
4813eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4814eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4815eb3b2d6bSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4816eb3b2d6bSIlya Dryomov 	if (ret)
4817eb3b2d6bSIlya Dryomov 		return ret;
4818eb3b2d6bSIlya Dryomov 
4819eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
4820eb3b2d6bSIlya Dryomov 	end = p + reply_len;
4821eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4822eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4823eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4824eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4825eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
4826eb3b2d6bSIlya Dryomov 		return ret;
4827eb3b2d6bSIlya Dryomov 	}
4828eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
4829e92c0eafSIlya Dryomov 	pii->has_overlap = true;
4830eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4831eb3b2d6bSIlya Dryomov 
4832eb3b2d6bSIlya Dryomov 	return 0;
4833eb3b2d6bSIlya Dryomov 
4834eb3b2d6bSIlya Dryomov e_inval:
4835eb3b2d6bSIlya Dryomov 	return -EINVAL;
4836eb3b2d6bSIlya Dryomov }
4837eb3b2d6bSIlya Dryomov 
4838eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev,
4839eb3b2d6bSIlya Dryomov 			   struct parent_image_info *pii)
4840eb3b2d6bSIlya Dryomov {
4841eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
4842eb3b2d6bSIlya Dryomov 	void *p;
4843eb3b2d6bSIlya Dryomov 	int ret;
4844eb3b2d6bSIlya Dryomov 
4845eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
4846eb3b2d6bSIlya Dryomov 	if (!req_page)
4847eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4848eb3b2d6bSIlya Dryomov 
4849eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4850eb3b2d6bSIlya Dryomov 	if (!reply_page) {
4851eb3b2d6bSIlya Dryomov 		__free_page(req_page);
4852eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4853eb3b2d6bSIlya Dryomov 	}
4854eb3b2d6bSIlya Dryomov 
4855eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
4856eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
4857e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4858e92c0eafSIlya Dryomov 	if (ret > 0)
4859e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4860e92c0eafSIlya Dryomov 					       pii);
4861eb3b2d6bSIlya Dryomov 
4862eb3b2d6bSIlya Dryomov 	__free_page(req_page);
4863eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
4864eb3b2d6bSIlya Dryomov 	return ret;
4865eb3b2d6bSIlya Dryomov }
4866eb3b2d6bSIlya Dryomov 
486786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
486886b00e0dSAlex Elder {
486986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
4870eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
487186b00e0dSAlex Elder 	int ret;
487286b00e0dSAlex Elder 
487386b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
487486b00e0dSAlex Elder 	if (!parent_spec)
487586b00e0dSAlex Elder 		return -ENOMEM;
487686b00e0dSAlex Elder 
4877eb3b2d6bSIlya Dryomov 	ret = get_parent_info(rbd_dev, &pii);
4878eb3b2d6bSIlya Dryomov 	if (ret)
487986b00e0dSAlex Elder 		goto out_err;
488086b00e0dSAlex Elder 
4881e92c0eafSIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4882e92c0eafSIlya Dryomov 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4883e92c0eafSIlya Dryomov 	     pii.has_overlap, pii.overlap);
4884eb3b2d6bSIlya Dryomov 
4885e92c0eafSIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
4886392a9dadSAlex Elder 		/*
4887392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4888392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4889392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4890392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4891392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4892392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4893392a9dadSAlex Elder 		 * parent.
4894e92c0eafSIlya Dryomov 		 *
4895e92c0eafSIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
4896e92c0eafSIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
4897e92c0eafSIlya Dryomov 		 * snapshot record.
4898392a9dadSAlex Elder 		 */
4899392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4900392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4901392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4902392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4903392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4904392a9dadSAlex Elder 		}
4905392a9dadSAlex Elder 
490686b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4907392a9dadSAlex Elder 	}
490886b00e0dSAlex Elder 
49090903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49100903e875SAlex Elder 
49110903e875SAlex Elder 	ret = -EIO;
4912eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
49139584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4914eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
491557385b51SAlex Elder 		goto out_err;
4916c0cd10dbSAlex Elder 	}
49170903e875SAlex Elder 
49183b5cf2a2SAlex Elder 	/*
49193b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
49203b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
49213b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
49223b5cf2a2SAlex Elder 	 */
49233b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
4924eb3b2d6bSIlya Dryomov 		parent_spec->pool_id = pii.pool_id;
4925e92c0eafSIlya Dryomov 		if (pii.pool_ns && *pii.pool_ns) {
4926e92c0eafSIlya Dryomov 			parent_spec->pool_ns = pii.pool_ns;
4927e92c0eafSIlya Dryomov 			pii.pool_ns = NULL;
4928e92c0eafSIlya Dryomov 		}
4929eb3b2d6bSIlya Dryomov 		parent_spec->image_id = pii.image_id;
4930eb3b2d6bSIlya Dryomov 		pii.image_id = NULL;
4931eb3b2d6bSIlya Dryomov 		parent_spec->snap_id = pii.snap_id;
4932b26c047bSIlya Dryomov 
493386b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
493486b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
49353b5cf2a2SAlex Elder 	}
49363b5cf2a2SAlex Elder 
49373b5cf2a2SAlex Elder 	/*
4938cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4939cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
49403b5cf2a2SAlex Elder 	 */
4941eb3b2d6bSIlya Dryomov 	if (!pii.overlap) {
49423b5cf2a2SAlex Elder 		if (parent_spec) {
4943cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
4944cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
4945cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
4946cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
494770cf49cfSAlex Elder 		} else {
4948cf32bd9cSIlya Dryomov 			/* initial probe */
4949cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
49503b5cf2a2SAlex Elder 		}
495170cf49cfSAlex Elder 	}
4952eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
4953cf32bd9cSIlya Dryomov 
495486b00e0dSAlex Elder out:
495586b00e0dSAlex Elder 	ret = 0;
495686b00e0dSAlex Elder out_err:
4957e92c0eafSIlya Dryomov 	kfree(pii.pool_ns);
4958eb3b2d6bSIlya Dryomov 	kfree(pii.image_id);
495986b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
496086b00e0dSAlex Elder 	return ret;
496186b00e0dSAlex Elder }
496286b00e0dSAlex Elder 
4963cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4964cc070d59SAlex Elder {
4965cc070d59SAlex Elder 	struct {
4966cc070d59SAlex Elder 		__le64 stripe_unit;
4967cc070d59SAlex Elder 		__le64 stripe_count;
4968cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4969cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4970cc070d59SAlex Elder 	void *p;
4971cc070d59SAlex Elder 	int ret;
4972cc070d59SAlex Elder 
4973ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4974ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
4975ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
4976cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4977cc070d59SAlex Elder 	if (ret < 0)
4978cc070d59SAlex Elder 		return ret;
4979cc070d59SAlex Elder 	if (ret < size)
4980cc070d59SAlex Elder 		return -ERANGE;
4981cc070d59SAlex Elder 
4982cc070d59SAlex Elder 	p = &striping_info_buf;
4983b1331852SIlya Dryomov 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4984b1331852SIlya Dryomov 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
4985cc070d59SAlex Elder 	return 0;
4986cc070d59SAlex Elder }
4987cc070d59SAlex Elder 
49887e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
49897e97332eSIlya Dryomov {
49907e97332eSIlya Dryomov 	__le64 data_pool_id;
49917e97332eSIlya Dryomov 	int ret;
49927e97332eSIlya Dryomov 
49937e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
49947e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
49957e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
49967e97332eSIlya Dryomov 	if (ret < 0)
49977e97332eSIlya Dryomov 		return ret;
49987e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
49997e97332eSIlya Dryomov 		return -EBADMSG;
50007e97332eSIlya Dryomov 
50017e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
50027e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
50037e97332eSIlya Dryomov 	return 0;
50047e97332eSIlya Dryomov }
50057e97332eSIlya Dryomov 
50069e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
50079e15b77dSAlex Elder {
5008ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
50099e15b77dSAlex Elder 	size_t image_id_size;
50109e15b77dSAlex Elder 	char *image_id;
50119e15b77dSAlex Elder 	void *p;
50129e15b77dSAlex Elder 	void *end;
50139e15b77dSAlex Elder 	size_t size;
50149e15b77dSAlex Elder 	void *reply_buf = NULL;
50159e15b77dSAlex Elder 	size_t len = 0;
50169e15b77dSAlex Elder 	char *image_name = NULL;
50179e15b77dSAlex Elder 	int ret;
50189e15b77dSAlex Elder 
50199e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
50209e15b77dSAlex Elder 
502169e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
502269e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
50239e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
50249e15b77dSAlex Elder 	if (!image_id)
50259e15b77dSAlex Elder 		return NULL;
50269e15b77dSAlex Elder 
50279e15b77dSAlex Elder 	p = image_id;
50284157976bSAlex Elder 	end = image_id + image_id_size;
502969e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
50309e15b77dSAlex Elder 
50319e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
50329e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
50339e15b77dSAlex Elder 	if (!reply_buf)
50349e15b77dSAlex Elder 		goto out;
50359e15b77dSAlex Elder 
5036ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5037ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5038ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5039e2a58ee5SAlex Elder 				  reply_buf, size);
50409e15b77dSAlex Elder 	if (ret < 0)
50419e15b77dSAlex Elder 		goto out;
50429e15b77dSAlex Elder 	p = reply_buf;
5043f40eb349SAlex Elder 	end = reply_buf + ret;
5044f40eb349SAlex Elder 
50459e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
50469e15b77dSAlex Elder 	if (IS_ERR(image_name))
50479e15b77dSAlex Elder 		image_name = NULL;
50489e15b77dSAlex Elder 	else
50499e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
50509e15b77dSAlex Elder out:
50519e15b77dSAlex Elder 	kfree(reply_buf);
50529e15b77dSAlex Elder 	kfree(image_id);
50539e15b77dSAlex Elder 
50549e15b77dSAlex Elder 	return image_name;
50559e15b77dSAlex Elder }
50569e15b77dSAlex Elder 
50572ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
50582ad3d716SAlex Elder {
50592ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
50602ad3d716SAlex Elder 	const char *snap_name;
50612ad3d716SAlex Elder 	u32 which = 0;
50622ad3d716SAlex Elder 
50632ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
50642ad3d716SAlex Elder 
50652ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
50662ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
50672ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
50682ad3d716SAlex Elder 			return snapc->snaps[which];
50692ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
50702ad3d716SAlex Elder 		which++;
50712ad3d716SAlex Elder 	}
50722ad3d716SAlex Elder 	return CEPH_NOSNAP;
50732ad3d716SAlex Elder }
50742ad3d716SAlex Elder 
50752ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
50762ad3d716SAlex Elder {
50772ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
50782ad3d716SAlex Elder 	u32 which;
50792ad3d716SAlex Elder 	bool found = false;
50802ad3d716SAlex Elder 	u64 snap_id;
50812ad3d716SAlex Elder 
50822ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
50832ad3d716SAlex Elder 		const char *snap_name;
50842ad3d716SAlex Elder 
50852ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
50862ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5087efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5088efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5089efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5090efadc98aSJosh Durgin 				continue;
5091efadc98aSJosh Durgin 			else
50922ad3d716SAlex Elder 				break;
5093efadc98aSJosh Durgin 		}
50942ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
50952ad3d716SAlex Elder 		kfree(snap_name);
50962ad3d716SAlex Elder 	}
50972ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
50982ad3d716SAlex Elder }
50992ad3d716SAlex Elder 
51002ad3d716SAlex Elder /*
51012ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
51022ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
51032ad3d716SAlex Elder  */
51042ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51052ad3d716SAlex Elder {
51062ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
51072ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
51082ad3d716SAlex Elder 
51092ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
51102ad3d716SAlex Elder }
51112ad3d716SAlex Elder 
51129e15b77dSAlex Elder /*
511304077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
51149e15b77dSAlex Elder  */
511504077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
511604077599SIlya Dryomov {
511704077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
511804077599SIlya Dryomov 
511904077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
512004077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
512104077599SIlya Dryomov 	rbd_assert(spec->snap_name);
512204077599SIlya Dryomov 
512304077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
512404077599SIlya Dryomov 		u64 snap_id;
512504077599SIlya Dryomov 
512604077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
512704077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
512804077599SIlya Dryomov 			return -ENOENT;
512904077599SIlya Dryomov 
513004077599SIlya Dryomov 		spec->snap_id = snap_id;
513104077599SIlya Dryomov 	} else {
513204077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
513304077599SIlya Dryomov 	}
513404077599SIlya Dryomov 
513504077599SIlya Dryomov 	return 0;
513604077599SIlya Dryomov }
513704077599SIlya Dryomov 
513804077599SIlya Dryomov /*
513904077599SIlya Dryomov  * A parent image will have all ids but none of the names.
514004077599SIlya Dryomov  *
514104077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
514204077599SIlya Dryomov  * can't figure out the name for an image id.
514304077599SIlya Dryomov  */
514404077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
51459e15b77dSAlex Elder {
51462e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
51472e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
51482e9f7f1cSAlex Elder 	const char *pool_name;
51492e9f7f1cSAlex Elder 	const char *image_name;
51502e9f7f1cSAlex Elder 	const char *snap_name;
51519e15b77dSAlex Elder 	int ret;
51529e15b77dSAlex Elder 
515304077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
515404077599SIlya Dryomov 	rbd_assert(spec->image_id);
515504077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
51569e15b77dSAlex Elder 
51572e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
51589e15b77dSAlex Elder 
51592e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
51602e9f7f1cSAlex Elder 	if (!pool_name) {
51612e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5162935dc89fSAlex Elder 		return -EIO;
5163935dc89fSAlex Elder 	}
51642e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
51652e9f7f1cSAlex Elder 	if (!pool_name)
51669e15b77dSAlex Elder 		return -ENOMEM;
51679e15b77dSAlex Elder 
51689e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
51699e15b77dSAlex Elder 
51702e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
51712e9f7f1cSAlex Elder 	if (!image_name)
517206ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
51739e15b77dSAlex Elder 
517404077599SIlya Dryomov 	/* Fetch the snapshot name */
51759e15b77dSAlex Elder 
51762e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5177da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5178da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
51799e15b77dSAlex Elder 		goto out_err;
51802e9f7f1cSAlex Elder 	}
51812e9f7f1cSAlex Elder 
51822e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
51832e9f7f1cSAlex Elder 	spec->image_name = image_name;
51842e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
51859e15b77dSAlex Elder 
51869e15b77dSAlex Elder 	return 0;
518704077599SIlya Dryomov 
51889e15b77dSAlex Elder out_err:
51892e9f7f1cSAlex Elder 	kfree(image_name);
51902e9f7f1cSAlex Elder 	kfree(pool_name);
51919e15b77dSAlex Elder 	return ret;
51929e15b77dSAlex Elder }
51939e15b77dSAlex Elder 
5194cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
519535d489f9SAlex Elder {
519635d489f9SAlex Elder 	size_t size;
519735d489f9SAlex Elder 	int ret;
519835d489f9SAlex Elder 	void *reply_buf;
519935d489f9SAlex Elder 	void *p;
520035d489f9SAlex Elder 	void *end;
520135d489f9SAlex Elder 	u64 seq;
520235d489f9SAlex Elder 	u32 snap_count;
520335d489f9SAlex Elder 	struct ceph_snap_context *snapc;
520435d489f9SAlex Elder 	u32 i;
520535d489f9SAlex Elder 
520635d489f9SAlex Elder 	/*
520735d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
520835d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
520935d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
521035d489f9SAlex Elder 	 * prepared to receive.
521135d489f9SAlex Elder 	 */
521235d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
521335d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
521435d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
521535d489f9SAlex Elder 	if (!reply_buf)
521635d489f9SAlex Elder 		return -ENOMEM;
521735d489f9SAlex Elder 
5218ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5219ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
5220ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
522136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
522235d489f9SAlex Elder 	if (ret < 0)
522335d489f9SAlex Elder 		goto out;
522435d489f9SAlex Elder 
522535d489f9SAlex Elder 	p = reply_buf;
522657385b51SAlex Elder 	end = reply_buf + ret;
522757385b51SAlex Elder 	ret = -ERANGE;
522835d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
522935d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
523035d489f9SAlex Elder 
523135d489f9SAlex Elder 	/*
523235d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
523335d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
523435d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
523535d489f9SAlex Elder 	 * allocate is representable in a size_t.
523635d489f9SAlex Elder 	 */
523735d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
523835d489f9SAlex Elder 				 / sizeof (u64)) {
523935d489f9SAlex Elder 		ret = -EINVAL;
524035d489f9SAlex Elder 		goto out;
524135d489f9SAlex Elder 	}
524235d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
524335d489f9SAlex Elder 		goto out;
5244468521c1SAlex Elder 	ret = 0;
524535d489f9SAlex Elder 
5246812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
524735d489f9SAlex Elder 	if (!snapc) {
524835d489f9SAlex Elder 		ret = -ENOMEM;
524935d489f9SAlex Elder 		goto out;
525035d489f9SAlex Elder 	}
525135d489f9SAlex Elder 	snapc->seq = seq;
525235d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
525335d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
525435d489f9SAlex Elder 
525549ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
525635d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
525735d489f9SAlex Elder 
525835d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
525935d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
526035d489f9SAlex Elder out:
526135d489f9SAlex Elder 	kfree(reply_buf);
526235d489f9SAlex Elder 
526357385b51SAlex Elder 	return ret;
526435d489f9SAlex Elder }
526535d489f9SAlex Elder 
526654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
526754cac61fSAlex Elder 					u64 snap_id)
5268b8b1e2dbSAlex Elder {
5269b8b1e2dbSAlex Elder 	size_t size;
5270b8b1e2dbSAlex Elder 	void *reply_buf;
527154cac61fSAlex Elder 	__le64 snapid;
5272b8b1e2dbSAlex Elder 	int ret;
5273b8b1e2dbSAlex Elder 	void *p;
5274b8b1e2dbSAlex Elder 	void *end;
5275b8b1e2dbSAlex Elder 	char *snap_name;
5276b8b1e2dbSAlex Elder 
5277b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5278b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5279b8b1e2dbSAlex Elder 	if (!reply_buf)
5280b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5281b8b1e2dbSAlex Elder 
528254cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5283ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5284ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5285ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
528636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5287f40eb349SAlex Elder 	if (ret < 0) {
5288f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5289b8b1e2dbSAlex Elder 		goto out;
5290f40eb349SAlex Elder 	}
5291b8b1e2dbSAlex Elder 
5292b8b1e2dbSAlex Elder 	p = reply_buf;
5293f40eb349SAlex Elder 	end = reply_buf + ret;
5294e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5295f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5296b8b1e2dbSAlex Elder 		goto out;
5297f40eb349SAlex Elder 
5298b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
529954cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5300b8b1e2dbSAlex Elder out:
5301b8b1e2dbSAlex Elder 	kfree(reply_buf);
5302b8b1e2dbSAlex Elder 
5303f40eb349SAlex Elder 	return snap_name;
5304b8b1e2dbSAlex Elder }
5305b8b1e2dbSAlex Elder 
53062df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5307117973fbSAlex Elder {
53082df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5309117973fbSAlex Elder 	int ret;
5310117973fbSAlex Elder 
53111617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
53121617e40cSJosh Durgin 	if (ret)
5313cfbf6377SAlex Elder 		return ret;
53141617e40cSJosh Durgin 
53152df3fac7SAlex Elder 	if (first_time) {
53162df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
53172df3fac7SAlex Elder 		if (ret)
5318cfbf6377SAlex Elder 			return ret;
53192df3fac7SAlex Elder 	}
53202df3fac7SAlex Elder 
5321cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5322d194cd1dSIlya Dryomov 	if (ret && first_time) {
5323d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5324d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5325d194cd1dSIlya Dryomov 	}
5326117973fbSAlex Elder 
5327117973fbSAlex Elder 	return ret;
5328117973fbSAlex Elder }
5329117973fbSAlex Elder 
5330a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5331a720ae09SIlya Dryomov {
5332a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5333a720ae09SIlya Dryomov 
5334a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5335a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5336a720ae09SIlya Dryomov 
5337a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5338a720ae09SIlya Dryomov }
5339a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token that starts there, i.e.
 * the run of non-white-space characters.
 *
 * *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C" and
	 * "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;

	return strcspn(start, whitespace);
}
5358e28fff26SAlex Elder 
5359e28fff26SAlex Elder /*
5360ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5361ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5362ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5363ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5364ea3352f4SAlex Elder  *
5365ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5366ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5367ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5368ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5369ea3352f4SAlex Elder  *
5370ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5371ea3352f4SAlex Elder  * the end of the found token.
5372ea3352f4SAlex Elder  *
5373ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5374ea3352f4SAlex Elder  */
5375ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5376ea3352f4SAlex Elder {
5377ea3352f4SAlex Elder 	char *dup;
5378ea3352f4SAlex Elder 	size_t len;
5379ea3352f4SAlex Elder 
5380ea3352f4SAlex Elder 	len = next_token(buf);
53814caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5382ea3352f4SAlex Elder 	if (!dup)
5383ea3352f4SAlex Elder 		return NULL;
5384ea3352f4SAlex Elder 	*(dup + len) = '\0';
5385ea3352f4SAlex Elder 	*buf += len;
5386ea3352f4SAlex Elder 
5387ea3352f4SAlex Elder 	if (lenp)
5388ea3352f4SAlex Elder 		*lenp = len;
5389ea3352f4SAlex Elder 
5390ea3352f4SAlex Elder 	return dup;
5391ea3352f4SAlex Elder }
5392ea3352f4SAlex Elder 
5393ea3352f4SAlex Elder /*
5394859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5395859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5396859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5397859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5398d22f76e7SAlex Elder  *
5399859c31dfSAlex Elder  * The information extracted from these options is recorded in
5400859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5401859c31dfSAlex Elder  * structures:
5402859c31dfSAlex Elder  *  ceph_opts
5403859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5404859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5405859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5406859c31dfSAlex Elder  *  rbd_opts
5407859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5408859c31dfSAlex Elder  *	this function; caller must release with kfree().
5409859c31dfSAlex Elder  *  spec
5410859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5411859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5412859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5413859c31dfSAlex Elder  *
5414859c31dfSAlex Elder  * The options passed take this form:
5415859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5416859c31dfSAlex Elder  * where:
5417859c31dfSAlex Elder  *  <mon_addrs>
5418859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
5419859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
5420859c31dfSAlex Elder  *      by a port number (separated by a colon).
5421859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
5422859c31dfSAlex Elder  *  <options>
5423859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
5424859c31dfSAlex Elder  *  <pool_name>
5425859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
5426859c31dfSAlex Elder  *  <image_name>
5427859c31dfSAlex Elder  *      The name of the image in that pool to map.
5428859c31dfSAlex Elder  *  <snap_id>
5429859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
5430859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
5431859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
5432859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
5433a725f65eSAlex Elder  */
5434859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5435dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5436859c31dfSAlex Elder 				struct rbd_options **opts,
5437859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5438a725f65eSAlex Elder {
5439e28fff26SAlex Elder 	size_t len;
5440859c31dfSAlex Elder 	char *options;
54410ddebc0cSAlex Elder 	const char *mon_addrs;
5442ecb4dc22SAlex Elder 	char *snap_name;
54430ddebc0cSAlex Elder 	size_t mon_addrs_size;
5444c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx pctx = { 0 };
5445859c31dfSAlex Elder 	struct ceph_options *copts;
5446dc79b113SAlex Elder 	int ret;
5447e28fff26SAlex Elder 
5448e28fff26SAlex Elder 	/* The first four tokens are required */
5449e28fff26SAlex Elder 
54507ef3214aSAlex Elder 	len = next_token(&buf);
54514fb5d671SAlex Elder 	if (!len) {
54524fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
54534fb5d671SAlex Elder 		return -EINVAL;
54544fb5d671SAlex Elder 	}
54550ddebc0cSAlex Elder 	mon_addrs = buf;
5456f28e565aSAlex Elder 	mon_addrs_size = len + 1;
54577ef3214aSAlex Elder 	buf += len;
5458a725f65eSAlex Elder 
5459dc79b113SAlex Elder 	ret = -EINVAL;
5460f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5461f28e565aSAlex Elder 	if (!options)
5462dc79b113SAlex Elder 		return -ENOMEM;
54634fb5d671SAlex Elder 	if (!*options) {
54644fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
54654fb5d671SAlex Elder 		goto out_err;
54664fb5d671SAlex Elder 	}
5467a725f65eSAlex Elder 
5468c300156bSIlya Dryomov 	pctx.spec = rbd_spec_alloc();
5469c300156bSIlya Dryomov 	if (!pctx.spec)
5470f28e565aSAlex Elder 		goto out_mem;
5471859c31dfSAlex Elder 
5472c300156bSIlya Dryomov 	pctx.spec->pool_name = dup_token(&buf, NULL);
5473c300156bSIlya Dryomov 	if (!pctx.spec->pool_name)
5474859c31dfSAlex Elder 		goto out_mem;
5475c300156bSIlya Dryomov 	if (!*pctx.spec->pool_name) {
54764fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
54774fb5d671SAlex Elder 		goto out_err;
54784fb5d671SAlex Elder 	}
5479e28fff26SAlex Elder 
5480c300156bSIlya Dryomov 	pctx.spec->image_name = dup_token(&buf, NULL);
5481c300156bSIlya Dryomov 	if (!pctx.spec->image_name)
5482f28e565aSAlex Elder 		goto out_mem;
5483c300156bSIlya Dryomov 	if (!*pctx.spec->image_name) {
54844fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
54854fb5d671SAlex Elder 		goto out_err;
54864fb5d671SAlex Elder 	}
5487e28fff26SAlex Elder 
5488f28e565aSAlex Elder 	/*
5489f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5490f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5491f28e565aSAlex Elder 	 */
54923feeb894SAlex Elder 	len = next_token(&buf);
5493820a5f3eSAlex Elder 	if (!len) {
54943feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
54953feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5496f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5497dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5498f28e565aSAlex Elder 		goto out_err;
5499849b4260SAlex Elder 	}
5500ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5501ecb4dc22SAlex Elder 	if (!snap_name)
5502f28e565aSAlex Elder 		goto out_mem;
5503ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5504c300156bSIlya Dryomov 	pctx.spec->snap_name = snap_name;
5505e5c35534SAlex Elder 
55060ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5507e28fff26SAlex Elder 
5508c300156bSIlya Dryomov 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5509c300156bSIlya Dryomov 	if (!pctx.opts)
55104e9afebaSAlex Elder 		goto out_mem;
55114e9afebaSAlex Elder 
5512c300156bSIlya Dryomov 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5513c300156bSIlya Dryomov 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
55140c93e1b7SIlya Dryomov 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
5515c300156bSIlya Dryomov 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5516c300156bSIlya Dryomov 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5517c300156bSIlya Dryomov 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5518c300156bSIlya Dryomov 	pctx.opts->trim = RBD_TRIM_DEFAULT;
5519d22f76e7SAlex Elder 
5520859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
55210ddebc0cSAlex Elder 				   mon_addrs + mon_addrs_size - 1,
5522c300156bSIlya Dryomov 				   parse_rbd_opts_token, &pctx);
5523859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5524859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5525dc79b113SAlex Elder 		goto out_err;
5526dc79b113SAlex Elder 	}
5527859c31dfSAlex Elder 	kfree(options);
5528859c31dfSAlex Elder 
5529859c31dfSAlex Elder 	*ceph_opts = copts;
5530c300156bSIlya Dryomov 	*opts = pctx.opts;
5531c300156bSIlya Dryomov 	*rbd_spec = pctx.spec;
55320ddebc0cSAlex Elder 
5533dc79b113SAlex Elder 	return 0;
5534f28e565aSAlex Elder out_mem:
5535dc79b113SAlex Elder 	ret = -ENOMEM;
5536d22f76e7SAlex Elder out_err:
5537c300156bSIlya Dryomov 	kfree(pctx.opts);
5538c300156bSIlya Dryomov 	rbd_spec_put(pctx.spec);
5539f28e565aSAlex Elder 	kfree(options);
5540d22f76e7SAlex Elder 
5541dc79b113SAlex Elder 	return ret;
5542a725f65eSAlex Elder }
5543a725f65eSAlex Elder 
5544e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5545e010dd0aSIlya Dryomov {
5546e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
5547e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
5548e010dd0aSIlya Dryomov 		rbd_unlock(rbd_dev);
5549e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
5550e010dd0aSIlya Dryomov }
5551e010dd0aSIlya Dryomov 
5552e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5553e010dd0aSIlya Dryomov {
55542f18d466SIlya Dryomov 	int ret;
55552f18d466SIlya Dryomov 
5556e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5557e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5558e010dd0aSIlya Dryomov 		return -EINVAL;
5559e010dd0aSIlya Dryomov 	}
5560e010dd0aSIlya Dryomov 
5561e010dd0aSIlya Dryomov 	/* FIXME: "rbd map --exclusive" should be in interruptible */
5562e010dd0aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
55632f18d466SIlya Dryomov 	ret = rbd_wait_state_locked(rbd_dev, true);
5564e010dd0aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
55652f18d466SIlya Dryomov 	if (ret) {
5566e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5567e010dd0aSIlya Dryomov 		return -EROFS;
5568e010dd0aSIlya Dryomov 	}
5569e010dd0aSIlya Dryomov 
5570e010dd0aSIlya Dryomov 	return 0;
5571e010dd0aSIlya Dryomov }
5572e010dd0aSIlya Dryomov 
557330ba1f02SIlya Dryomov /*
5574589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5575589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5576589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5577589d30e0SAlex Elder  *
5578589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5579589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5580589d30e0SAlex Elder  * with the supplied name.
5581589d30e0SAlex Elder  *
5582589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5583589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5584589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5585589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5586589d30e0SAlex Elder  */
5587589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5588589d30e0SAlex Elder {
5589589d30e0SAlex Elder 	int ret;
5590589d30e0SAlex Elder 	size_t size;
5591ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5592589d30e0SAlex Elder 	void *response;
5593c0fba368SAlex Elder 	char *image_id;
55942f82ee54SAlex Elder 
5595589d30e0SAlex Elder 	/*
55962c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
55972c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5598c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5599c0fba368SAlex Elder 	 * do still need to set the image format though.
56002c0d0a10SAlex Elder 	 */
5601c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5602c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5603c0fba368SAlex Elder 
56042c0d0a10SAlex Elder 		return 0;
5605c0fba368SAlex Elder 	}
56062c0d0a10SAlex Elder 
56072c0d0a10SAlex Elder 	/*
5608589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5609589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5610589d30e0SAlex Elder 	 */
5611ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5612ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5613ecd4a68aSIlya Dryomov 	if (ret)
5614ecd4a68aSIlya Dryomov 		return ret;
5615ecd4a68aSIlya Dryomov 
5616ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5617589d30e0SAlex Elder 
5618589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5619589d30e0SAlex Elder 
5620589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5621589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5622589d30e0SAlex Elder 	if (!response) {
5623589d30e0SAlex Elder 		ret = -ENOMEM;
5624589d30e0SAlex Elder 		goto out;
5625589d30e0SAlex Elder 	}
5626589d30e0SAlex Elder 
5627c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5628c0fba368SAlex Elder 
5629ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5630ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5631e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
563236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5633c0fba368SAlex Elder 	if (ret == -ENOENT) {
5634c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5635c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5636c0fba368SAlex Elder 		if (!ret)
5637c0fba368SAlex Elder 			rbd_dev->image_format = 1;
56387dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5639c0fba368SAlex Elder 		void *p = response;
5640589d30e0SAlex Elder 
5641c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5642979ed480SAlex Elder 						NULL, GFP_NOIO);
5643461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5644c0fba368SAlex Elder 		if (!ret)
5645c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5646c0fba368SAlex Elder 	}
5647c0fba368SAlex Elder 
5648c0fba368SAlex Elder 	if (!ret) {
5649c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5650c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5651589d30e0SAlex Elder 	}
5652589d30e0SAlex Elder out:
5653589d30e0SAlex Elder 	kfree(response);
5654ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5655589d30e0SAlex Elder 	return ret;
5656589d30e0SAlex Elder }
5657589d30e0SAlex Elder 
56583abef3b3SAlex Elder /*
56593abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
56603abef3b3SAlex Elder  * call.
56613abef3b3SAlex Elder  */
56626fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
56636fd48b3bSAlex Elder {
56646fd48b3bSAlex Elder 	struct rbd_image_header	*header;
56656fd48b3bSAlex Elder 
5666a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
56676fd48b3bSAlex Elder 
56686fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
56696fd48b3bSAlex Elder 
56706fd48b3bSAlex Elder 	header = &rbd_dev->header;
5671812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
56726fd48b3bSAlex Elder 	kfree(header->snap_sizes);
56736fd48b3bSAlex Elder 	kfree(header->snap_names);
56746fd48b3bSAlex Elder 	kfree(header->object_prefix);
56756fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
56766fd48b3bSAlex Elder }
56776fd48b3bSAlex Elder 
56782df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5679a30b71b9SAlex Elder {
5680a30b71b9SAlex Elder 	int ret;
5681a30b71b9SAlex Elder 
56821e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
568357385b51SAlex Elder 	if (ret)
56841e130199SAlex Elder 		goto out_err;
5685b1b5402aSAlex Elder 
56862df3fac7SAlex Elder 	/*
56872df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
56882df3fac7SAlex Elder 	 * features are assumed to never change.
56892df3fac7SAlex Elder 	 */
5690b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
569157385b51SAlex Elder 	if (ret)
5692b1b5402aSAlex Elder 		goto out_err;
569335d489f9SAlex Elder 
5694cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5695cc070d59SAlex Elder 
5696cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5697cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5698cc070d59SAlex Elder 		if (ret < 0)
5699cc070d59SAlex Elder 			goto out_err;
5700cc070d59SAlex Elder 	}
5701a30b71b9SAlex Elder 
57027e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
57037e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
57047e97332eSIlya Dryomov 		if (ret)
57057e97332eSIlya Dryomov 			goto out_err;
57067e97332eSIlya Dryomov 	}
57077e97332eSIlya Dryomov 
5708263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
570935152979SAlex Elder 	return 0;
5710263423f8SIlya Dryomov 
57119d475de5SAlex Elder out_err:
5712642a2537SAlex Elder 	rbd_dev->header.features = 0;
57131e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
57141e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
57159d475de5SAlex Elder 	return ret;
5716a30b71b9SAlex Elder }
5717a30b71b9SAlex Elder 
57186d69bb53SIlya Dryomov /*
57196d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
57206d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
57216d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
57226d69bb53SIlya Dryomov  */
57236d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
572483a06263SAlex Elder {
57252f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5726124afba2SAlex Elder 	int ret;
5727124afba2SAlex Elder 
5728124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5729124afba2SAlex Elder 		return 0;
5730124afba2SAlex Elder 
57316d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
57326d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
57336d69bb53SIlya Dryomov 		ret = -EINVAL;
57346d69bb53SIlya Dryomov 		goto out_err;
57356d69bb53SIlya Dryomov 	}
57366d69bb53SIlya Dryomov 
57371643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
57381f2c6651SIlya Dryomov 	if (!parent) {
5739124afba2SAlex Elder 		ret = -ENOMEM;
5740124afba2SAlex Elder 		goto out_err;
57411f2c6651SIlya Dryomov 	}
57421f2c6651SIlya Dryomov 
57431f2c6651SIlya Dryomov 	/*
57441f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
57451f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
57461f2c6651SIlya Dryomov 	 */
57471f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
57481f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5749124afba2SAlex Elder 
57506d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5751124afba2SAlex Elder 	if (ret < 0)
5752124afba2SAlex Elder 		goto out_err;
57531f2c6651SIlya Dryomov 
5754124afba2SAlex Elder 	rbd_dev->parent = parent;
5755a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5756124afba2SAlex Elder 	return 0;
5757124afba2SAlex Elder 
57581f2c6651SIlya Dryomov out_err:
57591f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
57601f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5761124afba2SAlex Elder 	return ret;
5762124afba2SAlex Elder }
5763124afba2SAlex Elder 
57645769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
57655769ed0cSIlya Dryomov {
57665769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
57675769ed0cSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
57685769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
57695769ed0cSIlya Dryomov 	if (!single_major)
57705769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
57715769ed0cSIlya Dryomov }
57725769ed0cSIlya Dryomov 
5773811c6688SIlya Dryomov /*
5774811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5775811c6688SIlya Dryomov  * upon return.
5776811c6688SIlya Dryomov  */
5777200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5778124afba2SAlex Elder {
577983a06263SAlex Elder 	int ret;
578083a06263SAlex Elder 
57819b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
578283a06263SAlex Elder 
57839b60e70bSIlya Dryomov 	if (!single_major) {
578483a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
578583a06263SAlex Elder 		if (ret < 0)
57861643dfa4SIlya Dryomov 			goto err_out_unlock;
57879b60e70bSIlya Dryomov 
578883a06263SAlex Elder 		rbd_dev->major = ret;
5789dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
57909b60e70bSIlya Dryomov 	} else {
57919b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
57929b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
57939b60e70bSIlya Dryomov 	}
579483a06263SAlex Elder 
579583a06263SAlex Elder 	/* Set up the blkdev mapping. */
579683a06263SAlex Elder 
579783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
579883a06263SAlex Elder 	if (ret)
579983a06263SAlex Elder 		goto err_out_blkdev;
580083a06263SAlex Elder 
5801f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
580283a06263SAlex Elder 	if (ret)
580383a06263SAlex Elder 		goto err_out_disk;
5804bc1ecc65SIlya Dryomov 
5805f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
58069568c93eSIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
5807f35a4deeSAlex Elder 
58085769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5809f35a4deeSAlex Elder 	if (ret)
5810f5ee37bdSIlya Dryomov 		goto err_out_mapping;
581183a06263SAlex Elder 
5812129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5813811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
58145769ed0cSIlya Dryomov 	return 0;
58152f82ee54SAlex Elder 
5816f35a4deeSAlex Elder err_out_mapping:
5817f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
581883a06263SAlex Elder err_out_disk:
581983a06263SAlex Elder 	rbd_free_disk(rbd_dev);
582083a06263SAlex Elder err_out_blkdev:
58219b60e70bSIlya Dryomov 	if (!single_major)
582283a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5823811c6688SIlya Dryomov err_out_unlock:
5824811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
582583a06263SAlex Elder 	return ret;
582683a06263SAlex Elder }
582783a06263SAlex Elder 
5828332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5829332bb12dSAlex Elder {
5830332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5831c41d13a3SIlya Dryomov 	int ret;
5832332bb12dSAlex Elder 
5833332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5834332bb12dSAlex Elder 
5835332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5836332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5837c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5838332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5839332bb12dSAlex Elder 	else
5840c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5841332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5842c41d13a3SIlya Dryomov 
5843c41d13a3SIlya Dryomov 	return ret;
5844332bb12dSAlex Elder }
5845332bb12dSAlex Elder 
5846200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5847200a6a8bSAlex Elder {
58486fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5849fd22aef8SIlya Dryomov 	if (rbd_dev->opts)
5850fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
58516fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
58526fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
58536fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
5854200a6a8bSAlex Elder }
5855200a6a8bSAlex Elder 
5856a30b71b9SAlex Elder /*
5857a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
58581f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
58591f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
58601f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5861a30b71b9SAlex Elder  */
58626d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5863a30b71b9SAlex Elder {
5864a30b71b9SAlex Elder 	int ret;
5865a30b71b9SAlex Elder 
5866a30b71b9SAlex Elder 	/*
58673abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
58683abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
58693abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
58703abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5871a30b71b9SAlex Elder 	 */
5872a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5873a30b71b9SAlex Elder 	if (ret)
5874c0fba368SAlex Elder 		return ret;
5875c0fba368SAlex Elder 
5876332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5877332bb12dSAlex Elder 	if (ret)
5878332bb12dSAlex Elder 		goto err_out_format;
5879332bb12dSAlex Elder 
58806d69bb53SIlya Dryomov 	if (!depth) {
588199d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
58821fe48023SIlya Dryomov 		if (ret) {
58831fe48023SIlya Dryomov 			if (ret == -ENOENT)
5884b26c047bSIlya Dryomov 				pr_info("image %s/%s%s%s does not exist\n",
58851fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
5886b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ?: "",
5887b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ? "/" : "",
58881fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5889c41d13a3SIlya Dryomov 			goto err_out_format;
58901f3ef788SAlex Elder 		}
58911fe48023SIlya Dryomov 	}
5892b644de2bSAlex Elder 
5893a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
58945655c4d9SAlex Elder 	if (ret)
5895b644de2bSAlex Elder 		goto err_out_watch;
5896a30b71b9SAlex Elder 
589704077599SIlya Dryomov 	/*
589804077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
589904077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
590004077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
590104077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
590204077599SIlya Dryomov 	 */
59036d69bb53SIlya Dryomov 	if (!depth)
590404077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
590504077599SIlya Dryomov 	else
590604077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
59071fe48023SIlya Dryomov 	if (ret) {
59081fe48023SIlya Dryomov 		if (ret == -ENOENT)
5909b26c047bSIlya Dryomov 			pr_info("snap %s/%s%s%s@%s does not exist\n",
59101fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
5911b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ?: "",
5912b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ? "/" : "",
59131fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
59141fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
591533dca39fSAlex Elder 		goto err_out_probe;
59161fe48023SIlya Dryomov 	}
59179bb81c9bSAlex Elder 
5918e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5919e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5920e8f59b59SIlya Dryomov 		if (ret)
5921e8f59b59SIlya Dryomov 			goto err_out_probe;
5922e8f59b59SIlya Dryomov 	}
5923e8f59b59SIlya Dryomov 
59246d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
592530d60ba2SAlex Elder 	if (ret)
592630d60ba2SAlex Elder 		goto err_out_probe;
592783a06263SAlex Elder 
592830d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
5929c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
593030d60ba2SAlex Elder 	return 0;
5931e8f59b59SIlya Dryomov 
59326fd48b3bSAlex Elder err_out_probe:
59336fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5934b644de2bSAlex Elder err_out_watch:
59356d69bb53SIlya Dryomov 	if (!depth)
593699d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
5937332bb12dSAlex Elder err_out_format:
5938332bb12dSAlex Elder 	rbd_dev->image_format = 0;
59395655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
59405655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
59415655c4d9SAlex Elder 	return ret;
594283a06263SAlex Elder }
594383a06263SAlex Elder 
59449b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
594559c2be1eSYehuda Sadeh 			  const char *buf,
594659c2be1eSYehuda Sadeh 			  size_t count)
5947602adf40SYehuda Sadeh {
5948cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5949dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
59504e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5951859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
59529d3997fdSAlex Elder 	struct rbd_client *rbdc;
5953b51c83c2SIlya Dryomov 	int rc;
5954602adf40SYehuda Sadeh 
5955602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5956602adf40SYehuda Sadeh 		return -ENODEV;
5957602adf40SYehuda Sadeh 
5958a725f65eSAlex Elder 	/* parse add command */
5959859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5960dc79b113SAlex Elder 	if (rc < 0)
5961dd5ac32dSIlya Dryomov 		goto out;
5962a725f65eSAlex Elder 
59639d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
59649d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
59659d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
59660ddebc0cSAlex Elder 		goto err_out_args;
59679d3997fdSAlex Elder 	}
5968602adf40SYehuda Sadeh 
5969602adf40SYehuda Sadeh 	/* pick the pool */
5970dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
59711fe48023SIlya Dryomov 	if (rc < 0) {
59721fe48023SIlya Dryomov 		if (rc == -ENOENT)
59731fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
5974602adf40SYehuda Sadeh 		goto err_out_client;
59751fe48023SIlya Dryomov 	}
5976859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5977859c31dfSAlex Elder 
5978d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
5979b51c83c2SIlya Dryomov 	if (!rbd_dev) {
5980b51c83c2SIlya Dryomov 		rc = -ENOMEM;
5981bd4ba655SAlex Elder 		goto err_out_client;
5982b51c83c2SIlya Dryomov 	}
5983c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5984c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5985d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
5986602adf40SYehuda Sadeh 
59870d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
59880d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
59890d6d1e9cSMike Christie 		rc = -ENOMEM;
59900d6d1e9cSMike Christie 		goto err_out_rbd_dev;
59910d6d1e9cSMike Christie 	}
59920d6d1e9cSMike Christie 
5993811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
59946d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
59950d6d1e9cSMike Christie 	if (rc < 0) {
59960d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
5997c53d5893SAlex Elder 		goto err_out_rbd_dev;
59980d6d1e9cSMike Christie 	}
599905fd6f6fSAlex Elder 
60007ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
60017ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
60029568c93eSIlya Dryomov 		rbd_dev->opts->read_only = true;
60037ce4eef7SAlex Elder 
60040c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
60050c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
60060c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
60070c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
60080c93e1b7SIlya Dryomov 	}
60090c93e1b7SIlya Dryomov 
6010b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
6011fd22aef8SIlya Dryomov 	if (rc)
60128b679ec5SIlya Dryomov 		goto err_out_image_probe;
60133abef3b3SAlex Elder 
6014e010dd0aSIlya Dryomov 	if (rbd_dev->opts->exclusive) {
6015e010dd0aSIlya Dryomov 		rc = rbd_add_acquire_lock(rbd_dev);
6016e010dd0aSIlya Dryomov 		if (rc)
6017e010dd0aSIlya Dryomov 			goto err_out_device_setup;
6018b536f69aSAlex Elder 	}
6019b536f69aSAlex Elder 
60205769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
60215769ed0cSIlya Dryomov 
60225769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
60235769ed0cSIlya Dryomov 	if (rc)
6024e010dd0aSIlya Dryomov 		goto err_out_image_lock;
60255769ed0cSIlya Dryomov 
60265769ed0cSIlya Dryomov 	add_disk(rbd_dev->disk);
60275769ed0cSIlya Dryomov 	/* see rbd_init_disk() */
60285769ed0cSIlya Dryomov 	blk_put_queue(rbd_dev->disk->queue);
60295769ed0cSIlya Dryomov 
60305769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
60315769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
60325769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
60335769ed0cSIlya Dryomov 
60345769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
60355769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
60365769ed0cSIlya Dryomov 		rbd_dev->header.features);
6037dd5ac32dSIlya Dryomov 	rc = count;
6038dd5ac32dSIlya Dryomov out:
6039dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6040dd5ac32dSIlya Dryomov 	return rc;
6041b536f69aSAlex Elder 
6042e010dd0aSIlya Dryomov err_out_image_lock:
6043e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
60445769ed0cSIlya Dryomov err_out_device_setup:
60455769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
60468b679ec5SIlya Dryomov err_out_image_probe:
60478b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
6048c53d5893SAlex Elder err_out_rbd_dev:
6049c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6050bd4ba655SAlex Elder err_out_client:
60519d3997fdSAlex Elder 	rbd_put_client(rbdc);
60520ddebc0cSAlex Elder err_out_args:
6053859c31dfSAlex Elder 	rbd_spec_put(spec);
6054d147543dSIlya Dryomov 	kfree(rbd_opts);
6055dd5ac32dSIlya Dryomov 	goto out;
6056602adf40SYehuda Sadeh }
6057602adf40SYehuda Sadeh 
60589b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
60599b60e70bSIlya Dryomov 		       const char *buf,
60609b60e70bSIlya Dryomov 		       size_t count)
60619b60e70bSIlya Dryomov {
60629b60e70bSIlya Dryomov 	if (single_major)
60639b60e70bSIlya Dryomov 		return -EINVAL;
60649b60e70bSIlya Dryomov 
60659b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
60669b60e70bSIlya Dryomov }
60679b60e70bSIlya Dryomov 
60689b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
60699b60e70bSIlya Dryomov 				    const char *buf,
60709b60e70bSIlya Dryomov 				    size_t count)
60719b60e70bSIlya Dryomov {
60729b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
60739b60e70bSIlya Dryomov }
60749b60e70bSIlya Dryomov 
607505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
607605a46afdSAlex Elder {
6077ad945fc1SAlex Elder 	while (rbd_dev->parent) {
607805a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
607905a46afdSAlex Elder 		struct rbd_device *second = first->parent;
608005a46afdSAlex Elder 		struct rbd_device *third;
608105a46afdSAlex Elder 
608205a46afdSAlex Elder 		/*
608305a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
608405a46afdSAlex Elder 		 * remove it.
608505a46afdSAlex Elder 		 */
608605a46afdSAlex Elder 		while (second && (third = second->parent)) {
608705a46afdSAlex Elder 			first = second;
608805a46afdSAlex Elder 			second = third;
608905a46afdSAlex Elder 		}
6090ad945fc1SAlex Elder 		rbd_assert(second);
60918ad42cd0SAlex Elder 		rbd_dev_image_release(second);
60928b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
6093ad945fc1SAlex Elder 		first->parent = NULL;
6094ad945fc1SAlex Elder 		first->parent_overlap = 0;
6095ad945fc1SAlex Elder 
6096ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
609705a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
609805a46afdSAlex Elder 		first->parent_spec = NULL;
609905a46afdSAlex Elder 	}
610005a46afdSAlex Elder }
610105a46afdSAlex Elder 
61029b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6103602adf40SYehuda Sadeh 			     const char *buf,
6104602adf40SYehuda Sadeh 			     size_t count)
6105602adf40SYehuda Sadeh {
6106602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6107751cc0e3SAlex Elder 	struct list_head *tmp;
6108751cc0e3SAlex Elder 	int dev_id;
61090276dca6SMike Christie 	char opt_buf[6];
61100276dca6SMike Christie 	bool force = false;
61110d8189e1SAlex Elder 	int ret;
6112602adf40SYehuda Sadeh 
61130276dca6SMike Christie 	dev_id = -1;
61140276dca6SMike Christie 	opt_buf[0] = '\0';
61150276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
61160276dca6SMike Christie 	if (dev_id < 0) {
61170276dca6SMike Christie 		pr_err("dev_id out of range\n");
6118602adf40SYehuda Sadeh 		return -EINVAL;
61190276dca6SMike Christie 	}
61200276dca6SMike Christie 	if (opt_buf[0] != '\0') {
61210276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
61220276dca6SMike Christie 			force = true;
61230276dca6SMike Christie 		} else {
61240276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
61250276dca6SMike Christie 			return -EINVAL;
61260276dca6SMike Christie 		}
61270276dca6SMike Christie 	}
6128602adf40SYehuda Sadeh 
6129602adf40SYehuda Sadeh 	ret = -ENOENT;
6130751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6131751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6132751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6133751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6134751cc0e3SAlex Elder 			ret = 0;
6135751cc0e3SAlex Elder 			break;
6136602adf40SYehuda Sadeh 		}
6137751cc0e3SAlex Elder 	}
6138751cc0e3SAlex Elder 	if (!ret) {
6139a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
61400276dca6SMike Christie 		if (rbd_dev->open_count && !force)
614142382b70SAlex Elder 			ret = -EBUSY;
614285f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
614385f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
614485f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
6145a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6146751cc0e3SAlex Elder 	}
6147751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
614885f5a4d6SIlya Dryomov 	if (ret)
61491ba0f1e7SAlex Elder 		return ret;
6150751cc0e3SAlex Elder 
61510276dca6SMike Christie 	if (force) {
61520276dca6SMike Christie 		/*
61530276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
61540276dca6SMike Christie 		 * IO to complete/fail.
61550276dca6SMike Christie 		 */
61560276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
61570276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
61580276dca6SMike Christie 	}
61590276dca6SMike Christie 
61605769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
61615769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
61625769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
61635769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
61645769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
6165fca27065SIlya Dryomov 
6166e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
6167dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
61688ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
61698b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
61701ba0f1e7SAlex Elder 	return count;
6171602adf40SYehuda Sadeh }
6172602adf40SYehuda Sadeh 
61739b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
61749b60e70bSIlya Dryomov 			  const char *buf,
61759b60e70bSIlya Dryomov 			  size_t count)
61769b60e70bSIlya Dryomov {
61779b60e70bSIlya Dryomov 	if (single_major)
61789b60e70bSIlya Dryomov 		return -EINVAL;
61799b60e70bSIlya Dryomov 
61809b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
61819b60e70bSIlya Dryomov }
61829b60e70bSIlya Dryomov 
61839b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
61849b60e70bSIlya Dryomov 				       const char *buf,
61859b60e70bSIlya Dryomov 				       size_t count)
61869b60e70bSIlya Dryomov {
61879b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
61889b60e70bSIlya Dryomov }
61899b60e70bSIlya Dryomov 
6190602adf40SYehuda Sadeh /*
6191602adf40SYehuda Sadeh  * create control files in sysfs
6192dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6193602adf40SYehuda Sadeh  */
61947d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
6195602adf40SYehuda Sadeh {
6196dfc5606dSYehuda Sadeh 	int ret;
6197602adf40SYehuda Sadeh 
6198fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6199dfc5606dSYehuda Sadeh 	if (ret < 0)
6200dfc5606dSYehuda Sadeh 		return ret;
6201602adf40SYehuda Sadeh 
6202fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6203fed4c143SAlex Elder 	if (ret < 0)
6204fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6205602adf40SYehuda Sadeh 
6206602adf40SYehuda Sadeh 	return ret;
6207602adf40SYehuda Sadeh }
6208602adf40SYehuda Sadeh 
62097d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
6210602adf40SYehuda Sadeh {
6211dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6212fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6213602adf40SYehuda Sadeh }
6214602adf40SYehuda Sadeh 
62157d8dc534SChengguang Xu static int __init rbd_slab_init(void)
62161c2a9dfeSAlex Elder {
62171c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
621803d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6219868311b1SAlex Elder 	if (!rbd_img_request_cache)
6220868311b1SAlex Elder 		return -ENOMEM;
6221868311b1SAlex Elder 
6222868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
622303d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
622478c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
622578c2a44aSAlex Elder 		goto out_err;
622678c2a44aSAlex Elder 
62271c2a9dfeSAlex Elder 	return 0;
62281c2a9dfeSAlex Elder 
62296c696d85SIlya Dryomov out_err:
6230868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6231868311b1SAlex Elder 	rbd_img_request_cache = NULL;
62321c2a9dfeSAlex Elder 	return -ENOMEM;
62331c2a9dfeSAlex Elder }
62341c2a9dfeSAlex Elder 
62351c2a9dfeSAlex Elder static void rbd_slab_exit(void)
62361c2a9dfeSAlex Elder {
6237868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6238868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6239868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6240868311b1SAlex Elder 
62411c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
62421c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
62431c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
62441c2a9dfeSAlex Elder }
62451c2a9dfeSAlex Elder 
6246cc344fa1SAlex Elder static int __init rbd_init(void)
6247602adf40SYehuda Sadeh {
6248602adf40SYehuda Sadeh 	int rc;
6249602adf40SYehuda Sadeh 
62501e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
62511e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
62521e32d34cSAlex Elder 		return -EINVAL;
62531e32d34cSAlex Elder 	}
6254e1b4d96dSIlya Dryomov 
62551c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6256602adf40SYehuda Sadeh 	if (rc)
6257602adf40SYehuda Sadeh 		return rc;
6258e1b4d96dSIlya Dryomov 
6259f5ee37bdSIlya Dryomov 	/*
6260f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6261f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6262f5ee37bdSIlya Dryomov 	 */
6263f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6264f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6265f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6266f5ee37bdSIlya Dryomov 		goto err_out_slab;
6267f5ee37bdSIlya Dryomov 	}
6268f5ee37bdSIlya Dryomov 
62699b60e70bSIlya Dryomov 	if (single_major) {
62709b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
62719b60e70bSIlya Dryomov 		if (rbd_major < 0) {
62729b60e70bSIlya Dryomov 			rc = rbd_major;
6273f5ee37bdSIlya Dryomov 			goto err_out_wq;
62749b60e70bSIlya Dryomov 		}
62759b60e70bSIlya Dryomov 	}
62769b60e70bSIlya Dryomov 
62771c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
62781c2a9dfeSAlex Elder 	if (rc)
62799b60e70bSIlya Dryomov 		goto err_out_blkdev;
62801c2a9dfeSAlex Elder 
62819b60e70bSIlya Dryomov 	if (single_major)
62829b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
62839b60e70bSIlya Dryomov 	else
6284e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
62859b60e70bSIlya Dryomov 
6286e1b4d96dSIlya Dryomov 	return 0;
6287e1b4d96dSIlya Dryomov 
62889b60e70bSIlya Dryomov err_out_blkdev:
62899b60e70bSIlya Dryomov 	if (single_major)
62909b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6291f5ee37bdSIlya Dryomov err_out_wq:
6292f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6293e1b4d96dSIlya Dryomov err_out_slab:
6294e1b4d96dSIlya Dryomov 	rbd_slab_exit();
62951c2a9dfeSAlex Elder 	return rc;
6296602adf40SYehuda Sadeh }
6297602adf40SYehuda Sadeh 
6298cc344fa1SAlex Elder static void __exit rbd_exit(void)
6299602adf40SYehuda Sadeh {
6300ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6301602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
63029b60e70bSIlya Dryomov 	if (single_major)
63039b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6304f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
63051c2a9dfeSAlex Elder 	rbd_slab_exit();
6306602adf40SYehuda Sadeh }
6307602adf40SYehuda Sadeh 
6308602adf40SYehuda Sadeh module_init(rbd_init);
6309602adf40SYehuda Sadeh module_exit(rbd_exit);
6310602adf40SYehuda Sadeh 
6311d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6312602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6313602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6314602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6315602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6316602adf40SYehuda Sadeh 
631790da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6318602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6319