xref: /openbmc/linux/drivers/block/rbd.c (revision f9bebd58)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
35602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3659c2be1eSYehuda Sadeh #include <linux/parser.h>
3730d1cff8SAlex Elder #include <linux/bsearch.h>
38602adf40SYehuda Sadeh 
39602adf40SYehuda Sadeh #include <linux/kernel.h>
40602adf40SYehuda Sadeh #include <linux/device.h>
41602adf40SYehuda Sadeh #include <linux/module.h>
427ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
43602adf40SYehuda Sadeh #include <linux/fs.h>
44602adf40SYehuda Sadeh #include <linux/blkdev.h>
451c2a9dfeSAlex Elder #include <linux/slab.h>
46f8a22fc2SIlya Dryomov #include <linux/idr.h>
47bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
48602adf40SYehuda Sadeh 
49602adf40SYehuda Sadeh #include "rbd_types.h"
50602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but the default size is 512
 * bytes everywhere.  These names are only marginally more telling
 * than the raw numbers they stand for.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
61593a9e7bSAlex Elder 
62a2acd00eSAlex Elder /*
63a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
64a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
65a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
66a2acd00eSAlex Elder  * -EINVAL without updating it.
67a2acd00eSAlex Elder  */
68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
69a2acd00eSAlex Elder {
70a2acd00eSAlex Elder 	unsigned int counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
73a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
74a2acd00eSAlex Elder 		return (int)counter;
75a2acd00eSAlex Elder 
76a2acd00eSAlex Elder 	atomic_dec(v);
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	return -EINVAL;
79a2acd00eSAlex Elder }
80a2acd00eSAlex Elder 
81a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
83a2acd00eSAlex Elder {
84a2acd00eSAlex Elder 	int counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
87a2acd00eSAlex Elder 	if (counter >= 0)
88a2acd00eSAlex Elder 		return counter;
89a2acd00eSAlex Elder 
90a2acd00eSAlex Elder 	atomic_inc(v);
91a2acd00eSAlex Elder 
92a2acd00eSAlex Elder 	return -EINVAL;
93a2acd00eSAlex Elder }
94a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* A snapshot name must still fit in NAME_MAX once prefixed */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* An image name sent by an OSD must fit within a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2		(1ULL << 1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL << 2)
#define RBD_FEATURE_DATA_POOL		(1ULL << 7)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL)

/* Features this (client software) implementation supports. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * RBD device names have the form "rbd#": "rbd" is RBD_DRV_NAME above
 * and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh  */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
148849b4260SAlex Elder 	char *object_prefix;
149602adf40SYehuda Sadeh 	__u8 obj_order;
150f35a4deeSAlex Elder 	u64 stripe_unit;
151f35a4deeSAlex Elder 	u64 stripe_count;
1527e97332eSIlya Dryomov 	s64 data_pool_id;
153f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
154602adf40SYehuda Sadeh 
155f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder 	u64 image_size;
157f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
159f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh 
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder  * An rbd image specification.
1640d7dbfceSAlex Elder  *
1650d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
170c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
172c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder  * is shared between the parent and child).
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder  *
184c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder  * could be a null pointer).
1860d7dbfceSAlex Elder  */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder 	u64		pool_id;
189ecb4dc22SAlex Elder 	const char	*pool_name;
1900d7dbfceSAlex Elder 
191ecb4dc22SAlex Elder 	const char	*image_id;
192ecb4dc22SAlex Elder 	const char	*image_name;
1930d7dbfceSAlex Elder 
1940d7dbfceSAlex Elder 	u64		snap_id;
195ecb4dc22SAlex Elder 	const char	*snap_name;
1960d7dbfceSAlex Elder 
1970d7dbfceSAlex Elder 	struct kref	kref;
1980d7dbfceSAlex Elder };
1990d7dbfceSAlex Elder 
200602adf40SYehuda Sadeh /*
201f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
202602adf40SYehuda Sadeh  */
203602adf40SYehuda Sadeh struct rbd_client {
204602adf40SYehuda Sadeh 	struct ceph_client	*client;
205602adf40SYehuda Sadeh 	struct kref		kref;
206602adf40SYehuda Sadeh 	struct list_head	node;
207602adf40SYehuda Sadeh };
208602adf40SYehuda Sadeh 
/* Request types are defined further down; callbacks only need pointers */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
216bf0d5f50SAlex Elder 
/* Selects which data member of an rbd_obj_request's union is in use */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
220bf0d5f50SAlex Elder 
/* Kind of operation an object request carries out */
enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2266d2940c8SGuangliang Zhao 
/* Flags kept in rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
233926f9b3fSAlex Elder 
234bf0d5f50SAlex Elder struct rbd_obj_request {
235a90bb0c1SIlya Dryomov 	u64			object_no;
236bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
237bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
238926f9b3fSAlex Elder 	unsigned long		flags;
239bf0d5f50SAlex Elder 
240c5b5ef6cSAlex Elder 	/*
241c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
242c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
243c5b5ef6cSAlex Elder 	 *
244c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
245c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
246c5b5ef6cSAlex Elder 	 *
247c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
248c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
249c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
250c5b5ef6cSAlex Elder 	 *
251c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
252c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
253c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
254c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
255c5b5ef6cSAlex Elder 	 */
256c5b5ef6cSAlex Elder 	union {
257c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
258c5b5ef6cSAlex Elder 		struct {
259bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
260c5b5ef6cSAlex Elder 			u64			img_offset;
261c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
262c5b5ef6cSAlex Elder 			struct list_head	links;
263c5b5ef6cSAlex Elder 		};
264c5b5ef6cSAlex Elder 	};
265bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	enum obj_request_type	type;
268788e2df3SAlex Elder 	union {
269bf0d5f50SAlex Elder 		struct bio	*bio_list;
270788e2df3SAlex Elder 		struct {
271788e2df3SAlex Elder 			struct page	**pages;
272788e2df3SAlex Elder 			u32		page_count;
273788e2df3SAlex Elder 		};
274788e2df3SAlex Elder 	};
2750eefd470SAlex Elder 	struct page		**copyup_pages;
276ebda6408SAlex Elder 	u32			copyup_page_count;
277bf0d5f50SAlex Elder 
278bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
279bf0d5f50SAlex Elder 
280bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2811b83bef2SSage Weil 	int			result;
282bf0d5f50SAlex Elder 
283bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
284788e2df3SAlex Elder 	struct completion	completion;
285bf0d5f50SAlex Elder 
286bf0d5f50SAlex Elder 	struct kref		kref;
287bf0d5f50SAlex Elder };
288bf0d5f50SAlex Elder 
/* Flags kept in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};
2950c425248SAlex Elder 
296bf0d5f50SAlex Elder struct rbd_img_request {
297bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
298bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
299bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
3000c425248SAlex Elder 	unsigned long		flags;
301bf0d5f50SAlex Elder 	union {
302bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3039849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3049849e986SAlex Elder 	};
3059849e986SAlex Elder 	union {
3069849e986SAlex Elder 		struct request		*rq;		/* block request */
3079849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
308bf0d5f50SAlex Elder 	};
3093d7efd18SAlex Elder 	struct page		**copyup_pages;
310ebda6408SAlex Elder 	u32			copyup_page_count;
311bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
312bf0d5f50SAlex Elder 	u32			next_completion;
313bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
31455f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
315a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
316bf0d5f50SAlex Elder 
317bf0d5f50SAlex Elder 	u32			obj_request_count;
318bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
319bf0d5f50SAlex Elder 
320bf0d5f50SAlex Elder 	struct kref		kref;
321bf0d5f50SAlex Elder };
322bf0d5f50SAlex Elder 
/*
 * Iterators over an image request's obj_requests list, linked through
 * each object request's "links" member.  The _safe variant walks the
 * list in reverse and takes a scratch cursor (n).
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
329bf0d5f50SAlex Elder 
/* Values stored in rbd_device->watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
33599d16943SIlya Dryomov 
/* Values stored in rbd_device->lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
341ed95b21aSIlya Dryomov 
342ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
343ed95b21aSIlya Dryomov struct rbd_client_id {
344ed95b21aSIlya Dryomov 	u64 gid;
345ed95b21aSIlya Dryomov 	u64 handle;
346ed95b21aSIlya Dryomov };
347ed95b21aSIlya Dryomov 
348f84344f3SAlex Elder struct rbd_mapping {
34999c1f08fSAlex Elder 	u64                     size;
35034b13184SAlex Elder 	u64                     features;
351f84344f3SAlex Elder 	bool			read_only;
352f84344f3SAlex Elder };
353f84344f3SAlex Elder 
354602adf40SYehuda Sadeh /*
355602adf40SYehuda Sadeh  * a single device
356602adf40SYehuda Sadeh  */
357602adf40SYehuda Sadeh struct rbd_device {
358de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
359602adf40SYehuda Sadeh 
360602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
361dd82fff1SIlya Dryomov 	int			minor;
362602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
363602adf40SYehuda Sadeh 
364a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
365602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
366602adf40SYehuda Sadeh 
367602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
368602adf40SYehuda Sadeh 
369b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
370602adf40SYehuda Sadeh 
371602adf40SYehuda Sadeh 	struct rbd_image_header	header;
372b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3730d7dbfceSAlex Elder 	struct rbd_spec		*spec;
374d147543dSIlya Dryomov 	struct rbd_options	*opts;
3750d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
376602adf40SYehuda Sadeh 
377c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
378922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
379971f839aSAlex Elder 
3801643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3810903e875SAlex Elder 
38299d16943SIlya Dryomov 	struct mutex		watch_mutex;
38399d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
384922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
38599d16943SIlya Dryomov 	u64			watch_cookie;
38699d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
38759c2be1eSYehuda Sadeh 
388ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
389ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
390cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
391ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
392ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
393ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
394ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
395ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
396ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
397ed95b21aSIlya Dryomov 
3981643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
399602adf40SYehuda Sadeh 
40086b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
40186b00e0dSAlex Elder 	u64			parent_overlap;
402a2acd00eSAlex Elder 	atomic_t		parent_ref;
4032f82ee54SAlex Elder 	struct rbd_device	*parent;
40486b00e0dSAlex Elder 
4057ad18afaSChristoph Hellwig 	/* Block layer tags. */
4067ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4077ad18afaSChristoph Hellwig 
408c666601aSJosh Durgin 	/* protects updating the header */
409c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
410f84344f3SAlex Elder 
411f84344f3SAlex Elder 	struct rbd_mapping	mapping;
412602adf40SYehuda Sadeh 
413602adf40SYehuda Sadeh 	struct list_head	node;
414dfc5606dSYehuda Sadeh 
415dfc5606dSYehuda Sadeh 	/* sysfs related */
416dfc5606dSYehuda Sadeh 	struct device		dev;
417b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
418dfc5606dSYehuda Sadeh };
419dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags:
 * - REMOVING goes hand in hand with rbd_dev->open_count and is
 *   protected by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
4316d292906SAlex Elder 
432cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
433e124a82fSAlex Elder 
434602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
435e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
436e124a82fSAlex Elder 
437602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
438432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
439602adf40SYehuda Sadeh 
44078c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
44178c2a44aSAlex Elder 
4421c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
443868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4441c2a9dfeSAlex Elder 
4459b60e70bSIlya Dryomov static int rbd_major;
446f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
447f8a22fc2SIlya Dryomov 
448f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
449f5ee37bdSIlya Dryomov 
4509b60e70bSIlya Dryomov /*
4519b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4529b60e70bSIlya Dryomov  * userspace rbd utility.
4539b60e70bSIlya Dryomov  */
4549b60e70bSIlya Dryomov static bool single_major = false;
4559b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4569b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4579b60e70bSIlya Dryomov 
4583d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4593d7efd18SAlex Elder 
460f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
461f0f8cef5SAlex Elder 		       size_t count);
462f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
463f0f8cef5SAlex Elder 			  size_t count);
4649b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4659b60e70bSIlya Dryomov 				    size_t count);
4669b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4679b60e70bSIlya Dryomov 				       size_t count);
4686d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
469a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
470f0f8cef5SAlex Elder 
4719b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4729b60e70bSIlya Dryomov {
4737e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4749b60e70bSIlya Dryomov }
4759b60e70bSIlya Dryomov 
4769b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4779b60e70bSIlya Dryomov {
4787e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4799b60e70bSIlya Dryomov }
4809b60e70bSIlya Dryomov 
481ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
482ed95b21aSIlya Dryomov {
483ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
484ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
485ed95b21aSIlya Dryomov }
486ed95b21aSIlya Dryomov 
487ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
488ed95b21aSIlya Dryomov {
489ed95b21aSIlya Dryomov 	bool is_lock_owner;
490ed95b21aSIlya Dryomov 
491ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
492ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
493ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
494ed95b21aSIlya Dryomov 	return is_lock_owner;
495ed95b21aSIlya Dryomov }
496ed95b21aSIlya Dryomov 
4978767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
4988767b293SIlya Dryomov {
4998767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5008767b293SIlya Dryomov }
5018767b293SIlya Dryomov 
502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
5049b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
5059b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
5068767b293SIlya Dryomov static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
507b15a21ddSGreg Kroah-Hartman 
508b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
509b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
510b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5119b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5129b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5138767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
514b15a21ddSGreg Kroah-Hartman 	NULL,
515f0f8cef5SAlex Elder };
51692c76dc0SIlya Dryomov 
51792c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
51892c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
51992c76dc0SIlya Dryomov {
5209b60e70bSIlya Dryomov 	if (!single_major &&
5219b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5229b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5239b60e70bSIlya Dryomov 		return 0;
5249b60e70bSIlya Dryomov 
52592c76dc0SIlya Dryomov 	return attr->mode;
52692c76dc0SIlya Dryomov }
52792c76dc0SIlya Dryomov 
52892c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
52992c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
53092c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
53192c76dc0SIlya Dryomov };
53292c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
533f0f8cef5SAlex Elder 
534f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
535f0f8cef5SAlex Elder 	.name		= "rbd",
536b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
537f0f8cef5SAlex Elder };
538f0f8cef5SAlex Elder 
539f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
540f0f8cef5SAlex Elder {
541f0f8cef5SAlex Elder }
542f0f8cef5SAlex Elder 
543f0f8cef5SAlex Elder static struct device rbd_root_dev = {
544f0f8cef5SAlex Elder 	.init_name =    "rbd",
545f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
546f0f8cef5SAlex Elder };
547f0f8cef5SAlex Elder 
54806ecc6cbSAlex Elder static __printf(2, 3)
54906ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
55006ecc6cbSAlex Elder {
55106ecc6cbSAlex Elder 	struct va_format vaf;
55206ecc6cbSAlex Elder 	va_list args;
55306ecc6cbSAlex Elder 
55406ecc6cbSAlex Elder 	va_start(args, fmt);
55506ecc6cbSAlex Elder 	vaf.fmt = fmt;
55606ecc6cbSAlex Elder 	vaf.va = &args;
55706ecc6cbSAlex Elder 
55806ecc6cbSAlex Elder 	if (!rbd_dev)
55906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
56006ecc6cbSAlex Elder 	else if (rbd_dev->disk)
56106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
56206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
56306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
56406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
56506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
56606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
56706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
56806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
56906ecc6cbSAlex Elder 	else	/* punt */
57006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
57106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
57206ecc6cbSAlex Elder 	va_end(args);
57306ecc6cbSAlex Elder }
57406ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - check an invariant.
 *
 * With RBD_DEBUG defined, a false @expr logs the function name, line
 * number and the text of the expression at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and @expr is
 * not evaluated, so @expr must not have side effects.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: "if (x) rbd_assert(y); else ..." binds the
 * else to the outer if rather than to the macro's internal one.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
587dfc5606dSYehuda Sadeh 
5882761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
589b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
59005a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
59105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5928b3e1a56SAlex Elder 
593cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5942df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
595a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
596e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
59754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
59854cac61fSAlex Elder 					u64 snap_id);
5992ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
6002ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
6012ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
6022ad3d716SAlex Elder 		u64 *snap_features);
60359c2be1eSYehuda Sadeh 
604602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
605602adf40SYehuda Sadeh {
606f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
607b82d167bSAlex Elder 	bool removing = false;
608602adf40SYehuda Sadeh 
609f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
610602adf40SYehuda Sadeh 		return -EROFS;
611602adf40SYehuda Sadeh 
612a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
613b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
614b82d167bSAlex Elder 		removing = true;
615b82d167bSAlex Elder 	else
616b82d167bSAlex Elder 		rbd_dev->open_count++;
617a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
618b82d167bSAlex Elder 	if (removing)
619b82d167bSAlex Elder 		return -ENOENT;
620b82d167bSAlex Elder 
621c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
622340c7a2bSAlex Elder 
623602adf40SYehuda Sadeh 	return 0;
624602adf40SYehuda Sadeh }
625602adf40SYehuda Sadeh 
626db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
627dfc5606dSYehuda Sadeh {
628dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
629b82d167bSAlex Elder 	unsigned long open_count_before;
630b82d167bSAlex Elder 
631a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
632b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
633a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
634b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
635dfc5606dSYehuda Sadeh 
636c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
637dfc5606dSYehuda Sadeh }
638dfc5606dSYehuda Sadeh 
639131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
640131fd9f6SGuangliang Zhao {
64177f33c03SJosh Durgin 	int ret = 0;
642131fd9f6SGuangliang Zhao 	int val;
643131fd9f6SGuangliang Zhao 	bool ro;
64477f33c03SJosh Durgin 	bool ro_changed = false;
645131fd9f6SGuangliang Zhao 
64677f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
647131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
648131fd9f6SGuangliang Zhao 		return -EFAULT;
649131fd9f6SGuangliang Zhao 
650131fd9f6SGuangliang Zhao 	ro = val ? true : false;
651131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
652131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
653131fd9f6SGuangliang Zhao 		return -EROFS;
654131fd9f6SGuangliang Zhao 
65577f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
65677f33c03SJosh Durgin 	/* prevent others open this device */
65777f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
65877f33c03SJosh Durgin 		ret = -EBUSY;
65977f33c03SJosh Durgin 		goto out;
660131fd9f6SGuangliang Zhao 	}
661131fd9f6SGuangliang Zhao 
66277f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
66377f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
66477f33c03SJosh Durgin 		ro_changed = true;
66577f33c03SJosh Durgin 	}
66677f33c03SJosh Durgin 
66777f33c03SJosh Durgin out:
66877f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
66977f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
67077f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
67177f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
67277f33c03SJosh Durgin 
67377f33c03SJosh Durgin 	return ret;
674131fd9f6SGuangliang Zhao }
675131fd9f6SGuangliang Zhao 
676131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
677131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
678131fd9f6SGuangliang Zhao {
679131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
680131fd9f6SGuangliang Zhao 	int ret = 0;
681131fd9f6SGuangliang Zhao 
682131fd9f6SGuangliang Zhao 	switch (cmd) {
683131fd9f6SGuangliang Zhao 	case BLKROSET:
684131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
685131fd9f6SGuangliang Zhao 		break;
686131fd9f6SGuangliang Zhao 	default:
687131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
688131fd9f6SGuangliang Zhao 	}
689131fd9f6SGuangliang Zhao 
690131fd9f6SGuangliang Zhao 	return ret;
691131fd9f6SGuangliang Zhao }
692131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: forwards to rbd_ioctl() with all arguments
 * unchanged and returns its result.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
700131fd9f6SGuangliang Zhao 
/*
 * Block device operations for rbd devices.  The compat ioctl hook is
 * only wired up when CONFIG_COMPAT is enabled, matching the conditional
 * definition of rbd_compat_ioctl().
 */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};
710602adf40SYehuda Sadeh 
711602adf40SYehuda Sadeh /*
7127262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
713cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
714602adf40SYehuda Sadeh  */
715f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
716602adf40SYehuda Sadeh {
717602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
718602adf40SYehuda Sadeh 	int ret = -ENOMEM;
719602adf40SYehuda Sadeh 
72037206ee5SAlex Elder 	dout("%s:\n", __func__);
721602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
722602adf40SYehuda Sadeh 	if (!rbdc)
723602adf40SYehuda Sadeh 		goto out_opt;
724602adf40SYehuda Sadeh 
725602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
726602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
727602adf40SYehuda Sadeh 
72874da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
729602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
73008f75463SAlex Elder 		goto out_rbdc;
73143ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
732602adf40SYehuda Sadeh 
733602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
734602adf40SYehuda Sadeh 	if (ret < 0)
73508f75463SAlex Elder 		goto out_client;
736602adf40SYehuda Sadeh 
737432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
738602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
739432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
740602adf40SYehuda Sadeh 
74137206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
742bc534d86SAlex Elder 
743602adf40SYehuda Sadeh 	return rbdc;
74408f75463SAlex Elder out_client:
745602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
74608f75463SAlex Elder out_rbdc:
747602adf40SYehuda Sadeh 	kfree(rbdc);
748602adf40SYehuda Sadeh out_opt:
74943ae4701SAlex Elder 	if (ceph_opts)
75043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
75137206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
75237206ee5SAlex Elder 
75328f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
754602adf40SYehuda Sadeh }
755602adf40SYehuda Sadeh 
7562f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7572f82ee54SAlex Elder {
7582f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7592f82ee54SAlex Elder 
7602f82ee54SAlex Elder 	return rbdc;
7612f82ee54SAlex Elder }
7622f82ee54SAlex Elder 
763602adf40SYehuda Sadeh /*
7641f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7651f7ba331SAlex Elder  * found, bump its reference count.
766602adf40SYehuda Sadeh  */
7671f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
768602adf40SYehuda Sadeh {
769602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7701f7ba331SAlex Elder 	bool found = false;
771602adf40SYehuda Sadeh 
77243ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
773602adf40SYehuda Sadeh 		return NULL;
774602adf40SYehuda Sadeh 
7751f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7761f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7771f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7782f82ee54SAlex Elder 			__rbd_get_client(client_node);
7792f82ee54SAlex Elder 
7801f7ba331SAlex Elder 			found = true;
7811f7ba331SAlex Elder 			break;
7821f7ba331SAlex Elder 		}
7831f7ba331SAlex Elder 	}
7841f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7851f7ba331SAlex Elder 
7861f7ba331SAlex Elder 	return found ? client_node : NULL;
787602adf40SYehuda Sadeh }
788602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * The order matters: parse_rbd_opts_token() treats every token below
 * Opt_last_int as taking an int argument, and every token strictly
 * between Opt_last_int and Opt_last_string as taking a string argument.
 * Tokens after Opt_last_string take no argument.
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_err
};
80359c2be1eSYehuda Sadeh 
/*
 * Pattern table handed to match_token() by parse_rbd_opts_token().
 * The {Opt_err, NULL} entry terminates the table.
 */
static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_err, NULL}
};
81559c2be1eSYehuda Sadeh 
/* Parsed per-device map options; filled in by parse_rbd_opts_token(). */
struct rbd_options {
	int	queue_depth;	/* "queue_depth=%d"; accepted only if >= 1 */
	bool	read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
	bool	lock_on_read;	/* set by "lock_on_read" */
};
82198571b5aSAlex Elder 
/*
 * Default values, one per struct rbd_options field.
 * NOTE(review): where these are applied is not visible here; presumably
 * before option parsing starts -- confirm against the caller.
 */
#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
82598571b5aSAlex Elder 
82659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
82759c2be1eSYehuda Sadeh {
82843ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
82959c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
83059c2be1eSYehuda Sadeh 	int token, intval, ret;
83159c2be1eSYehuda Sadeh 
83243ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
83359c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
83459c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
83559c2be1eSYehuda Sadeh 		if (ret < 0) {
836210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
83759c2be1eSYehuda Sadeh 			return ret;
83859c2be1eSYehuda Sadeh 		}
83959c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
84059c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
841210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
84259c2be1eSYehuda Sadeh 	} else {
84359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
84459c2be1eSYehuda Sadeh 	}
84559c2be1eSYehuda Sadeh 
84659c2be1eSYehuda Sadeh 	switch (token) {
847b5584180SIlya Dryomov 	case Opt_queue_depth:
848b5584180SIlya Dryomov 		if (intval < 1) {
849b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
850b5584180SIlya Dryomov 			return -EINVAL;
851b5584180SIlya Dryomov 		}
852b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
853b5584180SIlya Dryomov 		break;
854cc0538b6SAlex Elder 	case Opt_read_only:
855cc0538b6SAlex Elder 		rbd_opts->read_only = true;
856cc0538b6SAlex Elder 		break;
857cc0538b6SAlex Elder 	case Opt_read_write:
858cc0538b6SAlex Elder 		rbd_opts->read_only = false;
859cc0538b6SAlex Elder 		break;
86080de1912SIlya Dryomov 	case Opt_lock_on_read:
86180de1912SIlya Dryomov 		rbd_opts->lock_on_read = true;
86280de1912SIlya Dryomov 		break;
86359c2be1eSYehuda Sadeh 	default:
864210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
865210c104cSIlya Dryomov 		return -EINVAL;
86659c2be1eSYehuda Sadeh 	}
867210c104cSIlya Dryomov 
86859c2be1eSYehuda Sadeh 	return 0;
86959c2be1eSYehuda Sadeh }
87059c2be1eSYehuda Sadeh 
8716d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8726d2940c8SGuangliang Zhao {
8736d2940c8SGuangliang Zhao 	switch (op_type) {
8746d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8756d2940c8SGuangliang Zhao 		return "read";
8766d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8776d2940c8SGuangliang Zhao 		return "write";
87890e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
87990e98c52SGuangliang Zhao 		return "discard";
8806d2940c8SGuangliang Zhao 	default:
8816d2940c8SGuangliang Zhao 		return "???";
8826d2940c8SGuangliang Zhao 	}
8836d2940c8SGuangliang Zhao }
8846d2940c8SGuangliang Zhao 
88559c2be1eSYehuda Sadeh /*
886602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8877262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8887262cfcaSAlex Elder  * function.
889602adf40SYehuda Sadeh  */
8909d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
891602adf40SYehuda Sadeh {
892f8c38929SAlex Elder 	struct rbd_client *rbdc;
89359c2be1eSYehuda Sadeh 
894cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8951f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8969d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
89743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8989d3997fdSAlex Elder 	else
899f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
900cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
901d720bcb0SAlex Elder 
9029d3997fdSAlex Elder 	return rbdc;
903602adf40SYehuda Sadeh }
904602adf40SYehuda Sadeh 
905602adf40SYehuda Sadeh /*
906602adf40SYehuda Sadeh  * Destroy ceph client
907d23a4b3fSAlex Elder  *
908432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
909602adf40SYehuda Sadeh  */
910602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
911602adf40SYehuda Sadeh {
912602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
913602adf40SYehuda Sadeh 
91437206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
915cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
916602adf40SYehuda Sadeh 	list_del(&rbdc->node);
917cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
918602adf40SYehuda Sadeh 
919602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
920602adf40SYehuda Sadeh 	kfree(rbdc);
921602adf40SYehuda Sadeh }
922602adf40SYehuda Sadeh 
923602adf40SYehuda Sadeh /*
924602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
925602adf40SYehuda Sadeh  * it.
926602adf40SYehuda Sadeh  */
9279d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
928602adf40SYehuda Sadeh {
929c53d5893SAlex Elder 	if (rbdc)
9309d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
931602adf40SYehuda Sadeh }
932602adf40SYehuda Sadeh 
933a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
934a30b71b9SAlex Elder {
935a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
936a30b71b9SAlex Elder }
937a30b71b9SAlex Elder 
9388e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9398e94af8eSAlex Elder {
940103a150fSAlex Elder 	size_t size;
941103a150fSAlex Elder 	u32 snap_count;
942103a150fSAlex Elder 
943103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
944103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
945103a150fSAlex Elder 		return false;
946103a150fSAlex Elder 
947db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
948db2388b6SAlex Elder 
949db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
950db2388b6SAlex Elder 		return false;
951db2388b6SAlex Elder 
952db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
953db2388b6SAlex Elder 
954db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
955db2388b6SAlex Elder 		return false;
956db2388b6SAlex Elder 
957103a150fSAlex Elder 	/*
958103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
959103a150fSAlex Elder 	 * that limits the number of snapshots.
960103a150fSAlex Elder 	 */
961103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
962103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
963103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
964103a150fSAlex Elder 		return false;
965103a150fSAlex Elder 
966103a150fSAlex Elder 	/*
967103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
968103a150fSAlex Elder 	 * header must also be representable in a size_t.
969103a150fSAlex Elder 	 */
970103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
971103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
972103a150fSAlex Elder 		return false;
973103a150fSAlex Elder 
974103a150fSAlex Elder 	return true;
9758e94af8eSAlex Elder }
9768e94af8eSAlex Elder 
977602adf40SYehuda Sadeh /*
9785bc3fb17SIlya Dryomov  * returns the size of an object in the image
9795bc3fb17SIlya Dryomov  */
9805bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9815bc3fb17SIlya Dryomov {
9825bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
9835bc3fb17SIlya Dryomov }
9845bc3fb17SIlya Dryomov 
985263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
986263423f8SIlya Dryomov {
987263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
988263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
989263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
990263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
991263423f8SIlya Dryomov 	}
992263423f8SIlya Dryomov 
993263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
994263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
995263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
9967e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
9977e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
998263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
999263423f8SIlya Dryomov }
1000263423f8SIlya Dryomov 
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * The first call for a device (header->object_prefix still NULL) also
 * records the object prefix and object order and initializes the
 * layout.  Later calls release the previous snapshot context, names
 * and sizes before installing the new ones.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or -EIO if the
 * snapshot names length does not fit in a size_t.  On failure the
 * header is left untouched.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		/* Refresh: drop the old snapshot data before replacing it */
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	/* Anything not yet allocated is still NULL at this point */
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
1101602adf40SYehuda Sadeh 
11029682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11039682fc6dSAlex Elder {
11049682fc6dSAlex Elder 	const char *snap_name;
11059682fc6dSAlex Elder 
11069682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11079682fc6dSAlex Elder 
11089682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11099682fc6dSAlex Elder 
11109682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11119682fc6dSAlex Elder 	while (which--)
11129682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11139682fc6dSAlex Elder 
11149682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11159682fc6dSAlex Elder }
11169682fc6dSAlex Elder 
111730d1cff8SAlex Elder /*
111830d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
111930d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
112030d1cff8SAlex Elder  */
112130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
112230d1cff8SAlex Elder {
112330d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
112430d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
112530d1cff8SAlex Elder 
112630d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
112730d1cff8SAlex Elder 		return 1;
112830d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
112930d1cff8SAlex Elder }
113030d1cff8SAlex Elder 
113130d1cff8SAlex Elder /*
113230d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
113330d1cff8SAlex Elder  * present.
113430d1cff8SAlex Elder  *
113530d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
113630d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
113730d1cff8SAlex Elder  *
113830d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
113930d1cff8SAlex Elder  * reverse order, highest snapshot id first.
114030d1cff8SAlex Elder  */
11419682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11429682fc6dSAlex Elder {
11439682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
114430d1cff8SAlex Elder 	u64 *found;
11459682fc6dSAlex Elder 
114630d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
114730d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11489682fc6dSAlex Elder 
114930d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11509682fc6dSAlex Elder }
11519682fc6dSAlex Elder 
11522ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11532ad3d716SAlex Elder 					u64 snap_id)
115454cac61fSAlex Elder {
115554cac61fSAlex Elder 	u32 which;
1156da6a6b63SJosh Durgin 	const char *snap_name;
115754cac61fSAlex Elder 
115854cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
115954cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1160da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
116154cac61fSAlex Elder 
1162da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1163da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
116454cac61fSAlex Elder }
116554cac61fSAlex Elder 
11669e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11679e15b77dSAlex Elder {
11689e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11699e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11709e15b77dSAlex Elder 
117154cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
117254cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
117354cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11749e15b77dSAlex Elder 
117554cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11769e15b77dSAlex Elder }
11779e15b77dSAlex Elder 
11782ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11792ad3d716SAlex Elder 				u64 *snap_size)
1180602adf40SYehuda Sadeh {
11812ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11822ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11832ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11842ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11852ad3d716SAlex Elder 		u32 which;
118600f1f36fSAlex Elder 
11872ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11882ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11892ad3d716SAlex Elder 			return -ENOENT;
119000f1f36fSAlex Elder 
11912ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11922ad3d716SAlex Elder 	} else {
11932ad3d716SAlex Elder 		u64 size = 0;
11942ad3d716SAlex Elder 		int ret;
11952ad3d716SAlex Elder 
11962ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11972ad3d716SAlex Elder 		if (ret)
11982ad3d716SAlex Elder 			return ret;
11992ad3d716SAlex Elder 
12002ad3d716SAlex Elder 		*snap_size = size;
12012ad3d716SAlex Elder 	}
12022ad3d716SAlex Elder 	return 0;
12032ad3d716SAlex Elder }
12042ad3d716SAlex Elder 
12052ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12062ad3d716SAlex Elder 			u64 *snap_features)
12072ad3d716SAlex Elder {
12082ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12092ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12102ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12112ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12122ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12132ad3d716SAlex Elder 	} else {
12142ad3d716SAlex Elder 		u64 features = 0;
12152ad3d716SAlex Elder 		int ret;
12162ad3d716SAlex Elder 
12172ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12182ad3d716SAlex Elder 		if (ret)
12192ad3d716SAlex Elder 			return ret;
12202ad3d716SAlex Elder 
12212ad3d716SAlex Elder 		*snap_features = features;
12222ad3d716SAlex Elder 	}
12232ad3d716SAlex Elder 	return 0;
122400f1f36fSAlex Elder }
1225602adf40SYehuda Sadeh 
1226d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1227602adf40SYehuda Sadeh {
12288f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12292ad3d716SAlex Elder 	u64 size = 0;
12302ad3d716SAlex Elder 	u64 features = 0;
12312ad3d716SAlex Elder 	int ret;
12328b0241f8SAlex Elder 
12332ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12342ad3d716SAlex Elder 	if (ret)
12352ad3d716SAlex Elder 		return ret;
12362ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12372ad3d716SAlex Elder 	if (ret)
12382ad3d716SAlex Elder 		return ret;
12392ad3d716SAlex Elder 
12402ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12412ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12422ad3d716SAlex Elder 
12438b0241f8SAlex Elder 	return 0;
1244602adf40SYehuda Sadeh }
1245602adf40SYehuda Sadeh 
1246d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1247d1cf5788SAlex Elder {
1248d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1249d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1250200a6a8bSAlex Elder }
1251200a6a8bSAlex Elder 
125265ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
125365ccfe21SAlex Elder {
12545bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
1255602adf40SYehuda Sadeh 
125665ccfe21SAlex Elder 	return offset & (segment_size - 1);
125765ccfe21SAlex Elder }
125865ccfe21SAlex Elder 
125965ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
126065ccfe21SAlex Elder 				u64 offset, u64 length)
126165ccfe21SAlex Elder {
12625bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
126365ccfe21SAlex Elder 
126465ccfe21SAlex Elder 	offset &= segment_size - 1;
126565ccfe21SAlex Elder 
1266aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
126765ccfe21SAlex Elder 	if (offset + length > segment_size)
126865ccfe21SAlex Elder 		length = segment_size - offset;
126965ccfe21SAlex Elder 
127065ccfe21SAlex Elder 	return length;
1271602adf40SYehuda Sadeh }
1272602adf40SYehuda Sadeh 
1273602adf40SYehuda Sadeh /*
1274602adf40SYehuda Sadeh  * bio helpers
1275602adf40SYehuda Sadeh  */
1276602adf40SYehuda Sadeh 
1277602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1278602adf40SYehuda Sadeh {
1279602adf40SYehuda Sadeh 	struct bio *tmp;
1280602adf40SYehuda Sadeh 
1281602adf40SYehuda Sadeh 	while (chain) {
1282602adf40SYehuda Sadeh 		tmp = chain;
1283602adf40SYehuda Sadeh 		chain = chain->bi_next;
1284602adf40SYehuda Sadeh 		bio_put(tmp);
1285602adf40SYehuda Sadeh 	}
1286602adf40SYehuda Sadeh }
1287602adf40SYehuda Sadeh 
1288602adf40SYehuda Sadeh /*
1289602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1290602adf40SYehuda Sadeh  */
1291602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1292602adf40SYehuda Sadeh {
12937988613bSKent Overstreet 	struct bio_vec bv;
12947988613bSKent Overstreet 	struct bvec_iter iter;
1295602adf40SYehuda Sadeh 	unsigned long flags;
1296602adf40SYehuda Sadeh 	void *buf;
1297602adf40SYehuda Sadeh 	int pos = 0;
1298602adf40SYehuda Sadeh 
1299602adf40SYehuda Sadeh 	while (chain) {
13007988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
13017988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1302602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
13037988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1304602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
13057988613bSKent Overstreet 				       bv.bv_len - remainder);
13067988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
130785b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1308602adf40SYehuda Sadeh 			}
13097988613bSKent Overstreet 			pos += bv.bv_len;
1310602adf40SYehuda Sadeh 		}
1311602adf40SYehuda Sadeh 
1312602adf40SYehuda Sadeh 		chain = chain->bi_next;
1313602adf40SYehuda Sadeh 	}
1314602adf40SYehuda Sadeh }
1315602adf40SYehuda Sadeh 
1316602adf40SYehuda Sadeh /*
1317b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1318b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1319b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1320b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1321b9434c5bSAlex Elder  */
1322b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1323b9434c5bSAlex Elder {
1324b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1325b9434c5bSAlex Elder 
1326b9434c5bSAlex Elder 	rbd_assert(end > offset);
1327b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1328b9434c5bSAlex Elder 	while (offset < end) {
1329b9434c5bSAlex Elder 		size_t page_offset;
1330b9434c5bSAlex Elder 		size_t length;
1331b9434c5bSAlex Elder 		unsigned long flags;
1332b9434c5bSAlex Elder 		void *kaddr;
1333b9434c5bSAlex Elder 
1334491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1335491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1336b9434c5bSAlex Elder 		local_irq_save(flags);
1337b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1338b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1339e2156054SAlex Elder 		flush_dcache_page(*page);
1340b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1341b9434c5bSAlex Elder 		local_irq_restore(flags);
1342b9434c5bSAlex Elder 
1343b9434c5bSAlex Elder 		offset += length;
1344b9434c5bSAlex Elder 		page++;
1345b9434c5bSAlex Elder 	}
1346b9434c5bSAlex Elder }
1347b9434c5bSAlex Elder 
1348b9434c5bSAlex Elder /*
1349f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1350f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1351602adf40SYehuda Sadeh  */
1352f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1353f7760dadSAlex Elder 					unsigned int offset,
1354f7760dadSAlex Elder 					unsigned int len,
1355f7760dadSAlex Elder 					gfp_t gfpmask)
1356602adf40SYehuda Sadeh {
1357f7760dadSAlex Elder 	struct bio *bio;
1358602adf40SYehuda Sadeh 
13595341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1360f7760dadSAlex Elder 	if (!bio)
1361f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1362f7760dadSAlex Elder 
13635341a627SKent Overstreet 	bio_advance(bio, offset);
13644f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1365602adf40SYehuda Sadeh 
1366f7760dadSAlex Elder 	return bio;
1367602adf40SYehuda Sadeh }
1368602adf40SYehuda Sadeh 
1369f7760dadSAlex Elder /*
1370f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1371f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1372f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1373f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1374f7760dadSAlex Elder  *
1375f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1376f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1377f7760dadSAlex Elder  * the start of data to be cloned is located.
1378f7760dadSAlex Elder  *
1379f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1380f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1381f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1382f7760dadSAlex Elder  */
1383f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1384f7760dadSAlex Elder 					unsigned int *offset,
1385f7760dadSAlex Elder 					unsigned int len,
1386f7760dadSAlex Elder 					gfp_t gfpmask)
1387f7760dadSAlex Elder {
1388f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1389f7760dadSAlex Elder 	unsigned int off = *offset;
1390f7760dadSAlex Elder 	struct bio *chain = NULL;
1391f7760dadSAlex Elder 	struct bio **end;
1392602adf40SYehuda Sadeh 
1393f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1394602adf40SYehuda Sadeh 
13954f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1396f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1397602adf40SYehuda Sadeh 
1398f7760dadSAlex Elder 	end = &chain;
1399f7760dadSAlex Elder 	while (len) {
1400f7760dadSAlex Elder 		unsigned int bi_size;
1401f7760dadSAlex Elder 		struct bio *bio;
1402f7760dadSAlex Elder 
1403f5400b7aSAlex Elder 		if (!bi) {
1404f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1405f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1406f5400b7aSAlex Elder 		}
14074f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1408f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1409f7760dadSAlex Elder 		if (!bio)
1410f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1411f7760dadSAlex Elder 
1412f7760dadSAlex Elder 		*end = bio;
1413f7760dadSAlex Elder 		end = &bio->bi_next;
1414f7760dadSAlex Elder 
1415f7760dadSAlex Elder 		off += bi_size;
14164f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1417f7760dadSAlex Elder 			bi = bi->bi_next;
1418f7760dadSAlex Elder 			off = 0;
1419f7760dadSAlex Elder 		}
1420f7760dadSAlex Elder 		len -= bi_size;
1421f7760dadSAlex Elder 	}
1422f7760dadSAlex Elder 	*bio_src = bi;
1423f7760dadSAlex Elder 	*offset = off;
1424f7760dadSAlex Elder 
1425f7760dadSAlex Elder 	return chain;
1426f7760dadSAlex Elder out_err:
1427f7760dadSAlex Elder 	bio_chain_put(chain);
1428f7760dadSAlex Elder 
1429602adf40SYehuda Sadeh 	return NULL;
1430602adf40SYehuda Sadeh }
1431602adf40SYehuda Sadeh 
1432926f9b3fSAlex Elder /*
1433926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1434926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1435926f9b3fSAlex Elder  * again.
1436926f9b3fSAlex Elder  */
14376365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
14386365d33aSAlex Elder {
14396365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14406365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14416365d33aSAlex Elder 
144257acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14439584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14446365d33aSAlex Elder 			obj_request);
14456365d33aSAlex Elder 	}
14466365d33aSAlex Elder }
14476365d33aSAlex Elder 
14486365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14496365d33aSAlex Elder {
14506365d33aSAlex Elder 	smp_mb();
14516365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14526365d33aSAlex Elder }
14536365d33aSAlex Elder 
145457acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
145557acbaa7SAlex Elder {
145657acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
145757acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
145857acbaa7SAlex Elder 
145957acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
146057acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14619584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
146257acbaa7SAlex Elder 			obj_request);
146357acbaa7SAlex Elder 	}
146457acbaa7SAlex Elder }
146557acbaa7SAlex Elder 
146657acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
146757acbaa7SAlex Elder {
146857acbaa7SAlex Elder 	smp_mb();
146957acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
147057acbaa7SAlex Elder }
147157acbaa7SAlex Elder 
14725679c59fSAlex Elder /*
14735679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14745679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14755679c59fSAlex Elder  *
14765679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14775679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14785679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14795679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14805679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14815679c59fSAlex Elder  */
14825679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14835679c59fSAlex Elder 				bool exists)
14845679c59fSAlex Elder {
14855679c59fSAlex Elder 	if (exists)
14865679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14875679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14885679c59fSAlex Elder 	smp_mb();
14895679c59fSAlex Elder }
14905679c59fSAlex Elder 
14915679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14925679c59fSAlex Elder {
14935679c59fSAlex Elder 	smp_mb();
14945679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14955679c59fSAlex Elder }
14965679c59fSAlex Elder 
14975679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14985679c59fSAlex Elder {
14995679c59fSAlex Elder 	smp_mb();
15005679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
15015679c59fSAlex Elder }
15025679c59fSAlex Elder 
15039638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
15049638556aSIlya Dryomov {
15059638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
15069638556aSIlya Dryomov 
15079638556aSIlya Dryomov 	return obj_request->img_offset <
15089638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
15099638556aSIlya Dryomov }
15109638556aSIlya Dryomov 
1511bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1512bf0d5f50SAlex Elder {
151337206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
15142c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1515bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1516bf0d5f50SAlex Elder }
1517bf0d5f50SAlex Elder 
1518bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1519bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1520bf0d5f50SAlex Elder {
1521bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
152237206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
15232c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1524bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1525bf0d5f50SAlex Elder }
1526bf0d5f50SAlex Elder 
15270f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
15280f2d5be7SAlex Elder {
15290f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15302c935bc5SPeter Zijlstra 	     kref_read(&img_request->kref));
15310f2d5be7SAlex Elder 	kref_get(&img_request->kref);
15320f2d5be7SAlex Elder }
15330f2d5be7SAlex Elder 
1534e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1535e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1536bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1537bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1538bf0d5f50SAlex Elder {
1539bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
154037206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15412c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1542e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1543e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1544e93f3152SAlex Elder 	else
1545bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1546bf0d5f50SAlex Elder }
1547bf0d5f50SAlex Elder 
1548bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1549bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1550bf0d5f50SAlex Elder {
155125dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
155225dcf954SAlex Elder 
1553b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1554bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
155525dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15566365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15576365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1558bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
155925dcf954SAlex Elder 	img_request->obj_request_count++;
156025dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
156137206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
156237206ee5SAlex Elder 		obj_request->which);
1563bf0d5f50SAlex Elder }
1564bf0d5f50SAlex Elder 
1565bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1566bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1567bf0d5f50SAlex Elder {
1568bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
156925dcf954SAlex Elder 
157037206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
157137206ee5SAlex Elder 		obj_request->which);
1572bf0d5f50SAlex Elder 	list_del(&obj_request->links);
157325dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
157425dcf954SAlex Elder 	img_request->obj_request_count--;
157525dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
157625dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15776365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1578bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1579bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
158025dcf954SAlex Elder 	obj_request->callback = NULL;
1581bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1582bf0d5f50SAlex Elder }
1583bf0d5f50SAlex Elder 
1584bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1585bf0d5f50SAlex Elder {
1586bf0d5f50SAlex Elder 	switch (type) {
15879969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1588bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1589788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1590bf0d5f50SAlex Elder 		return true;
1591bf0d5f50SAlex Elder 	default:
1592bf0d5f50SAlex Elder 		return false;
1593bf0d5f50SAlex Elder 	}
1594bf0d5f50SAlex Elder }
1595bf0d5f50SAlex Elder 
15964a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
15974a17dadcSIlya Dryomov 
1598980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1599bf0d5f50SAlex Elder {
1600980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1601980917fcSIlya Dryomov 
1602a90bb0c1SIlya Dryomov 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1603a90bb0c1SIlya Dryomov 	     obj_request, obj_request->object_no, obj_request->offset,
160467e2b652SIlya Dryomov 	     obj_request->length, osd_req);
16054a17dadcSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
16064a17dadcSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
16074a17dadcSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
16084a17dadcSIlya Dryomov 	}
1609980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1610bf0d5f50SAlex Elder }
1611bf0d5f50SAlex Elder 
1612bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1613bf0d5f50SAlex Elder {
161455f27e09SAlex Elder 
161537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
161655f27e09SAlex Elder 
161755f27e09SAlex Elder 	/*
161855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
161955f27e09SAlex Elder 	 * count for the image request.  We could instead use
162055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
162155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
162255f27e09SAlex Elder 	 */
162355f27e09SAlex Elder 	if (!img_request->result) {
162455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
162555f27e09SAlex Elder 		u64 xferred = 0;
162655f27e09SAlex Elder 
162755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
162855f27e09SAlex Elder 			xferred += obj_request->xferred;
162955f27e09SAlex Elder 		img_request->xferred = xferred;
163055f27e09SAlex Elder 	}
163155f27e09SAlex Elder 
1632bf0d5f50SAlex Elder 	if (img_request->callback)
1633bf0d5f50SAlex Elder 		img_request->callback(img_request);
1634bf0d5f50SAlex Elder 	else
1635bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1636bf0d5f50SAlex Elder }
1637bf0d5f50SAlex Elder 
16380c425248SAlex Elder /*
16390c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16400c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16410c425248SAlex Elder  * and currently never change thereafter.
16420c425248SAlex Elder  */
16430c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16440c425248SAlex Elder {
16450c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16460c425248SAlex Elder 	smp_mb();
16470c425248SAlex Elder }
16480c425248SAlex Elder 
16490c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16500c425248SAlex Elder {
16510c425248SAlex Elder 	smp_mb();
16520c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16530c425248SAlex Elder }
16540c425248SAlex Elder 
165590e98c52SGuangliang Zhao /*
165690e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
165790e98c52SGuangliang Zhao  */
165890e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
165990e98c52SGuangliang Zhao {
166090e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
166190e98c52SGuangliang Zhao 	smp_mb();
166290e98c52SGuangliang Zhao }
166390e98c52SGuangliang Zhao 
166490e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
166590e98c52SGuangliang Zhao {
166690e98c52SGuangliang Zhao 	smp_mb();
166790e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
166890e98c52SGuangliang Zhao }
166990e98c52SGuangliang Zhao 
16709849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16719849e986SAlex Elder {
16729849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16739849e986SAlex Elder 	smp_mb();
16749849e986SAlex Elder }
16759849e986SAlex Elder 
1676e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1677e93f3152SAlex Elder {
1678e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1679e93f3152SAlex Elder 	smp_mb();
1680e93f3152SAlex Elder }
1681e93f3152SAlex Elder 
16829849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16839849e986SAlex Elder {
16849849e986SAlex Elder 	smp_mb();
16859849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16869849e986SAlex Elder }
16879849e986SAlex Elder 
1688d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1689d0b2e944SAlex Elder {
1690d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1691d0b2e944SAlex Elder 	smp_mb();
1692d0b2e944SAlex Elder }
1693d0b2e944SAlex Elder 
1694a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1695a2acd00eSAlex Elder {
1696a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1697a2acd00eSAlex Elder 	smp_mb();
1698a2acd00eSAlex Elder }
1699a2acd00eSAlex Elder 
1700d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1701d0b2e944SAlex Elder {
1702d0b2e944SAlex Elder 	smp_mb();
1703d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1704d0b2e944SAlex Elder }
1705d0b2e944SAlex Elder 
17063b434a2aSJosh Durgin static enum obj_operation_type
17073b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17083b434a2aSJosh Durgin {
17093b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17103b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17113b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17123b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17133b434a2aSJosh Durgin 	else
17143b434a2aSJosh Durgin 		return OBJ_OP_READ;
17153b434a2aSJosh Durgin }
17163b434a2aSJosh Durgin 
17176e2a4505SAlex Elder static void
17186e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17196e2a4505SAlex Elder {
1720b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1721b9434c5bSAlex Elder 	u64 length = obj_request->length;
1722b9434c5bSAlex Elder 
17236e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17246e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1725b9434c5bSAlex Elder 		xferred, length);
17266e2a4505SAlex Elder 	/*
172717c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
172817c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
172917c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
173017c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
173117c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
173217c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17336e2a4505SAlex Elder 	 */
1734b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17356e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1736b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17376e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1738b9434c5bSAlex Elder 		else
1739b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17406e2a4505SAlex Elder 		obj_request->result = 0;
1741b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1742b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1743b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1744b9434c5bSAlex Elder 		else
1745b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17466e2a4505SAlex Elder 	}
174717c1cc1dSJosh Durgin 	obj_request->xferred = length;
17486e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17496e2a4505SAlex Elder }
17506e2a4505SAlex Elder 
1751bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1752bf0d5f50SAlex Elder {
175337206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
175437206ee5SAlex Elder 		obj_request->callback);
1755bf0d5f50SAlex Elder 	if (obj_request->callback)
1756bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1757788e2df3SAlex Elder 	else
1758788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1759bf0d5f50SAlex Elder }
1760bf0d5f50SAlex Elder 
17610dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
17620dcc685eSIlya Dryomov {
17630dcc685eSIlya Dryomov 	obj_request->result = err;
17640dcc685eSIlya Dryomov 	obj_request->xferred = 0;
17650dcc685eSIlya Dryomov 	/*
17660dcc685eSIlya Dryomov 	 * kludge - mirror rbd_obj_request_submit() to match a put in
17670dcc685eSIlya Dryomov 	 * rbd_img_obj_callback()
17680dcc685eSIlya Dryomov 	 */
17690dcc685eSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
17700dcc685eSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
17710dcc685eSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
17720dcc685eSIlya Dryomov 	}
17730dcc685eSIlya Dryomov 	obj_request_done_set(obj_request);
17740dcc685eSIlya Dryomov 	rbd_obj_request_complete(obj_request);
17750dcc685eSIlya Dryomov }
17760dcc685eSIlya Dryomov 
1777c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1778bf0d5f50SAlex Elder {
177957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1780a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
178157acbaa7SAlex Elder 	bool layered = false;
178257acbaa7SAlex Elder 
178357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
178457acbaa7SAlex Elder 		img_request = obj_request->img_request;
178557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1786a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
178757acbaa7SAlex Elder 	}
17888b3e1a56SAlex Elder 
17898b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17908b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17918b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1792a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1793a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17948b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17958b3e1a56SAlex Elder 	else if (img_request)
17966e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17976e2a4505SAlex Elder 	else
179807741308SAlex Elder 		obj_request_done_set(obj_request);
1799bf0d5f50SAlex Elder }
1800bf0d5f50SAlex Elder 
1801c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1802bf0d5f50SAlex Elder {
18031b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
18041b83bef2SSage Weil 		obj_request->result, obj_request->length);
18051b83bef2SSage Weil 	/*
18068b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
18078b3e1a56SAlex Elder 	 * it to our originally-requested length.
18081b83bef2SSage Weil 	 */
18091b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
181007741308SAlex Elder 	obj_request_done_set(obj_request);
1811bf0d5f50SAlex Elder }
1812bf0d5f50SAlex Elder 
181390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
181490e98c52SGuangliang Zhao {
181590e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
181690e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
181790e98c52SGuangliang Zhao 	/*
181890e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
181990e98c52SGuangliang Zhao 	 * it to our originally-requested length.
182090e98c52SGuangliang Zhao 	 */
182190e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1822d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1823d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1824d0265de7SJosh Durgin 		obj_request->result = 0;
182590e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
182690e98c52SGuangliang Zhao }
182790e98c52SGuangliang Zhao 
/*
 * Handle completion of an OSD stat op.  A plain stat call needs no
 * processing here beyond marking the request done; further work for a
 * stat issued as part of a write sequence on a layered image is not
 * done in this function.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1837fbfab539SAlex Elder 
/*
 * Handle completion of an OSD class method call.  A call that belongs
 * to an image request is passed to rbd_osd_copyup_callback(); any
 * other call is simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18472761713dSIlya Dryomov 
184885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1849bf0d5f50SAlex Elder {
1850bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1851bf0d5f50SAlex Elder 	u16 opcode;
1852bf0d5f50SAlex Elder 
185385e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1854bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
185557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
185657acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
185757acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
185857acbaa7SAlex Elder 	} else {
185957acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
186057acbaa7SAlex Elder 	}
1861bf0d5f50SAlex Elder 
18621b83bef2SSage Weil 	if (osd_req->r_result < 0)
18631b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1864bf0d5f50SAlex Elder 
1865c47f9371SAlex Elder 	/*
1866c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18677ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18687ad18afaSChristoph Hellwig 	 * length field.
1869c47f9371SAlex Elder 	 */
18707665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1871c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18720ccd5926SIlya Dryomov 
187379528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1874bf0d5f50SAlex Elder 	switch (opcode) {
1875bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1876c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1877bf0d5f50SAlex Elder 		break;
18780ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1879e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1880e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
18810ccd5926SIlya Dryomov 		/* fall through */
1882bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1883e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1884c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1885bf0d5f50SAlex Elder 		break;
1886fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1887c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1888fbfab539SAlex Elder 		break;
188990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
189090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
189190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
189290e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
189390e98c52SGuangliang Zhao 		break;
189436be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
18952761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
18962761713dSIlya Dryomov 		break;
1897bf0d5f50SAlex Elder 	default:
1898a90bb0c1SIlya Dryomov 		rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
1899a90bb0c1SIlya Dryomov 			 obj_request->object_no, opcode);
1900bf0d5f50SAlex Elder 		break;
1901bf0d5f50SAlex Elder 	}
1902bf0d5f50SAlex Elder 
190307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1904bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1905bf0d5f50SAlex Elder }
1906bf0d5f50SAlex Elder 
19079d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1908430c28c3SAlex Elder {
19098c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1910430c28c3SAlex Elder 
19117c84883aSIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
19127c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
19139d4df01fSAlex Elder }
19149d4df01fSAlex Elder 
19159d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19169d4df01fSAlex Elder {
19179d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19189d4df01fSAlex Elder 
1919bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1920bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1921430c28c3SAlex Elder }
1922430c28c3SAlex Elder 
1923bc81207eSIlya Dryomov static struct ceph_osd_request *
1924bc81207eSIlya Dryomov __rbd_osd_req_create(struct rbd_device *rbd_dev,
1925bc81207eSIlya Dryomov 		     struct ceph_snap_context *snapc,
1926bc81207eSIlya Dryomov 		     int num_ops, unsigned int flags,
1927bc81207eSIlya Dryomov 		     struct rbd_obj_request *obj_request)
1928bc81207eSIlya Dryomov {
1929bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1930bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1931a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1932a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1933bc81207eSIlya Dryomov 
1934bc81207eSIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1935bc81207eSIlya Dryomov 	if (!req)
1936bc81207eSIlya Dryomov 		return NULL;
1937bc81207eSIlya Dryomov 
1938bc81207eSIlya Dryomov 	req->r_flags = flags;
1939bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1940bc81207eSIlya Dryomov 	req->r_priv = obj_request;
1941bc81207eSIlya Dryomov 
1942bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1943a90bb0c1SIlya Dryomov 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1944a90bb0c1SIlya Dryomov 			rbd_dev->header.object_prefix, obj_request->object_no))
1945bc81207eSIlya Dryomov 		goto err_req;
1946bc81207eSIlya Dryomov 
1947bc81207eSIlya Dryomov 	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1948bc81207eSIlya Dryomov 		goto err_req;
1949bc81207eSIlya Dryomov 
1950bc81207eSIlya Dryomov 	return req;
1951bc81207eSIlya Dryomov 
1952bc81207eSIlya Dryomov err_req:
1953bc81207eSIlya Dryomov 	ceph_osdc_put_request(req);
1954bc81207eSIlya Dryomov 	return NULL;
1955bc81207eSIlya Dryomov }
1956bc81207eSIlya Dryomov 
19570ccd5926SIlya Dryomov /*
19580ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19590ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19600ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19610ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19620ccd5926SIlya Dryomov  */
1963bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1964bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19656d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1966deb236b3SIlya Dryomov 					unsigned int num_ops,
1967430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1968bf0d5f50SAlex Elder {
1969bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1970bf0d5f50SAlex Elder 
197190e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
197290e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19736365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
197490e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19756d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
197690e98c52SGuangliang Zhao 		} else {
197790e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
197890e98c52SGuangliang Zhao 		}
1979bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1980bf0d5f50SAlex Elder 	}
1981bf0d5f50SAlex Elder 
19826d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1983deb236b3SIlya Dryomov 
1984bc81207eSIlya Dryomov 	return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
1985bc81207eSIlya Dryomov 	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
198654ea0046SIlya Dryomov 	    CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
1987bf0d5f50SAlex Elder }
1988bf0d5f50SAlex Elder 
19890eefd470SAlex Elder /*
1990d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1991d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1992d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1993d3246fb0SJosh Durgin  * or zero op.
19940eefd470SAlex Elder  */
19950eefd470SAlex Elder static struct ceph_osd_request *
19960eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
19970eefd470SAlex Elder {
19980eefd470SAlex Elder 	struct rbd_img_request *img_request;
1999d3246fb0SJosh Durgin 	int num_osd_ops = 3;
20000eefd470SAlex Elder 
20010eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20020eefd470SAlex Elder 	img_request = obj_request->img_request;
20030eefd470SAlex Elder 	rbd_assert(img_request);
2004d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
2005d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
20060eefd470SAlex Elder 
2007d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2008d3246fb0SJosh Durgin 		num_osd_ops = 2;
2009d3246fb0SJosh Durgin 
2010bc81207eSIlya Dryomov 	return __rbd_osd_req_create(img_request->rbd_dev,
2011bc81207eSIlya Dryomov 				    img_request->snapc, num_osd_ops,
201254ea0046SIlya Dryomov 				    CEPH_OSD_FLAG_WRITE, obj_request);
20130eefd470SAlex Elder }
20140eefd470SAlex Elder 
/*
 * Release an osd request by handing it to ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2019bf0d5f50SAlex Elder 
20206c696d85SIlya Dryomov static struct rbd_obj_request *
20216c696d85SIlya Dryomov rbd_obj_request_create(enum obj_request_type type)
2022bf0d5f50SAlex Elder {
2023bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2024bf0d5f50SAlex Elder 
2025bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2026bf0d5f50SAlex Elder 
20275a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
20286c696d85SIlya Dryomov 	if (!obj_request)
2029f907ad55SAlex Elder 		return NULL;
2030f907ad55SAlex Elder 
2031bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2032bf0d5f50SAlex Elder 	obj_request->type = type;
2033bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2034788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2035bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2036bf0d5f50SAlex Elder 
203767e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
2038bf0d5f50SAlex Elder 	return obj_request;
2039bf0d5f50SAlex Elder }
2040bf0d5f50SAlex Elder 
2041bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2042bf0d5f50SAlex Elder {
2043bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2044bf0d5f50SAlex Elder 
2045bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2046bf0d5f50SAlex Elder 
204737206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
204837206ee5SAlex Elder 
2049bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2050bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2051bf0d5f50SAlex Elder 
2052bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2053bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2054bf0d5f50SAlex Elder 
2055bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2056bf0d5f50SAlex Elder 	switch (obj_request->type) {
20579969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20589969ebc5SAlex Elder 		break;		/* Nothing to do */
2059bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2060bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2061bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2062bf0d5f50SAlex Elder 		break;
2063788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
206404dc923cSIlya Dryomov 		/* img_data requests don't own their page array */
206504dc923cSIlya Dryomov 		if (obj_request->pages &&
206604dc923cSIlya Dryomov 		    !obj_request_img_data_test(obj_request))
2067788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2068788e2df3SAlex Elder 						obj_request->page_count);
2069788e2df3SAlex Elder 		break;
2070bf0d5f50SAlex Elder 	}
2071bf0d5f50SAlex Elder 
2072868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2073bf0d5f50SAlex Elder }
2074bf0d5f50SAlex Elder 
2075fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2076fb65d228SAlex Elder 
2077fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2078fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2079fb65d228SAlex Elder {
2080fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2081fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2082fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2083fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2084fb65d228SAlex Elder }
2085fb65d228SAlex Elder 
2086bf0d5f50SAlex Elder /*
2087a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2088a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2089a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2090a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2091a2acd00eSAlex Elder  */
2092a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2093a2acd00eSAlex Elder {
2094a2acd00eSAlex Elder 	int counter;
2095a2acd00eSAlex Elder 
2096a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2097a2acd00eSAlex Elder 		return;
2098a2acd00eSAlex Elder 
2099a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2100a2acd00eSAlex Elder 	if (counter > 0)
2101a2acd00eSAlex Elder 		return;
2102a2acd00eSAlex Elder 
2103a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2104a2acd00eSAlex Elder 
2105a2acd00eSAlex Elder 	if (!counter)
2106a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2107a2acd00eSAlex Elder 	else
21089584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2109a2acd00eSAlex Elder }
2110a2acd00eSAlex Elder 
2111a2acd00eSAlex Elder /*
2112a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2113a2acd00eSAlex Elder  * parent.
2114a2acd00eSAlex Elder  *
2115a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2116a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2117a2acd00eSAlex Elder  * false otherwise.
2118a2acd00eSAlex Elder  */
2119a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2120a2acd00eSAlex Elder {
2121ae43e9d0SIlya Dryomov 	int counter = 0;
2122a2acd00eSAlex Elder 
2123a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2124a2acd00eSAlex Elder 		return false;
2125a2acd00eSAlex Elder 
2126ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2127ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2128a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2129ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2130a2acd00eSAlex Elder 
2131a2acd00eSAlex Elder 	if (counter < 0)
21329584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2133a2acd00eSAlex Elder 
2134ae43e9d0SIlya Dryomov 	return counter > 0;
2135a2acd00eSAlex Elder }
2136a2acd00eSAlex Elder 
2137bf0d5f50SAlex Elder /*
2138bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2139bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2140bf0d5f50SAlex Elder  * (if there is one).
2141bf0d5f50SAlex Elder  */
2142cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2143cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2144bf0d5f50SAlex Elder 					u64 offset, u64 length,
21456d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21464e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2147bf0d5f50SAlex Elder {
2148bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2149bf0d5f50SAlex Elder 
21507a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2151bf0d5f50SAlex Elder 	if (!img_request)
2152bf0d5f50SAlex Elder 		return NULL;
2153bf0d5f50SAlex Elder 
2154bf0d5f50SAlex Elder 	img_request->rq = NULL;
2155bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2156bf0d5f50SAlex Elder 	img_request->offset = offset;
2157bf0d5f50SAlex Elder 	img_request->length = length;
21580c425248SAlex Elder 	img_request->flags = 0;
215990e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
216090e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
216190e98c52SGuangliang Zhao 		img_request->snapc = snapc;
216290e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21630c425248SAlex Elder 		img_request_write_set(img_request);
21644e752f0aSJosh Durgin 		img_request->snapc = snapc;
21650c425248SAlex Elder 	} else {
2166bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21670c425248SAlex Elder 	}
2168a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2169d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2170bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2171bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2172bf0d5f50SAlex Elder 	img_request->callback = NULL;
2173a5a337d4SAlex Elder 	img_request->result = 0;
2174bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2175bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2176bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2177bf0d5f50SAlex Elder 
217837206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
21796d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
218037206ee5SAlex Elder 
2181bf0d5f50SAlex Elder 	return img_request;
2182bf0d5f50SAlex Elder }
2183bf0d5f50SAlex Elder 
2184bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2185bf0d5f50SAlex Elder {
2186bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2187bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2188bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2189bf0d5f50SAlex Elder 
2190bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2191bf0d5f50SAlex Elder 
219237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
219337206ee5SAlex Elder 
2194bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2195bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
219625dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2197bf0d5f50SAlex Elder 
2198a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2199a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2200a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2201a2acd00eSAlex Elder 	}
2202a2acd00eSAlex Elder 
2203bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2204bef95455SJosh Durgin 		img_request_discard_test(img_request))
2205812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2206bf0d5f50SAlex Elder 
22071c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2208bf0d5f50SAlex Elder }
2209bf0d5f50SAlex Elder 
2210e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2211e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2212e93f3152SAlex Elder 					u64 img_offset, u64 length)
2213e93f3152SAlex Elder {
2214e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2215e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2216e93f3152SAlex Elder 
2217e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2218e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2219e93f3152SAlex Elder 
22204e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22216d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2222e93f3152SAlex Elder 	if (!parent_request)
2223e93f3152SAlex Elder 		return NULL;
2224e93f3152SAlex Elder 
2225e93f3152SAlex Elder 	img_request_child_set(parent_request);
2226e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2227e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2228e93f3152SAlex Elder 
2229e93f3152SAlex Elder 	return parent_request;
2230e93f3152SAlex Elder }
2231e93f3152SAlex Elder 
2232e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2233e93f3152SAlex Elder {
2234e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2235e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2236e93f3152SAlex Elder 
2237e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2238e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2239e93f3152SAlex Elder 
2240e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2241e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2242e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2243e93f3152SAlex Elder 
2244e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2245e93f3152SAlex Elder }
2246e93f3152SAlex Elder 
22471217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22481217857fSAlex Elder {
22496365d33aSAlex Elder 	struct rbd_img_request *img_request;
22501217857fSAlex Elder 	unsigned int xferred;
22511217857fSAlex Elder 	int result;
22528b3e1a56SAlex Elder 	bool more;
22531217857fSAlex Elder 
22546365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22556365d33aSAlex Elder 	img_request = obj_request->img_request;
22566365d33aSAlex Elder 
22571217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22581217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22591217857fSAlex Elder 	result = obj_request->result;
22601217857fSAlex Elder 	if (result) {
22611217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22626d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22636d2940c8SGuangliang Zhao 
226490e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
226590e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
226690e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
226790e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
226890e98c52SGuangliang Zhao 		else
226990e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22701217857fSAlex Elder 
22719584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22726d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
22736d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
22749584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
22751217857fSAlex Elder 			result, xferred);
22761217857fSAlex Elder 		if (!img_request->result)
22771217857fSAlex Elder 			img_request->result = result;
2278082a75daSIlya Dryomov 		/*
2279082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2280082a75daSIlya Dryomov 		 * bytes in case of error.
2281082a75daSIlya Dryomov 		 */
2282082a75daSIlya Dryomov 		xferred = obj_request->length;
22831217857fSAlex Elder 	}
22841217857fSAlex Elder 
22858b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
22868b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
22878b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
22888b3e1a56SAlex Elder 	} else {
22898b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
22907ad18afaSChristoph Hellwig 
22917ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
22927ad18afaSChristoph Hellwig 		if (!more)
22937ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
22948b3e1a56SAlex Elder 	}
22958b3e1a56SAlex Elder 
22968b3e1a56SAlex Elder 	return more;
22971217857fSAlex Elder }
22981217857fSAlex Elder 
22992169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23002169238dSAlex Elder {
23012169238dSAlex Elder 	struct rbd_img_request *img_request;
23022169238dSAlex Elder 	u32 which = obj_request->which;
23032169238dSAlex Elder 	bool more = true;
23042169238dSAlex Elder 
23056365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23062169238dSAlex Elder 	img_request = obj_request->img_request;
23072169238dSAlex Elder 
23082169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23092169238dSAlex Elder 	rbd_assert(img_request != NULL);
23102169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23112169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23122169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23132169238dSAlex Elder 
23142169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23152169238dSAlex Elder 	if (which != img_request->next_completion)
23162169238dSAlex Elder 		goto out;
23172169238dSAlex Elder 
23182169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23192169238dSAlex Elder 		rbd_assert(more);
23202169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23212169238dSAlex Elder 
23222169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23232169238dSAlex Elder 			break;
23241217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23252169238dSAlex Elder 		which++;
23262169238dSAlex Elder 	}
23272169238dSAlex Elder 
23282169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23292169238dSAlex Elder 	img_request->next_completion = which;
23302169238dSAlex Elder out:
23312169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23320f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23332169238dSAlex Elder 
23342169238dSAlex Elder 	if (!more)
23352169238dSAlex Elder 		rbd_img_request_complete(img_request);
23362169238dSAlex Elder }
23372169238dSAlex Elder 
2338f1a4739fSAlex Elder /*
23393b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23403b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23413b434a2aSJosh Durgin  * osd operations already to the object request.
23423b434a2aSJosh Durgin  */
23433b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23443b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23453b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23463b434a2aSJosh Durgin 				unsigned int num_ops)
23473b434a2aSJosh Durgin {
23483b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23493b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23503b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23513b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23523b434a2aSJosh Durgin 	u64 length = obj_request->length;
23533b434a2aSJosh Durgin 	u64 img_end;
23543b434a2aSJosh Durgin 	u16 opcode;
23553b434a2aSJosh Durgin 
23563b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2357d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2358d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2359d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23603b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23613b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23623b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23633b434a2aSJosh Durgin 		} else {
23643b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23653b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
23663b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
23673b434a2aSJosh Durgin 
23683b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
23693b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
23703b434a2aSJosh Durgin 			else
23713b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
23723b434a2aSJosh Durgin 		}
23733b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2374e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2375e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2376e30b7577SIlya Dryomov 		else
23773b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
23783b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
23793b434a2aSJosh Durgin 					object_size, object_size);
23803b434a2aSJosh Durgin 		num_ops++;
23813b434a2aSJosh Durgin 	} else {
23823b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
23833b434a2aSJosh Durgin 	}
23843b434a2aSJosh Durgin 
23857e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2386144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
23877e868b6eSIlya Dryomov 	else
23887e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
23897e868b6eSIlya Dryomov 				       offset, length, 0, 0);
23907e868b6eSIlya Dryomov 
23913b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
23923b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
23933b434a2aSJosh Durgin 					obj_request->bio_list, length);
23943b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
23953b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
23963b434a2aSJosh Durgin 					obj_request->pages, length,
23973b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
23983b434a2aSJosh Durgin 
23993b434a2aSJosh Durgin 	/* Discards are also writes */
24003b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24013b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24023b434a2aSJosh Durgin 	else
24033b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24043b434a2aSJosh Durgin }
24053b434a2aSJosh Durgin 
24063b434a2aSJosh Durgin /*
2407f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2408f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2409f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2410f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2411f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2412f1a4739fSAlex Elder  * all data described by the image request.
2413f1a4739fSAlex Elder  */
2414f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2415f1a4739fSAlex Elder 					enum obj_request_type type,
2416f1a4739fSAlex Elder 					void *data_desc)
2417bf0d5f50SAlex Elder {
2418bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2419bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2420bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2421a158073cSJingoo Han 	struct bio *bio_list = NULL;
2422f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2423a158073cSJingoo Han 	struct page **pages = NULL;
24246d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24257da22d29SAlex Elder 	u64 img_offset;
2426bf0d5f50SAlex Elder 	u64 resid;
2427bf0d5f50SAlex Elder 
2428f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2429f1a4739fSAlex Elder 		(int)type, data_desc);
243037206ee5SAlex Elder 
24317da22d29SAlex Elder 	img_offset = img_request->offset;
2432bf0d5f50SAlex Elder 	resid = img_request->length;
24334dda41d3SAlex Elder 	rbd_assert(resid > 0);
24343b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2435f1a4739fSAlex Elder 
2436f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2437f1a4739fSAlex Elder 		bio_list = data_desc;
24384f024f37SKent Overstreet 		rbd_assert(img_offset ==
24394f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
244090e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2441f1a4739fSAlex Elder 		pages = data_desc;
2442f1a4739fSAlex Elder 	}
2443f1a4739fSAlex Elder 
2444bf0d5f50SAlex Elder 	while (resid) {
24452fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2446a90bb0c1SIlya Dryomov 		u64 object_no = img_offset >> rbd_dev->header.obj_order;
244767e2b652SIlya Dryomov 		u64 offset = rbd_segment_offset(rbd_dev, img_offset);
244867e2b652SIlya Dryomov 		u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
2449bf0d5f50SAlex Elder 
24506c696d85SIlya Dryomov 		obj_request = rbd_obj_request_create(type);
2451bf0d5f50SAlex Elder 		if (!obj_request)
2452bf0d5f50SAlex Elder 			goto out_unwind;
245362054da6SIlya Dryomov 
2454a90bb0c1SIlya Dryomov 		obj_request->object_no = object_no;
245567e2b652SIlya Dryomov 		obj_request->offset = offset;
245667e2b652SIlya Dryomov 		obj_request->length = length;
245767e2b652SIlya Dryomov 
245803507db6SJosh Durgin 		/*
245903507db6SJosh Durgin 		 * set obj_request->img_request before creating the
246003507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
246103507db6SJosh Durgin 		 */
246203507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2463bf0d5f50SAlex Elder 
2464f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2465f1a4739fSAlex Elder 			unsigned int clone_size;
2466f1a4739fSAlex Elder 
2467bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2468bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2469f1a4739fSAlex Elder 			obj_request->bio_list =
2470f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2471f1a4739fSAlex Elder 								&bio_offset,
2472f1a4739fSAlex Elder 								clone_size,
24732224d879SDavid Disseldorp 								GFP_NOIO);
2474bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
247562054da6SIlya Dryomov 				goto out_unwind;
247690e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2477f1a4739fSAlex Elder 			unsigned int page_count;
2478f1a4739fSAlex Elder 
2479f1a4739fSAlex Elder 			obj_request->pages = pages;
2480f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2481f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2482f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2483f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2484f1a4739fSAlex Elder 			pages += page_count;
2485f1a4739fSAlex Elder 		}
2486bf0d5f50SAlex Elder 
24876d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
24886d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
24892fa12320SAlex Elder 					obj_request);
24902fa12320SAlex Elder 		if (!osd_req)
249162054da6SIlya Dryomov 			goto out_unwind;
24923b434a2aSJosh Durgin 
24932fa12320SAlex Elder 		obj_request->osd_req = osd_req;
24942169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
24957da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2496bf0d5f50SAlex Elder 
24973b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
24983b434a2aSJosh Durgin 
24997da22d29SAlex Elder 		img_offset += length;
2500bf0d5f50SAlex Elder 		resid -= length;
2501bf0d5f50SAlex Elder 	}
2502bf0d5f50SAlex Elder 
2503bf0d5f50SAlex Elder 	return 0;
2504bf0d5f50SAlex Elder 
2505bf0d5f50SAlex Elder out_unwind:
2506bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
250742dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2508bf0d5f50SAlex Elder 
2509bf0d5f50SAlex Elder 	return -ENOMEM;
2510bf0d5f50SAlex Elder }
2511bf0d5f50SAlex Elder 
25123d7efd18SAlex Elder static void
25132761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25140eefd470SAlex Elder {
25150eefd470SAlex Elder 	struct rbd_img_request *img_request;
25160eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2517ebda6408SAlex Elder 	struct page **pages;
25180eefd470SAlex Elder 	u32 page_count;
25190eefd470SAlex Elder 
25202761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25212761713dSIlya Dryomov 
2522d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2523d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25240eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25250eefd470SAlex Elder 	img_request = obj_request->img_request;
25260eefd470SAlex Elder 	rbd_assert(img_request);
25270eefd470SAlex Elder 
25280eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25290eefd470SAlex Elder 	rbd_assert(rbd_dev);
25300eefd470SAlex Elder 
2531ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2532ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25330eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2534ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2535ebda6408SAlex Elder 	rbd_assert(page_count);
2536ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2537ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25380eefd470SAlex Elder 
25390eefd470SAlex Elder 	/*
25400eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25410eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25420eefd470SAlex Elder 	 * successful short write, so if the request was successful
25430eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25440eefd470SAlex Elder 	 */
25450eefd470SAlex Elder 	if (!obj_request->result)
25460eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25470eefd470SAlex Elder 
25482761713dSIlya Dryomov 	obj_request_done_set(obj_request);
25490eefd470SAlex Elder }
25500eefd470SAlex Elder 
25510eefd470SAlex Elder static void
25523d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25533d7efd18SAlex Elder {
25543d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25550eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25560eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25573d7efd18SAlex Elder 	struct page **pages;
2558d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2559ebda6408SAlex Elder 	u32 page_count;
2560bbea1c1aSAlex Elder 	int img_result;
2561ebda6408SAlex Elder 	u64 parent_length;
25623d7efd18SAlex Elder 
25633d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25643d7efd18SAlex Elder 
25653d7efd18SAlex Elder 	/* First get what we need from the image request */
25663d7efd18SAlex Elder 
25673d7efd18SAlex Elder 	pages = img_request->copyup_pages;
25683d7efd18SAlex Elder 	rbd_assert(pages != NULL);
25693d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2570ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2571ebda6408SAlex Elder 	rbd_assert(page_count);
2572ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
25733d7efd18SAlex Elder 
25743d7efd18SAlex Elder 	orig_request = img_request->obj_request;
25753d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2576b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2577bbea1c1aSAlex Elder 	img_result = img_request->result;
2578ebda6408SAlex Elder 	parent_length = img_request->length;
2579fa355112SIlya Dryomov 	rbd_assert(img_result || parent_length == img_request->xferred);
25803d7efd18SAlex Elder 	rbd_img_request_put(img_request);
25813d7efd18SAlex Elder 
258291c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
258391c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
25843d7efd18SAlex Elder 	rbd_assert(rbd_dev);
25853d7efd18SAlex Elder 
2586bbea1c1aSAlex Elder 	/*
2587bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2588bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2589bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2590bbea1c1aSAlex Elder 	 */
2591bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2592bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2593980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2594bbea1c1aSAlex Elder 		return;
2595bbea1c1aSAlex Elder 	}
2596bbea1c1aSAlex Elder 
2597bbea1c1aSAlex Elder 	if (img_result)
25980eefd470SAlex Elder 		goto out_err;
25993d7efd18SAlex Elder 
26008785b1d4SAlex Elder 	/*
26018785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26020ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26038785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26048785b1d4SAlex Elder 	 * original request, and release the old one.
26058785b1d4SAlex Elder 	 */
2606bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26070eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26080eefd470SAlex Elder 	if (!osd_req)
26090eefd470SAlex Elder 		goto out_err;
26108785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26110eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26120eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2613ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26143d7efd18SAlex Elder 
26150eefd470SAlex Elder 	/* Initialize the copyup op */
26160eefd470SAlex Elder 
26170eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2618ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26190eefd470SAlex Elder 						false, false);
26200eefd470SAlex Elder 
2621d3246fb0SJosh Durgin 	/* Add the other op(s) */
26220ccd5926SIlya Dryomov 
2623d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2624d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26250eefd470SAlex Elder 
26260eefd470SAlex Elder 	/* All set, send it off. */
26270eefd470SAlex Elder 
2628980917fcSIlya Dryomov 	rbd_obj_request_submit(orig_request);
26290eefd470SAlex Elder 	return;
26300eefd470SAlex Elder 
26310eefd470SAlex Elder out_err:
2632fa355112SIlya Dryomov 	ceph_release_page_vector(pages, page_count);
26330dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, img_result);
26343d7efd18SAlex Elder }
26353d7efd18SAlex Elder 
26363d7efd18SAlex Elder /*
26373d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26383d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26393d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26403d7efd18SAlex Elder  * object request from the image request does not exist.
26413d7efd18SAlex Elder  *
26423d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26433d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26443d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26453d7efd18SAlex Elder  * the original object request for the copyup operation.
26463d7efd18SAlex Elder  *
2647c2e82414SIlya Dryomov  * If an error occurs, it is recorded as the result of the original
2648c2e82414SIlya Dryomov  * object request in rbd_img_obj_exists_callback().
26493d7efd18SAlex Elder  */
26503d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
26513d7efd18SAlex Elder {
2652058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
26533d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
26543d7efd18SAlex Elder 	u64 img_offset;
26553d7efd18SAlex Elder 	u64 length;
26563d7efd18SAlex Elder 	struct page **pages = NULL;
26573d7efd18SAlex Elder 	u32 page_count;
26583d7efd18SAlex Elder 	int result;
26593d7efd18SAlex Elder 
26603d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
26613d7efd18SAlex Elder 
26623d7efd18SAlex Elder 	/*
26633d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
26643d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
26653d7efd18SAlex Elder 	 */
26663d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
26675bc3fb17SIlya Dryomov 	length = rbd_obj_bytes(&rbd_dev->header);
26683d7efd18SAlex Elder 
26693d7efd18SAlex Elder 	/*
2670a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2671a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2672a9e8ba2cSAlex Elder 	 * necessary.
2673a9e8ba2cSAlex Elder 	 */
2674a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2675a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2676a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2677a9e8ba2cSAlex Elder 	}
2678a9e8ba2cSAlex Elder 
2679a9e8ba2cSAlex Elder 	/*
26803d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
26813d7efd18SAlex Elder 	 * from the parent.
26823d7efd18SAlex Elder 	 */
26833d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
26843d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
26853d7efd18SAlex Elder 	if (IS_ERR(pages)) {
26863d7efd18SAlex Elder 		result = PTR_ERR(pages);
26873d7efd18SAlex Elder 		pages = NULL;
26883d7efd18SAlex Elder 		goto out_err;
26893d7efd18SAlex Elder 	}
26903d7efd18SAlex Elder 
26913d7efd18SAlex Elder 	result = -ENOMEM;
2692e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2693e93f3152SAlex Elder 						img_offset, length);
26943d7efd18SAlex Elder 	if (!parent_request)
26953d7efd18SAlex Elder 		goto out_err;
26963d7efd18SAlex Elder 
26973d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
26983d7efd18SAlex Elder 	if (result)
26993d7efd18SAlex Elder 		goto out_err;
2700058aa991SIlya Dryomov 
27013d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2702ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27033d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
2704058aa991SIlya Dryomov 
27053d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27063d7efd18SAlex Elder 	if (!result)
27073d7efd18SAlex Elder 		return 0;
27083d7efd18SAlex Elder 
27093d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2710ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27113d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27123d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27133d7efd18SAlex Elder out_err:
27143d7efd18SAlex Elder 	if (pages)
27153d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27163d7efd18SAlex Elder 	if (parent_request)
27173d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27183d7efd18SAlex Elder 	return result;
27193d7efd18SAlex Elder }
27203d7efd18SAlex Elder 
2721c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2722c5b5ef6cSAlex Elder {
2723c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2724638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2725c5b5ef6cSAlex Elder 	int result;
2726c5b5ef6cSAlex Elder 
2727c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2728c5b5ef6cSAlex Elder 
2729c5b5ef6cSAlex Elder 	/*
2730c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2731c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2732c5b5ef6cSAlex Elder 	 * we're done with the request.
2733c5b5ef6cSAlex Elder 	 */
2734c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2735c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2736912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2737c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2738c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2739c5b5ef6cSAlex Elder 
2740c5b5ef6cSAlex Elder 	result = obj_request->result;
2741c5b5ef6cSAlex Elder 	obj_request->result = 0;
2742c5b5ef6cSAlex Elder 
2743c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2744c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2745c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2746c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2747c5b5ef6cSAlex Elder 
2748638f5abeSAlex Elder 	/*
2749638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2750980917fcSIlya Dryomov 	 * image has been flattened) we need to re-submit the
2751980917fcSIlya Dryomov 	 * original request.
2752638f5abeSAlex Elder 	 */
2753638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2754638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2755980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2756638f5abeSAlex Elder 		return;
2757638f5abeSAlex Elder 	}
2758c5b5ef6cSAlex Elder 
2759c5b5ef6cSAlex Elder 	/*
2760c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2761c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2762c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2763c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2764c5b5ef6cSAlex Elder 	 */
2765c5b5ef6cSAlex Elder 	if (!result) {
2766c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2767c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2768c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2769c2e82414SIlya Dryomov 	} else {
2770c2e82414SIlya Dryomov 		goto fail_orig_request;
2771c5b5ef6cSAlex Elder 	}
2772c5b5ef6cSAlex Elder 
2773c5b5ef6cSAlex Elder 	/*
2774c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2775c5b5ef6cSAlex Elder 	 * whether the target object exists.
2776c5b5ef6cSAlex Elder 	 */
2777c2e82414SIlya Dryomov 	result = rbd_img_obj_request_submit(orig_request);
2778c2e82414SIlya Dryomov 	if (result)
2779c2e82414SIlya Dryomov 		goto fail_orig_request;
2780c2e82414SIlya Dryomov 
2781c2e82414SIlya Dryomov 	return;
2782c2e82414SIlya Dryomov 
2783c2e82414SIlya Dryomov fail_orig_request:
27840dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, result);
2785c5b5ef6cSAlex Elder }
2786c5b5ef6cSAlex Elder 
2787c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2788c5b5ef6cSAlex Elder {
2789058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
2790c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2791710214e3SIlya Dryomov 	struct page **pages;
2792c5b5ef6cSAlex Elder 	u32 page_count;
2793c5b5ef6cSAlex Elder 	size_t size;
2794c5b5ef6cSAlex Elder 	int ret;
2795c5b5ef6cSAlex Elder 
27966c696d85SIlya Dryomov 	stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
2797710214e3SIlya Dryomov 	if (!stat_request)
2798710214e3SIlya Dryomov 		return -ENOMEM;
2799710214e3SIlya Dryomov 
2800a90bb0c1SIlya Dryomov 	stat_request->object_no = obj_request->object_no;
2801a90bb0c1SIlya Dryomov 
2802710214e3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2803710214e3SIlya Dryomov 						   stat_request);
2804710214e3SIlya Dryomov 	if (!stat_request->osd_req) {
2805710214e3SIlya Dryomov 		ret = -ENOMEM;
2806710214e3SIlya Dryomov 		goto fail_stat_request;
2807710214e3SIlya Dryomov 	}
2808710214e3SIlya Dryomov 
2809c5b5ef6cSAlex Elder 	/*
2810c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2811c5b5ef6cSAlex Elder 	 *     le64 length;
2812c5b5ef6cSAlex Elder 	 *     struct {
2813c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2814c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2815c5b5ef6cSAlex Elder 	 *     } mtime;
2816c5b5ef6cSAlex Elder 	 */
2817c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2818c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2819c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2820710214e3SIlya Dryomov 	if (IS_ERR(pages)) {
2821710214e3SIlya Dryomov 		ret = PTR_ERR(pages);
2822710214e3SIlya Dryomov 		goto fail_stat_request;
2823710214e3SIlya Dryomov 	}
2824c5b5ef6cSAlex Elder 
2825710214e3SIlya Dryomov 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2826710214e3SIlya Dryomov 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2827710214e3SIlya Dryomov 				     false, false);
2828c5b5ef6cSAlex Elder 
2829c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2830c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2831c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2832c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2833c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2834c5b5ef6cSAlex Elder 
2835980917fcSIlya Dryomov 	rbd_obj_request_submit(stat_request);
2836980917fcSIlya Dryomov 	return 0;
2837c5b5ef6cSAlex Elder 
2838710214e3SIlya Dryomov fail_stat_request:
2839710214e3SIlya Dryomov 	rbd_obj_request_put(stat_request);
2840c5b5ef6cSAlex Elder 	return ret;
2841c5b5ef6cSAlex Elder }
2842c5b5ef6cSAlex Elder 
284370d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2844b454e36dSAlex Elder {
2845058aa991SIlya Dryomov 	struct rbd_img_request *img_request = obj_request->img_request;
2846058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2847b454e36dSAlex Elder 
284870d045f6SIlya Dryomov 	/* Reads */
28491c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
28501c220881SJosh Durgin 	    !img_request_discard_test(img_request))
285170d045f6SIlya Dryomov 		return true;
2852b454e36dSAlex Elder 
285370d045f6SIlya Dryomov 	/* Non-layered writes */
285470d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
285570d045f6SIlya Dryomov 		return true;
285670d045f6SIlya Dryomov 
285770d045f6SIlya Dryomov 	/*
285870d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
285970d045f6SIlya Dryomov 	 * share any data with the parent.
286070d045f6SIlya Dryomov 	 */
286170d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
286270d045f6SIlya Dryomov 		return true;
286370d045f6SIlya Dryomov 
286470d045f6SIlya Dryomov 	/*
2865c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2866c622d226SGuangliang Zhao 	 * parent data there is anyway.
2867c622d226SGuangliang Zhao 	 */
2868c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2869c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2870c622d226SGuangliang Zhao 		return true;
2871c622d226SGuangliang Zhao 
2872c622d226SGuangliang Zhao 	/*
287370d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
287470d045f6SIlya Dryomov 	 * already been copied.
287570d045f6SIlya Dryomov 	 */
287670d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
287770d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
287870d045f6SIlya Dryomov 		return true;
287970d045f6SIlya Dryomov 
288070d045f6SIlya Dryomov 	return false;
288170d045f6SIlya Dryomov }
288270d045f6SIlya Dryomov 
288370d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
288470d045f6SIlya Dryomov {
2885058aa991SIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
2886058aa991SIlya Dryomov 	rbd_assert(obj_request_type_valid(obj_request->type));
2887058aa991SIlya Dryomov 	rbd_assert(obj_request->img_request);
2888058aa991SIlya Dryomov 
288970d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2890980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2891980917fcSIlya Dryomov 		return 0;
2892b454e36dSAlex Elder 	}
2893b454e36dSAlex Elder 
2894b454e36dSAlex Elder 	/*
28953d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
28963d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
28973d7efd18SAlex Elder 	 * start by reading the data for the full target object from
28983d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2899b454e36dSAlex Elder 	 */
290070d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29013d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29023d7efd18SAlex Elder 
29033d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2904b454e36dSAlex Elder 
2905b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2906b454e36dSAlex Elder }
2907b454e36dSAlex Elder 
2908bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2909bf0d5f50SAlex Elder {
2910bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
291146faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2912663ae2ccSIlya Dryomov 	int ret = 0;
2913bf0d5f50SAlex Elder 
291437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2915bf0d5f50SAlex Elder 
2916663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2917663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2918b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2919bf0d5f50SAlex Elder 		if (ret)
2920663ae2ccSIlya Dryomov 			goto out_put_ireq;
2921bf0d5f50SAlex Elder 	}
2922bf0d5f50SAlex Elder 
2923663ae2ccSIlya Dryomov out_put_ireq:
2924663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2925663ae2ccSIlya Dryomov 	return ret;
2926bf0d5f50SAlex Elder }
2927bf0d5f50SAlex Elder 
29288b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29298b3e1a56SAlex Elder {
29308b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2931a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2932a9e8ba2cSAlex Elder 	u64 obj_end;
293302c74fbaSAlex Elder 	u64 img_xferred;
293402c74fbaSAlex Elder 	int img_result;
29358b3e1a56SAlex Elder 
29368b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
29378b3e1a56SAlex Elder 
293802c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
293902c74fbaSAlex Elder 
29408b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
294102c74fbaSAlex Elder 	img_xferred = img_request->xferred;
294202c74fbaSAlex Elder 	img_result = img_request->result;
294302c74fbaSAlex Elder 	rbd_img_request_put(img_request);
294402c74fbaSAlex Elder 
294502c74fbaSAlex Elder 	/*
294602c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
294702c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
294802c74fbaSAlex Elder 	 * original request.
294902c74fbaSAlex Elder 	 */
2950a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2951a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
295202c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
295302c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
2954980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
295502c74fbaSAlex Elder 		return;
295602c74fbaSAlex Elder 	}
295702c74fbaSAlex Elder 
295802c74fbaSAlex Elder 	obj_request->result = img_result;
2959a9e8ba2cSAlex Elder 	if (obj_request->result)
2960a9e8ba2cSAlex Elder 		goto out;
2961a9e8ba2cSAlex Elder 
2962a9e8ba2cSAlex Elder 	/*
2963a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2964a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2965a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2966a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2967a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2968a9e8ba2cSAlex Elder 	 */
2969a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2970a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2971a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2972a9e8ba2cSAlex Elder 		u64 xferred = 0;
2973a9e8ba2cSAlex Elder 
2974a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2975a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2976a9e8ba2cSAlex Elder 					obj_request->img_offset;
2977a9e8ba2cSAlex Elder 
297802c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2979a9e8ba2cSAlex Elder 	} else {
298002c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2981a9e8ba2cSAlex Elder 	}
2982a9e8ba2cSAlex Elder out:
29838b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
29848b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
29858b3e1a56SAlex Elder }
29868b3e1a56SAlex Elder 
29878b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
29888b3e1a56SAlex Elder {
29898b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
29908b3e1a56SAlex Elder 	int result;
29918b3e1a56SAlex Elder 
29928b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
29938b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
29948b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
29955b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
29968b3e1a56SAlex Elder 
29978b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2998e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
29998b3e1a56SAlex Elder 						obj_request->img_offset,
3000e93f3152SAlex Elder 						obj_request->length);
30018b3e1a56SAlex Elder 	result = -ENOMEM;
30028b3e1a56SAlex Elder 	if (!img_request)
30038b3e1a56SAlex Elder 		goto out_err;
30048b3e1a56SAlex Elder 
30055b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3006f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3007f1a4739fSAlex Elder 						obj_request->bio_list);
30085b2ab72dSAlex Elder 	else
30095b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30105b2ab72dSAlex Elder 						obj_request->pages);
30118b3e1a56SAlex Elder 	if (result)
30128b3e1a56SAlex Elder 		goto out_err;
30138b3e1a56SAlex Elder 
30148b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30158b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30168b3e1a56SAlex Elder 	if (result)
30178b3e1a56SAlex Elder 		goto out_err;
30188b3e1a56SAlex Elder 
30198b3e1a56SAlex Elder 	return;
30208b3e1a56SAlex Elder out_err:
30218b3e1a56SAlex Elder 	if (img_request)
30228b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30238b3e1a56SAlex Elder 	obj_request->result = result;
30248b3e1a56SAlex Elder 	obj_request->xferred = 0;
30258b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30268b3e1a56SAlex Elder }
30278b3e1a56SAlex Elder 
3028ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3029ed95b21aSIlya Dryomov 
3030ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3031ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3032ed95b21aSIlya Dryomov {
3033ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3034ed95b21aSIlya Dryomov }
3035ed95b21aSIlya Dryomov 
3036ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3037ed95b21aSIlya Dryomov {
3038ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3039ed95b21aSIlya Dryomov 
3040ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3041ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3042ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3043ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3044ed95b21aSIlya Dryomov 	return cid;
3045ed95b21aSIlya Dryomov }
3046ed95b21aSIlya Dryomov 
3047ed95b21aSIlya Dryomov /*
3048ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3049ed95b21aSIlya Dryomov  */
3050ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3051ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3052ed95b21aSIlya Dryomov {
3053ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3054ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3055ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3056ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3057ed95b21aSIlya Dryomov }
3058ed95b21aSIlya Dryomov 
3059ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3060ed95b21aSIlya Dryomov {
3061ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3062ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3063ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3064ed95b21aSIlya Dryomov }
3065ed95b21aSIlya Dryomov 
3066ed95b21aSIlya Dryomov /*
3067ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3068ed95b21aSIlya Dryomov  */
3069ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3070ed95b21aSIlya Dryomov {
3071ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3072ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3073ed95b21aSIlya Dryomov 	char cookie[32];
3074ed95b21aSIlya Dryomov 	int ret;
3075ed95b21aSIlya Dryomov 
3076cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3077cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
3078ed95b21aSIlya Dryomov 
3079ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3080ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3081ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3082ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3083ed95b21aSIlya Dryomov 	if (ret)
3084ed95b21aSIlya Dryomov 		return ret;
3085ed95b21aSIlya Dryomov 
3086ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3087cbbfb0ffSIlya Dryomov 	strcpy(rbd_dev->lock_cookie, cookie);
3088ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &cid);
3089ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3090ed95b21aSIlya Dryomov 	return 0;
3091ed95b21aSIlya Dryomov }
3092ed95b21aSIlya Dryomov 
3093ed95b21aSIlya Dryomov /*
3094ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3095ed95b21aSIlya Dryomov  */
3096bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
3097ed95b21aSIlya Dryomov {
3098ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3099ed95b21aSIlya Dryomov 	int ret;
3100ed95b21aSIlya Dryomov 
3101cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3102cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
3103ed95b21aSIlya Dryomov 
3104ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3105cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3106bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
3107bbead745SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock: %d", ret);
3108ed95b21aSIlya Dryomov 
3109bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
3110bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3111cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
3112ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3113ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3114ed95b21aSIlya Dryomov }
3115ed95b21aSIlya Dryomov 
3116ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3117ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3118ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3119ed95b21aSIlya Dryomov 				size_t *preply_len)
3120ed95b21aSIlya Dryomov {
3121ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3122ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3123ed95b21aSIlya Dryomov 	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3124ed95b21aSIlya Dryomov 	char buf[buf_size];
3125ed95b21aSIlya Dryomov 	void *p = buf;
3126ed95b21aSIlya Dryomov 
3127ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3128ed95b21aSIlya Dryomov 
3129ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3130ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3131ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3132ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3133ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3134ed95b21aSIlya Dryomov 
3135ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3136ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3137ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3138ed95b21aSIlya Dryomov }
3139ed95b21aSIlya Dryomov 
3140ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3141ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3142ed95b21aSIlya Dryomov {
3143ed95b21aSIlya Dryomov 	struct page **reply_pages;
3144ed95b21aSIlya Dryomov 	size_t reply_len;
3145ed95b21aSIlya Dryomov 
3146ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3147ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3148ed95b21aSIlya Dryomov }
3149ed95b21aSIlya Dryomov 
3150ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3151ed95b21aSIlya Dryomov {
3152ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3153ed95b21aSIlya Dryomov 						  acquired_lock_work);
3154ed95b21aSIlya Dryomov 
3155ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3156ed95b21aSIlya Dryomov }
3157ed95b21aSIlya Dryomov 
3158ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3159ed95b21aSIlya Dryomov {
3160ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3161ed95b21aSIlya Dryomov 						  released_lock_work);
3162ed95b21aSIlya Dryomov 
3163ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3164ed95b21aSIlya Dryomov }
3165ed95b21aSIlya Dryomov 
3166ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3167ed95b21aSIlya Dryomov {
3168ed95b21aSIlya Dryomov 	struct page **reply_pages;
3169ed95b21aSIlya Dryomov 	size_t reply_len;
3170ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3171ed95b21aSIlya Dryomov 	int ret;
3172ed95b21aSIlya Dryomov 
3173ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3174ed95b21aSIlya Dryomov 
3175ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3176ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3177ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3178ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3179ed95b21aSIlya Dryomov 		goto out;
3180ed95b21aSIlya Dryomov 	}
3181ed95b21aSIlya Dryomov 
3182ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3183ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3184ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3185ed95b21aSIlya Dryomov 		u32 n;
3186ed95b21aSIlya Dryomov 
3187ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3188ed95b21aSIlya Dryomov 		while (n--) {
3189ed95b21aSIlya Dryomov 			u8 struct_v;
3190ed95b21aSIlya Dryomov 			u32 len;
3191ed95b21aSIlya Dryomov 
3192ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3193ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3194ed95b21aSIlya Dryomov 
3195ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3196ed95b21aSIlya Dryomov 			if (!len)
3197ed95b21aSIlya Dryomov 				continue;
3198ed95b21aSIlya Dryomov 
3199ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3200ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3201ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3202ed95b21aSIlya Dryomov 				ret = -EIO;
3203ed95b21aSIlya Dryomov 				goto out;
3204ed95b21aSIlya Dryomov 			}
3205ed95b21aSIlya Dryomov 
3206ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3207ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3208ed95b21aSIlya Dryomov 						  &struct_v, &len);
3209ed95b21aSIlya Dryomov 			if (ret) {
3210ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3211ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3212ed95b21aSIlya Dryomov 					 ret);
3213ed95b21aSIlya Dryomov 				goto e_inval;
3214ed95b21aSIlya Dryomov 			}
3215ed95b21aSIlya Dryomov 
3216ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3217ed95b21aSIlya Dryomov 		}
3218ed95b21aSIlya Dryomov 	}
3219ed95b21aSIlya Dryomov 
3220ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3221ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3222ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3223ed95b21aSIlya Dryomov 	}
3224ed95b21aSIlya Dryomov 
3225ed95b21aSIlya Dryomov out:
3226ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3227ed95b21aSIlya Dryomov 	return ret;
3228ed95b21aSIlya Dryomov 
3229ed95b21aSIlya Dryomov e_inval:
3230ed95b21aSIlya Dryomov 	ret = -EINVAL;
3231ed95b21aSIlya Dryomov 	goto out;
3232ed95b21aSIlya Dryomov }
3233ed95b21aSIlya Dryomov 
3234ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3235ed95b21aSIlya Dryomov {
3236ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3237ed95b21aSIlya Dryomov 
3238ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3239ed95b21aSIlya Dryomov 	if (wake_all)
3240ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3241ed95b21aSIlya Dryomov 	else
3242ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3243ed95b21aSIlya Dryomov }
3244ed95b21aSIlya Dryomov 
3245ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3246ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3247ed95b21aSIlya Dryomov {
3248ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3249ed95b21aSIlya Dryomov 	u8 lock_type;
3250ed95b21aSIlya Dryomov 	char *lock_tag;
3251ed95b21aSIlya Dryomov 	int ret;
3252ed95b21aSIlya Dryomov 
3253ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3254ed95b21aSIlya Dryomov 
3255ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3256ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3257ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3258ed95b21aSIlya Dryomov 	if (ret)
3259ed95b21aSIlya Dryomov 		return ret;
3260ed95b21aSIlya Dryomov 
3261ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3262ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3263ed95b21aSIlya Dryomov 		goto out;
3264ed95b21aSIlya Dryomov 	}
3265ed95b21aSIlya Dryomov 
3266ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3267ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3268ed95b21aSIlya Dryomov 			 lock_tag);
3269ed95b21aSIlya Dryomov 		ret = -EBUSY;
3270ed95b21aSIlya Dryomov 		goto out;
3271ed95b21aSIlya Dryomov 	}
3272ed95b21aSIlya Dryomov 
3273ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3274ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3275ed95b21aSIlya Dryomov 		ret = -EBUSY;
3276ed95b21aSIlya Dryomov 		goto out;
3277ed95b21aSIlya Dryomov 	}
3278ed95b21aSIlya Dryomov 
3279ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3280ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3281ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3282ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3283ed95b21aSIlya Dryomov 		ret = -EBUSY;
3284ed95b21aSIlya Dryomov 		goto out;
3285ed95b21aSIlya Dryomov 	}
3286ed95b21aSIlya Dryomov 
3287ed95b21aSIlya Dryomov out:
3288ed95b21aSIlya Dryomov 	kfree(lock_tag);
3289ed95b21aSIlya Dryomov 	return ret;
3290ed95b21aSIlya Dryomov }
3291ed95b21aSIlya Dryomov 
3292ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3293ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3294ed95b21aSIlya Dryomov {
3295ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3296ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3297ed95b21aSIlya Dryomov 	u32 num_watchers;
3298ed95b21aSIlya Dryomov 	u64 cookie;
3299ed95b21aSIlya Dryomov 	int i;
3300ed95b21aSIlya Dryomov 	int ret;
3301ed95b21aSIlya Dryomov 
3302ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3303ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3304ed95b21aSIlya Dryomov 				      &num_watchers);
3305ed95b21aSIlya Dryomov 	if (ret)
3306ed95b21aSIlya Dryomov 		return ret;
3307ed95b21aSIlya Dryomov 
3308ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3309ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3310ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3311ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3312ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3313ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3314ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3315ed95b21aSIlya Dryomov 				.handle = cookie,
3316ed95b21aSIlya Dryomov 			};
3317ed95b21aSIlya Dryomov 
3318ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3319ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3320ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3321ed95b21aSIlya Dryomov 			ret = 1;
3322ed95b21aSIlya Dryomov 			goto out;
3323ed95b21aSIlya Dryomov 		}
3324ed95b21aSIlya Dryomov 	}
3325ed95b21aSIlya Dryomov 
3326ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3327ed95b21aSIlya Dryomov 	ret = 0;
3328ed95b21aSIlya Dryomov out:
3329ed95b21aSIlya Dryomov 	kfree(watchers);
3330ed95b21aSIlya Dryomov 	return ret;
3331ed95b21aSIlya Dryomov }
3332ed95b21aSIlya Dryomov 
3333ed95b21aSIlya Dryomov /*
3334ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3335ed95b21aSIlya Dryomov  */
3336ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3337ed95b21aSIlya Dryomov {
3338ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3339ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3340ed95b21aSIlya Dryomov 	u32 num_lockers;
3341ed95b21aSIlya Dryomov 	int ret;
3342ed95b21aSIlya Dryomov 
3343ed95b21aSIlya Dryomov 	for (;;) {
3344ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3345ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3346ed95b21aSIlya Dryomov 			return ret;
3347ed95b21aSIlya Dryomov 
3348ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3349ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3350ed95b21aSIlya Dryomov 		if (ret)
3351ed95b21aSIlya Dryomov 			return ret;
3352ed95b21aSIlya Dryomov 
3353ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3354ed95b21aSIlya Dryomov 			goto again;
3355ed95b21aSIlya Dryomov 
3356ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3357ed95b21aSIlya Dryomov 		if (ret) {
3358ed95b21aSIlya Dryomov 			if (ret > 0)
3359ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3360ed95b21aSIlya Dryomov 			goto out;
3361ed95b21aSIlya Dryomov 		}
3362ed95b21aSIlya Dryomov 
3363ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3364ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3365ed95b21aSIlya Dryomov 
3366ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3367ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3368ed95b21aSIlya Dryomov 		if (ret) {
3369ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3370ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3371ed95b21aSIlya Dryomov 			goto out;
3372ed95b21aSIlya Dryomov 		}
3373ed95b21aSIlya Dryomov 
3374ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3375ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3376ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3377ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3378ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3379ed95b21aSIlya Dryomov 			goto out;
3380ed95b21aSIlya Dryomov 
3381ed95b21aSIlya Dryomov again:
3382ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3383ed95b21aSIlya Dryomov 	}
3384ed95b21aSIlya Dryomov 
3385ed95b21aSIlya Dryomov out:
3386ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3387ed95b21aSIlya Dryomov 	return ret;
3388ed95b21aSIlya Dryomov }
3389ed95b21aSIlya Dryomov 
3390ed95b21aSIlya Dryomov /*
3391ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3392ed95b21aSIlya Dryomov  */
3393ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3394ed95b21aSIlya Dryomov 						int *pret)
3395ed95b21aSIlya Dryomov {
3396ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3397ed95b21aSIlya Dryomov 
3398ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3399ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3400ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3401ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3402ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3403ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3404ed95b21aSIlya Dryomov 		return lock_state;
3405ed95b21aSIlya Dryomov 	}
3406ed95b21aSIlya Dryomov 
3407ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3408ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3409ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3410ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3411ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3412ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3413ed95b21aSIlya Dryomov 		if (*pret)
3414ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3415ed95b21aSIlya Dryomov 	}
3416ed95b21aSIlya Dryomov 
3417ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3418ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3419ed95b21aSIlya Dryomov 	return lock_state;
3420ed95b21aSIlya Dryomov }
3421ed95b21aSIlya Dryomov 
3422ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3423ed95b21aSIlya Dryomov {
3424ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3425ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3426ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3427ed95b21aSIlya Dryomov 	int ret;
3428ed95b21aSIlya Dryomov 
3429ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3430ed95b21aSIlya Dryomov again:
3431ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3432ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3433ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3434ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3435ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3436ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3437ed95b21aSIlya Dryomov 		return;
3438ed95b21aSIlya Dryomov 	}
3439ed95b21aSIlya Dryomov 
3440ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3441ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3442ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3443ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3444ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3445ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3446ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3447ed95b21aSIlya Dryomov 	} else {
3448ed95b21aSIlya Dryomov 		/*
3449ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3450ed95b21aSIlya Dryomov 		 * release the lock
3451ed95b21aSIlya Dryomov 		 */
3452ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3453ed95b21aSIlya Dryomov 		     rbd_dev);
3454ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3455ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3456ed95b21aSIlya Dryomov 	}
3457ed95b21aSIlya Dryomov }
3458ed95b21aSIlya Dryomov 
3459ed95b21aSIlya Dryomov /*
3460ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3461ed95b21aSIlya Dryomov  */
3462ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3463ed95b21aSIlya Dryomov {
3464ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3465ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3466ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3467ed95b21aSIlya Dryomov 		return false;
3468ed95b21aSIlya Dryomov 
3469ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3470ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3471ed95b21aSIlya Dryomov 	/*
3472ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3473ed95b21aSIlya Dryomov 	 *
3474ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3475ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3476ed95b21aSIlya Dryomov 	 */
3477ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3478ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3479ed95b21aSIlya Dryomov 
3480ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3481ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3482ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3483ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3484ed95b21aSIlya Dryomov 		return false;
3485ed95b21aSIlya Dryomov 
3486bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
3487ed95b21aSIlya Dryomov 	/*
3488ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
3489ed95b21aSIlya Dryomov 	 * almost immediately if we got new IO during ceph_osdc_sync()
3490ed95b21aSIlya Dryomov 	 * otherwise.  We need to ack our own notifications, so this
3491ed95b21aSIlya Dryomov 	 * lock_dwork will be requeued from rbd_wait_state_locked()
3492ed95b21aSIlya Dryomov 	 * after wake_requests() in rbd_handle_released_lock().
3493ed95b21aSIlya Dryomov 	 */
3494ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3495ed95b21aSIlya Dryomov 	return true;
3496ed95b21aSIlya Dryomov }
3497ed95b21aSIlya Dryomov 
3498ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3499ed95b21aSIlya Dryomov {
3500ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3501ed95b21aSIlya Dryomov 						  unlock_work);
3502ed95b21aSIlya Dryomov 
3503ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3504ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3505ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3506ed95b21aSIlya Dryomov }
3507ed95b21aSIlya Dryomov 
3508ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3509ed95b21aSIlya Dryomov 				     void **p)
3510ed95b21aSIlya Dryomov {
3511ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3512ed95b21aSIlya Dryomov 
3513ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3514ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3515ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3516ed95b21aSIlya Dryomov 	}
3517ed95b21aSIlya Dryomov 
3518ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3519ed95b21aSIlya Dryomov 	     cid.handle);
3520ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3521ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3522ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3523ed95b21aSIlya Dryomov 			/*
3524ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3525ed95b21aSIlya Dryomov 			 * the owner
3526ed95b21aSIlya Dryomov 			 */
3527ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3528ed95b21aSIlya Dryomov 			return;
3529ed95b21aSIlya Dryomov 		}
3530ed95b21aSIlya Dryomov 
3531ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3532ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3533ed95b21aSIlya Dryomov 	} else {
3534ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3535ed95b21aSIlya Dryomov 	}
3536ed95b21aSIlya Dryomov 
3537ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3538ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3539ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3540ed95b21aSIlya Dryomov }
3541ed95b21aSIlya Dryomov 
3542ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3543ed95b21aSIlya Dryomov 				     void **p)
3544ed95b21aSIlya Dryomov {
3545ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3546ed95b21aSIlya Dryomov 
3547ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3548ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3549ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3550ed95b21aSIlya Dryomov 	}
3551ed95b21aSIlya Dryomov 
3552ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3553ed95b21aSIlya Dryomov 	     cid.handle);
3554ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3555ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3556ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3557ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3558ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3559ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3560ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3561ed95b21aSIlya Dryomov 			return;
3562ed95b21aSIlya Dryomov 		}
3563ed95b21aSIlya Dryomov 
3564ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3565ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3566ed95b21aSIlya Dryomov 	} else {
3567ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3568ed95b21aSIlya Dryomov 	}
3569ed95b21aSIlya Dryomov 
3570ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3571ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3572ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3573ed95b21aSIlya Dryomov }
3574ed95b21aSIlya Dryomov 
3575ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3576ed95b21aSIlya Dryomov 				    void **p)
3577ed95b21aSIlya Dryomov {
3578ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3579ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3580ed95b21aSIlya Dryomov 	bool need_to_send;
3581ed95b21aSIlya Dryomov 
3582ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3583ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3584ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3585ed95b21aSIlya Dryomov 	}
3586ed95b21aSIlya Dryomov 
3587ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3588ed95b21aSIlya Dryomov 	     cid.handle);
3589ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
3590ed95b21aSIlya Dryomov 		return false;
3591ed95b21aSIlya Dryomov 
3592ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3593ed95b21aSIlya Dryomov 	need_to_send = __rbd_is_lock_owner(rbd_dev);
3594ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3595ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3596ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3597ed95b21aSIlya Dryomov 			     rbd_dev);
3598ed95b21aSIlya Dryomov 			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3599ed95b21aSIlya Dryomov 		}
3600ed95b21aSIlya Dryomov 	}
3601ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3602ed95b21aSIlya Dryomov 	return need_to_send;
3603ed95b21aSIlya Dryomov }
3604ed95b21aSIlya Dryomov 
3605ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3606ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3607ed95b21aSIlya Dryomov {
3608ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3609ed95b21aSIlya Dryomov 	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3610ed95b21aSIlya Dryomov 	char buf[buf_size];
3611ed95b21aSIlya Dryomov 	int ret;
3612ed95b21aSIlya Dryomov 
3613ed95b21aSIlya Dryomov 	if (result) {
3614ed95b21aSIlya Dryomov 		void *p = buf;
3615ed95b21aSIlya Dryomov 
3616ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3617ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3618ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3619ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3620ed95b21aSIlya Dryomov 	} else {
3621ed95b21aSIlya Dryomov 		buf_size = 0;
3622ed95b21aSIlya Dryomov 	}
3623ed95b21aSIlya Dryomov 
3624ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3625ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3626ed95b21aSIlya Dryomov 				   buf, buf_size);
3627ed95b21aSIlya Dryomov 	if (ret)
3628ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3629ed95b21aSIlya Dryomov }
3630ed95b21aSIlya Dryomov 
3631ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3632ed95b21aSIlya Dryomov 				   u64 cookie)
3633ed95b21aSIlya Dryomov {
3634ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3635ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3636ed95b21aSIlya Dryomov }
3637ed95b21aSIlya Dryomov 
3638ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3639ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3640ed95b21aSIlya Dryomov {
3641ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3642ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3643ed95b21aSIlya Dryomov }
3644922dab61SIlya Dryomov 
3645922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3646922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3647b8d70035SAlex Elder {
3648922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3649ed95b21aSIlya Dryomov 	void *p = data;
3650ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3651d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3652ed95b21aSIlya Dryomov 	u32 len;
3653ed95b21aSIlya Dryomov 	u32 notify_op;
3654b8d70035SAlex Elder 	int ret;
3655b8d70035SAlex Elder 
3656ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3657ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3658ed95b21aSIlya Dryomov 	if (data_len) {
3659ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3660ed95b21aSIlya Dryomov 					  &struct_v, &len);
3661ed95b21aSIlya Dryomov 		if (ret) {
3662ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3663ed95b21aSIlya Dryomov 				 ret);
3664ed95b21aSIlya Dryomov 			return;
3665ed95b21aSIlya Dryomov 		}
366652bb1f9bSIlya Dryomov 
3667ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3668ed95b21aSIlya Dryomov 	} else {
3669ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3670ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3671ed95b21aSIlya Dryomov 		len = 0;
3672ed95b21aSIlya Dryomov 	}
3673ed95b21aSIlya Dryomov 
3674ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3675ed95b21aSIlya Dryomov 	switch (notify_op) {
3676ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3677ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3678ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3679ed95b21aSIlya Dryomov 		break;
3680ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3681ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3682ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3683ed95b21aSIlya Dryomov 		break;
3684ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
3685ed95b21aSIlya Dryomov 		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
368652bb1f9bSIlya Dryomov 			/*
3687ed95b21aSIlya Dryomov 			 * send ResponseMessage(0) back so the client
3688ed95b21aSIlya Dryomov 			 * can detect a missing owner
368952bb1f9bSIlya Dryomov 			 */
3690ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3691ed95b21aSIlya Dryomov 						      cookie, 0);
3692ed95b21aSIlya Dryomov 		else
3693ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3694ed95b21aSIlya Dryomov 		break;
3695ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3696e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3697e627db08SAlex Elder 		if (ret)
36989584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3699b8d70035SAlex Elder 
3700ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3701ed95b21aSIlya Dryomov 		break;
3702ed95b21aSIlya Dryomov 	default:
3703ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3704ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3705ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3706ed95b21aSIlya Dryomov 		else
3707ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3708ed95b21aSIlya Dryomov 		break;
3709b8d70035SAlex Elder 	}
3710b8d70035SAlex Elder }
3711b8d70035SAlex Elder 
371299d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
37139969ebc5SAlex Elder 
3714922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3715bb040aa0SIlya Dryomov {
3716922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3717bb040aa0SIlya Dryomov 
3718922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3719bb040aa0SIlya Dryomov 
3720ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3721ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3722ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3723bb040aa0SIlya Dryomov 
372499d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
372599d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
372699d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
372799d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3728bb040aa0SIlya Dryomov 
372999d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3730bb040aa0SIlya Dryomov 	}
373199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3732bb040aa0SIlya Dryomov }
3733bb040aa0SIlya Dryomov 
3734bb040aa0SIlya Dryomov /*
373599d16943SIlya Dryomov  * watch_mutex must be locked
37369969ebc5SAlex Elder  */
373799d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
37389969ebc5SAlex Elder {
37399969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3740922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
37419969ebc5SAlex Elder 
3742922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
374399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
37449969ebc5SAlex Elder 
3745922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3746922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3747922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3748922dab61SIlya Dryomov 	if (IS_ERR(handle))
3749922dab61SIlya Dryomov 		return PTR_ERR(handle);
37509969ebc5SAlex Elder 
3751922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
37528eb87565SAlex Elder 	return 0;
37539969ebc5SAlex Elder }
37549969ebc5SAlex Elder 
375599d16943SIlya Dryomov /*
375699d16943SIlya Dryomov  * watch_mutex must be locked
375799d16943SIlya Dryomov  */
375899d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3759fca27065SIlya Dryomov {
3760922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3761922dab61SIlya Dryomov 	int ret;
3762b30a01f2SIlya Dryomov 
376399d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
376499d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3765b30a01f2SIlya Dryomov 
3766922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3767922dab61SIlya Dryomov 	if (ret)
3768922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3769b30a01f2SIlya Dryomov 
3770922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3771c525f036SIlya Dryomov }
3772c525f036SIlya Dryomov 
377399d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3774c525f036SIlya Dryomov {
377599d16943SIlya Dryomov 	int ret;
3776811c6688SIlya Dryomov 
377799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
377899d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
377999d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
378099d16943SIlya Dryomov 	if (ret)
378199d16943SIlya Dryomov 		goto out;
378299d16943SIlya Dryomov 
378399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
378499d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
378599d16943SIlya Dryomov 
378699d16943SIlya Dryomov out:
378799d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
378899d16943SIlya Dryomov 	return ret;
378999d16943SIlya Dryomov }
379099d16943SIlya Dryomov 
379199d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
379299d16943SIlya Dryomov {
379399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
379499d16943SIlya Dryomov 
379599d16943SIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3796ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3797ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3798ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3799ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
380099d16943SIlya Dryomov }
380199d16943SIlya Dryomov 
380299d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
380399d16943SIlya Dryomov {
3804ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
380599d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
380699d16943SIlya Dryomov 
380799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
380899d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
380999d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
381099d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
381199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
381299d16943SIlya Dryomov 
3813811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3814fca27065SIlya Dryomov }
3815fca27065SIlya Dryomov 
381614bb211dSIlya Dryomov /*
381714bb211dSIlya Dryomov  * lock_rwsem must be held for write
381814bb211dSIlya Dryomov  */
381914bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
382014bb211dSIlya Dryomov {
382114bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
382214bb211dSIlya Dryomov 	char cookie[32];
382314bb211dSIlya Dryomov 	int ret;
382414bb211dSIlya Dryomov 
382514bb211dSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
382614bb211dSIlya Dryomov 
382714bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
382814bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
382914bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
383014bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
383114bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
383214bb211dSIlya Dryomov 	if (ret) {
383314bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
383414bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
383514bb211dSIlya Dryomov 				 ret);
383614bb211dSIlya Dryomov 
383714bb211dSIlya Dryomov 		/*
383814bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
383914bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
384014bb211dSIlya Dryomov 		 */
384114bb211dSIlya Dryomov 		if (rbd_release_lock(rbd_dev))
384214bb211dSIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
384314bb211dSIlya Dryomov 					   &rbd_dev->lock_dwork, 0);
384414bb211dSIlya Dryomov 	} else {
384514bb211dSIlya Dryomov 		strcpy(rbd_dev->lock_cookie, cookie);
384614bb211dSIlya Dryomov 	}
384714bb211dSIlya Dryomov }
384814bb211dSIlya Dryomov 
384999d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
385099d16943SIlya Dryomov {
385199d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
385299d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
385399d16943SIlya Dryomov 	int ret;
385499d16943SIlya Dryomov 
385599d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
385699d16943SIlya Dryomov 
385799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
385887c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
385987c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
386014bb211dSIlya Dryomov 		return;
386187c0fdedSIlya Dryomov 	}
386299d16943SIlya Dryomov 
386399d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
386499d16943SIlya Dryomov 	if (ret) {
386599d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
38664d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
386787c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
386814bb211dSIlya Dryomov 			wake_requests(rbd_dev, true);
386987c0fdedSIlya Dryomov 		} else {
387099d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
387199d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
387299d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
387387c0fdedSIlya Dryomov 		}
387487c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
387514bb211dSIlya Dryomov 		return;
387699d16943SIlya Dryomov 	}
387799d16943SIlya Dryomov 
387899d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
387999d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
388099d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
388199d16943SIlya Dryomov 
388214bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
388314bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
388414bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
388514bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
388614bb211dSIlya Dryomov 
388799d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
388899d16943SIlya Dryomov 	if (ret)
388999d16943SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
389099d16943SIlya Dryomov }
389199d16943SIlya Dryomov 
389236be9a76SAlex Elder /*
3893f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3894f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
389536be9a76SAlex Elder  */
389636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3897ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3898ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
389936be9a76SAlex Elder 			     const char *method_name,
39004157976bSAlex Elder 			     const void *outbound,
390136be9a76SAlex Elder 			     size_t outbound_size,
39024157976bSAlex Elder 			     void *inbound,
3903e2a58ee5SAlex Elder 			     size_t inbound_size)
390436be9a76SAlex Elder {
3905ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3906ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3907ecd4a68aSIlya Dryomov 	struct page *reply_page;
390836be9a76SAlex Elder 	int ret;
390936be9a76SAlex Elder 
391036be9a76SAlex Elder 	/*
39116010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
39126010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
39136010a451SAlex Elder 	 * also supply outbound data--parameters for the object
39146010a451SAlex Elder 	 * method.  Currently if this is present it will be a
39156010a451SAlex Elder 	 * snapshot id.
391636be9a76SAlex Elder 	 */
3917ecd4a68aSIlya Dryomov 	if (outbound) {
3918ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3919ecd4a68aSIlya Dryomov 			return -E2BIG;
392036be9a76SAlex Elder 
3921ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3922ecd4a68aSIlya Dryomov 		if (!req_page)
3923ecd4a68aSIlya Dryomov 			return -ENOMEM;
392436be9a76SAlex Elder 
3925ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
392604017e29SAlex Elder 	}
3927430c28c3SAlex Elder 
3928ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3929ecd4a68aSIlya Dryomov 	if (!reply_page) {
3930ecd4a68aSIlya Dryomov 		if (req_page)
3931ecd4a68aSIlya Dryomov 			__free_page(req_page);
3932ecd4a68aSIlya Dryomov 		return -ENOMEM;
3933ecd4a68aSIlya Dryomov 	}
393436be9a76SAlex Elder 
3935ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3936ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3937ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3938ecd4a68aSIlya Dryomov 	if (!ret) {
3939ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3940ecd4a68aSIlya Dryomov 		ret = inbound_size;
3941ecd4a68aSIlya Dryomov 	}
394257385b51SAlex Elder 
3943ecd4a68aSIlya Dryomov 	if (req_page)
3944ecd4a68aSIlya Dryomov 		__free_page(req_page);
3945ecd4a68aSIlya Dryomov 	__free_page(reply_page);
394636be9a76SAlex Elder 	return ret;
394736be9a76SAlex Elder }
394836be9a76SAlex Elder 
3949ed95b21aSIlya Dryomov /*
3950ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
3951ed95b21aSIlya Dryomov  */
3952ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3953ed95b21aSIlya Dryomov {
3954ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
3955ed95b21aSIlya Dryomov 
3956ed95b21aSIlya Dryomov 	do {
3957ed95b21aSIlya Dryomov 		/*
3958ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3959ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
3960ed95b21aSIlya Dryomov 		 */
3961ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3962ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3963ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3964ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
3965ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3966ed95b21aSIlya Dryomov 		schedule();
3967ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
396887c0fdedSIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
396987c0fdedSIlya Dryomov 		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
397087c0fdedSIlya Dryomov 
3971ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
3972ed95b21aSIlya Dryomov }
3973ed95b21aSIlya Dryomov 
39747ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3975bc1ecc65SIlya Dryomov {
39767ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
39777ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3978bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
39794e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3980bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3981bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
39826d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
39834e752f0aSJosh Durgin 	u64 mapping_size;
398480de1912SIlya Dryomov 	bool must_be_locked;
3985bc1ecc65SIlya Dryomov 	int result;
3986bc1ecc65SIlya Dryomov 
3987aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
3988aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
3989aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
3990aebf526bSChristoph Hellwig 		break;
3991aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
3992aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
3993aebf526bSChristoph Hellwig 		break;
3994aebf526bSChristoph Hellwig 	case REQ_OP_READ:
3995aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
3996aebf526bSChristoph Hellwig 		break;
3997aebf526bSChristoph Hellwig 	default:
3998aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
39997ad18afaSChristoph Hellwig 		result = -EIO;
40007ad18afaSChristoph Hellwig 		goto err;
40017ad18afaSChristoph Hellwig 	}
40027ad18afaSChristoph Hellwig 
4003bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4004bc1ecc65SIlya Dryomov 
4005bc1ecc65SIlya Dryomov 	if (!length) {
4006bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4007bc1ecc65SIlya Dryomov 		result = 0;
4008bc1ecc65SIlya Dryomov 		goto err_rq;
4009bc1ecc65SIlya Dryomov 	}
4010bc1ecc65SIlya Dryomov 
40116d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
4012bc1ecc65SIlya Dryomov 
40136d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
4014bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
4015bc1ecc65SIlya Dryomov 			result = -EROFS;
4016bc1ecc65SIlya Dryomov 			goto err_rq;
4017bc1ecc65SIlya Dryomov 		}
4018bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4019bc1ecc65SIlya Dryomov 	}
4020bc1ecc65SIlya Dryomov 
4021bc1ecc65SIlya Dryomov 	/*
4022bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4023bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4024bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4025bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4026bc1ecc65SIlya Dryomov 	 */
4027bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4028bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4029bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4030bc1ecc65SIlya Dryomov 		result = -ENXIO;
4031bc1ecc65SIlya Dryomov 		goto err_rq;
4032bc1ecc65SIlya Dryomov 	}
4033bc1ecc65SIlya Dryomov 
4034bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4035bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4036bc1ecc65SIlya Dryomov 			 length);
4037bc1ecc65SIlya Dryomov 		result = -EINVAL;
4038bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4039bc1ecc65SIlya Dryomov 	}
4040bc1ecc65SIlya Dryomov 
40417ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
40427ad18afaSChristoph Hellwig 
40434e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
40444e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
40456d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
40464e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
40474e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
40484e752f0aSJosh Durgin 	}
40494e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
40504e752f0aSJosh Durgin 
40514e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4052bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
40534e752f0aSJosh Durgin 			 length, mapping_size);
4054bc1ecc65SIlya Dryomov 		result = -EIO;
4055bc1ecc65SIlya Dryomov 		goto err_rq;
4056bc1ecc65SIlya Dryomov 	}
4057bc1ecc65SIlya Dryomov 
4058*f9bebd58SIlya Dryomov 	must_be_locked =
4059*f9bebd58SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
4060*f9bebd58SIlya Dryomov 	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
4061ed95b21aSIlya Dryomov 	if (must_be_locked) {
4062ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
406387c0fdedSIlya Dryomov 		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
406487c0fdedSIlya Dryomov 		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
4065ed95b21aSIlya Dryomov 			rbd_wait_state_locked(rbd_dev);
406687c0fdedSIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
406787c0fdedSIlya Dryomov 			result = -EBLACKLISTED;
406887c0fdedSIlya Dryomov 			goto err_unlock;
406987c0fdedSIlya Dryomov 		}
4070ed95b21aSIlya Dryomov 	}
4071ed95b21aSIlya Dryomov 
40726d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
40734e752f0aSJosh Durgin 					     snapc);
4074bc1ecc65SIlya Dryomov 	if (!img_request) {
4075bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4076ed95b21aSIlya Dryomov 		goto err_unlock;
4077bc1ecc65SIlya Dryomov 	}
4078bc1ecc65SIlya Dryomov 	img_request->rq = rq;
407970b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4080bc1ecc65SIlya Dryomov 
408190e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
408290e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
408390e98c52SGuangliang Zhao 					      NULL);
408490e98c52SGuangliang Zhao 	else
408590e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
408690e98c52SGuangliang Zhao 					      rq->bio);
4087bc1ecc65SIlya Dryomov 	if (result)
4088bc1ecc65SIlya Dryomov 		goto err_img_request;
4089bc1ecc65SIlya Dryomov 
4090bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
4091bc1ecc65SIlya Dryomov 	if (result)
4092bc1ecc65SIlya Dryomov 		goto err_img_request;
4093bc1ecc65SIlya Dryomov 
4094ed95b21aSIlya Dryomov 	if (must_be_locked)
4095ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4096bc1ecc65SIlya Dryomov 	return;
4097bc1ecc65SIlya Dryomov 
4098bc1ecc65SIlya Dryomov err_img_request:
4099bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4100ed95b21aSIlya Dryomov err_unlock:
4101ed95b21aSIlya Dryomov 	if (must_be_locked)
4102ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4103bc1ecc65SIlya Dryomov err_rq:
4104bc1ecc65SIlya Dryomov 	if (result)
4105bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
41066d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
41074e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
41087ad18afaSChristoph Hellwig err:
41097ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
4110bc1ecc65SIlya Dryomov }
4111bc1ecc65SIlya Dryomov 
41127ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
41137ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4114bc1ecc65SIlya Dryomov {
41157ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
41167ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4117bc1ecc65SIlya Dryomov 
41187ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
41197ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
4120bf0d5f50SAlex Elder }
4121bf0d5f50SAlex Elder 
4122602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4123602adf40SYehuda Sadeh {
41245769ed0cSIlya Dryomov 	blk_cleanup_queue(rbd_dev->disk->queue);
41257ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
41265769ed0cSIlya Dryomov 	put_disk(rbd_dev->disk);
41275769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
4128602adf40SYehuda Sadeh }
4129602adf40SYehuda Sadeh 
4130788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4131fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4132fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4133fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4134788e2df3SAlex Elder 
4135788e2df3SAlex Elder {
4136fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4137fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4138fe5478e0SIlya Dryomov 	struct page **pages;
4139fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4140788e2df3SAlex Elder 	int ret;
4141788e2df3SAlex Elder 
4142fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4143fe5478e0SIlya Dryomov 	if (!req)
4144fe5478e0SIlya Dryomov 		return -ENOMEM;
4145788e2df3SAlex Elder 
4146fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4147fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4148fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4149788e2df3SAlex Elder 
4150fe5478e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4151788e2df3SAlex Elder 	if (ret)
4152fe5478e0SIlya Dryomov 		goto out_req;
4153788e2df3SAlex Elder 
4154fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4155fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4156fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4157fe5478e0SIlya Dryomov 		goto out_req;
4158fe5478e0SIlya Dryomov 	}
41591ceae7efSAlex Elder 
4160fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4161fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4162fe5478e0SIlya Dryomov 					 true);
4163788e2df3SAlex Elder 
4164fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4165fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4166fe5478e0SIlya Dryomov 	if (ret >= 0)
4167fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4168fe5478e0SIlya Dryomov 
4169fe5478e0SIlya Dryomov out_req:
4170fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4171788e2df3SAlex Elder 	return ret;
4172788e2df3SAlex Elder }
4173788e2df3SAlex Elder 
4174602adf40SYehuda Sadeh /*
4175662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4176662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4177662518b1SAlex Elder  * information about the image.
41784156d998SAlex Elder  */
417999a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
41804156d998SAlex Elder {
41814156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
41824156d998SAlex Elder 	u32 snap_count = 0;
41834156d998SAlex Elder 	u64 names_size = 0;
41844156d998SAlex Elder 	u32 want_count;
41854156d998SAlex Elder 	int ret;
41864156d998SAlex Elder 
41874156d998SAlex Elder 	/*
41884156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
41894156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
41904156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
41914156d998SAlex Elder 	 * the number of snapshots could change by the time we read
41924156d998SAlex Elder 	 * it in, in which case we re-read it.
41934156d998SAlex Elder 	 */
41944156d998SAlex Elder 	do {
41954156d998SAlex Elder 		size_t size;
41964156d998SAlex Elder 
41974156d998SAlex Elder 		kfree(ondisk);
41984156d998SAlex Elder 
41994156d998SAlex Elder 		size = sizeof (*ondisk);
42004156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
42014156d998SAlex Elder 		size += names_size;
42024156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
42034156d998SAlex Elder 		if (!ondisk)
4204662518b1SAlex Elder 			return -ENOMEM;
42054156d998SAlex Elder 
4206fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4207fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
42084156d998SAlex Elder 		if (ret < 0)
4209662518b1SAlex Elder 			goto out;
4210c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
42114156d998SAlex Elder 			ret = -ENXIO;
421206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
421306ecc6cbSAlex Elder 				size, ret);
4214662518b1SAlex Elder 			goto out;
42154156d998SAlex Elder 		}
42164156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
42174156d998SAlex Elder 			ret = -ENXIO;
421806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4219662518b1SAlex Elder 			goto out;
42204156d998SAlex Elder 		}
42214156d998SAlex Elder 
42224156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
42234156d998SAlex Elder 		want_count = snap_count;
42244156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
42254156d998SAlex Elder 	} while (snap_count != want_count);
42264156d998SAlex Elder 
4227662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4228662518b1SAlex Elder out:
42294156d998SAlex Elder 	kfree(ondisk);
42304156d998SAlex Elder 
4231dfc5606dSYehuda Sadeh 	return ret;
4232602adf40SYehuda Sadeh }
4233602adf40SYehuda Sadeh 
423415228edeSAlex Elder /*
423515228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
423615228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
423715228edeSAlex Elder  */
423815228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
423915228edeSAlex Elder {
424015228edeSAlex Elder 	u64 snap_id;
424115228edeSAlex Elder 
424215228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
424315228edeSAlex Elder 		return;
424415228edeSAlex Elder 
424515228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
424615228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
424715228edeSAlex Elder 		return;
424815228edeSAlex Elder 
424915228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
425015228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
425115228edeSAlex Elder }
425215228edeSAlex Elder 
42539875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
42549875201eSJosh Durgin {
42559875201eSJosh Durgin 	sector_t size;
42569875201eSJosh Durgin 
42579875201eSJosh Durgin 	/*
4258811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4259811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4260811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
42619875201eSJosh Durgin 	 */
4262811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4263811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
42649875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
42659875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
42669875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
42679875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
42689875201eSJosh Durgin 	}
42699875201eSJosh Durgin }
42709875201eSJosh Durgin 
4271cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
42721fe5e993SAlex Elder {
4273e627db08SAlex Elder 	u64 mapping_size;
42741fe5e993SAlex Elder 	int ret;
42751fe5e993SAlex Elder 
4276cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
42773b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4278a720ae09SIlya Dryomov 
4279a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
428052bb1f9bSIlya Dryomov 	if (ret)
428173e39e4dSIlya Dryomov 		goto out;
428215228edeSAlex Elder 
4283e8f59b59SIlya Dryomov 	/*
4284e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4285e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4286e8f59b59SIlya Dryomov 	 */
4287e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4288e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4289e8f59b59SIlya Dryomov 		if (ret)
429073e39e4dSIlya Dryomov 			goto out;
4291e8f59b59SIlya Dryomov 	}
4292e8f59b59SIlya Dryomov 
42935ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
42945ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
42955ff1108cSIlya Dryomov 	} else {
42965ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
429715228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
42985ff1108cSIlya Dryomov 	}
42995ff1108cSIlya Dryomov 
430073e39e4dSIlya Dryomov out:
4301cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
430273e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
43039875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
43041fe5e993SAlex Elder 
430573e39e4dSIlya Dryomov 	return ret;
43061fe5e993SAlex Elder }
43071fe5e993SAlex Elder 
43087ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
43097ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
43107ad18afaSChristoph Hellwig 		unsigned int numa_node)
43117ad18afaSChristoph Hellwig {
43127ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
43137ad18afaSChristoph Hellwig 
43147ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
43157ad18afaSChristoph Hellwig 	return 0;
43167ad18afaSChristoph Hellwig }
43177ad18afaSChristoph Hellwig 
43187ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
43197ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
43207ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
43217ad18afaSChristoph Hellwig };
43227ad18afaSChristoph Hellwig 
4323602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4324602adf40SYehuda Sadeh {
4325602adf40SYehuda Sadeh 	struct gendisk *disk;
4326602adf40SYehuda Sadeh 	struct request_queue *q;
4327593a9e7bSAlex Elder 	u64 segment_size;
43287ad18afaSChristoph Hellwig 	int err;
4329602adf40SYehuda Sadeh 
4330602adf40SYehuda Sadeh 	/* create gendisk info */
43317e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
43327e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
43337e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4334602adf40SYehuda Sadeh 	if (!disk)
43351fcdb8aaSAlex Elder 		return -ENOMEM;
4336602adf40SYehuda Sadeh 
4337f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4338de71a297SAlex Elder 		 rbd_dev->dev_id);
4339602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4340dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
43417e513d43SIlya Dryomov 	if (single_major)
43427e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4343602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4344602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4345602adf40SYehuda Sadeh 
43467ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
43477ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4348b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
43497ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4350b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
43517ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
43527ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
43537ad18afaSChristoph Hellwig 
43547ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
43557ad18afaSChristoph Hellwig 	if (err)
4356602adf40SYehuda Sadeh 		goto out_disk;
4357029bcbd8SJosh Durgin 
43587ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
43597ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
43607ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
43617ad18afaSChristoph Hellwig 		goto out_tag_set;
43627ad18afaSChristoph Hellwig 	}
43637ad18afaSChristoph Hellwig 
4364d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4365d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4366593a9e7bSAlex Elder 
4367029bcbd8SJosh Durgin 	/* set io sizes to object size */
4368593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
4369593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
43700d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
4371d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
4372593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
4373593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
4374593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
4375029bcbd8SJosh Durgin 
437690e98c52SGuangliang Zhao 	/* enable the discard support */
437790e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
437890e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
437990e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
43802bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
4381b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
438290e98c52SGuangliang Zhao 
4383bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4384dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
4385bae818eeSRonny Hegewald 
43865769ed0cSIlya Dryomov 	/*
43875769ed0cSIlya Dryomov 	 * disk_release() expects a queue ref from add_disk() and will
43885769ed0cSIlya Dryomov 	 * put it.  Hold an extra ref until add_disk() is called.
43895769ed0cSIlya Dryomov 	 */
43905769ed0cSIlya Dryomov 	WARN_ON(!blk_get_queue(q));
4391602adf40SYehuda Sadeh 	disk->queue = q;
4392602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4393602adf40SYehuda Sadeh 
4394602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4395602adf40SYehuda Sadeh 
4396602adf40SYehuda Sadeh 	return 0;
43977ad18afaSChristoph Hellwig out_tag_set:
43987ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4399602adf40SYehuda Sadeh out_disk:
4400602adf40SYehuda Sadeh 	put_disk(disk);
44017ad18afaSChristoph Hellwig 	return err;
4402602adf40SYehuda Sadeh }
4403602adf40SYehuda Sadeh 
4404dfc5606dSYehuda Sadeh /*
4405dfc5606dSYehuda Sadeh   sysfs
4406dfc5606dSYehuda Sadeh */
4407602adf40SYehuda Sadeh 
4408593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4409593a9e7bSAlex Elder {
4410593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4411593a9e7bSAlex Elder }
4412593a9e7bSAlex Elder 
4413dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4414dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4415602adf40SYehuda Sadeh {
4416593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4417dfc5606dSYehuda Sadeh 
4418fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4419fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4420602adf40SYehuda Sadeh }
4421602adf40SYehuda Sadeh 
442234b13184SAlex Elder /*
442334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
442434b13184SAlex Elder  * necessarily the base image.
442534b13184SAlex Elder  */
442634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
442734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
442834b13184SAlex Elder {
442934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
443034b13184SAlex Elder 
443134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
443234b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
443334b13184SAlex Elder }
443434b13184SAlex Elder 
4435dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4436dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4437602adf40SYehuda Sadeh {
4438593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4439dfc5606dSYehuda Sadeh 
4440fc71d833SAlex Elder 	if (rbd_dev->major)
4441dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4442fc71d833SAlex Elder 
4443fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4444dd82fff1SIlya Dryomov }
4445fc71d833SAlex Elder 
4446dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4447dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4448dd82fff1SIlya Dryomov {
4449dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4450dd82fff1SIlya Dryomov 
4451dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4452dfc5606dSYehuda Sadeh }
4453dfc5606dSYehuda Sadeh 
4454005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4455005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4456005a07bfSIlya Dryomov {
4457005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4458005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4459005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4460005a07bfSIlya Dryomov 
4461005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4462005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4463005a07bfSIlya Dryomov }
4464005a07bfSIlya Dryomov 
4465dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4466dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4467dfc5606dSYehuda Sadeh {
4468593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4469dfc5606dSYehuda Sadeh 
44701dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4471033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4472dfc5606dSYehuda Sadeh }
4473dfc5606dSYehuda Sadeh 
4474267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4475267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4476267fb90bSMike Christie {
4477267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4478267fb90bSMike Christie 
4479267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4480267fb90bSMike Christie }
4481267fb90bSMike Christie 
44820d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
44830d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
44840d6d1e9cSMike Christie {
44850d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
44860d6d1e9cSMike Christie 
44870d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4488dfc5606dSYehuda Sadeh }
4489dfc5606dSYehuda Sadeh 
4490dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4491dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4492dfc5606dSYehuda Sadeh {
4493593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4494dfc5606dSYehuda Sadeh 
44950d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4496dfc5606dSYehuda Sadeh }
4497dfc5606dSYehuda Sadeh 
44989bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
44999bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
45009bb2f334SAlex Elder {
45019bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45029bb2f334SAlex Elder 
45030d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
45040d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
45059bb2f334SAlex Elder }
45069bb2f334SAlex Elder 
4507dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4508dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4509dfc5606dSYehuda Sadeh {
4510593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4511dfc5606dSYehuda Sadeh 
4512a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
45130d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4514a92ffdf8SAlex Elder 
4515a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4516dfc5606dSYehuda Sadeh }
4517dfc5606dSYehuda Sadeh 
4518589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4519589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4520589d30e0SAlex Elder {
4521589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4522589d30e0SAlex Elder 
45230d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4524589d30e0SAlex Elder }
4525589d30e0SAlex Elder 
452634b13184SAlex Elder /*
452734b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
452834b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
452934b13184SAlex Elder  */
4530dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4531dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4532dfc5606dSYehuda Sadeh 			     char *buf)
4533dfc5606dSYehuda Sadeh {
4534593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4535dfc5606dSYehuda Sadeh 
45360d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4537dfc5606dSYehuda Sadeh }
4538dfc5606dSYehuda Sadeh 
453992a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
454092a58671SMike Christie 				struct device_attribute *attr, char *buf)
454192a58671SMike Christie {
454292a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
454392a58671SMike Christie 
454492a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
454592a58671SMike Christie }
454692a58671SMike Christie 
454786b00e0dSAlex Elder /*
4548ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4549ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4550ff96128fSIlya Dryomov  * image)".
455186b00e0dSAlex Elder  */
455286b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
455386b00e0dSAlex Elder 			       struct device_attribute *attr,
455486b00e0dSAlex Elder 			       char *buf)
455586b00e0dSAlex Elder {
455686b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4557ff96128fSIlya Dryomov 	ssize_t count = 0;
455886b00e0dSAlex Elder 
4559ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
456086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
456186b00e0dSAlex Elder 
4562ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4563ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
456486b00e0dSAlex Elder 
4565ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4566ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4567ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4568ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4569ff96128fSIlya Dryomov 			    "overlap %llu\n",
4570ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4571ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4572ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4573ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4574ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4575ff96128fSIlya Dryomov 	}
457686b00e0dSAlex Elder 
457786b00e0dSAlex Elder 	return count;
457886b00e0dSAlex Elder }
457986b00e0dSAlex Elder 
4580dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4581dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4582dfc5606dSYehuda Sadeh 				 const char *buf,
4583dfc5606dSYehuda Sadeh 				 size_t size)
4584dfc5606dSYehuda Sadeh {
4585593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4586b813623aSAlex Elder 	int ret;
4587602adf40SYehuda Sadeh 
4588cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4589e627db08SAlex Elder 	if (ret)
459052bb1f9bSIlya Dryomov 		return ret;
4591b813623aSAlex Elder 
459252bb1f9bSIlya Dryomov 	return size;
4593dfc5606dSYehuda Sadeh }
4594602adf40SYehuda Sadeh 
4595dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
459634b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
4597dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
4598dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
4599005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
4600dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
4601267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
46020d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
4603dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
46049bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
4605dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
4606589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
4607dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4608dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
460992a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
461086b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
4611dfc5606dSYehuda Sadeh 
4612dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4613dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
461434b13184SAlex Elder 	&dev_attr_features.attr,
4615dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4616dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4617005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4618dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4619267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
46200d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4621dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
46229bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4623dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4624589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4625dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
462692a58671SMike Christie 	&dev_attr_snap_id.attr,
462786b00e0dSAlex Elder 	&dev_attr_parent.attr,
4628dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4629dfc5606dSYehuda Sadeh 	NULL
4630dfc5606dSYehuda Sadeh };
4631dfc5606dSYehuda Sadeh 
4632dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4633dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4634dfc5606dSYehuda Sadeh };
4635dfc5606dSYehuda Sadeh 
4636dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4637dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4638dfc5606dSYehuda Sadeh 	NULL
4639dfc5606dSYehuda Sadeh };
4640dfc5606dSYehuda Sadeh 
46416cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4642dfc5606dSYehuda Sadeh 
4643b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
4644dfc5606dSYehuda Sadeh 	.name		= "rbd",
4645dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
46466cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4647dfc5606dSYehuda Sadeh };
4648dfc5606dSYehuda Sadeh 
46498b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
46508b8fb99cSAlex Elder {
46518b8fb99cSAlex Elder 	kref_get(&spec->kref);
46528b8fb99cSAlex Elder 
46538b8fb99cSAlex Elder 	return spec;
46548b8fb99cSAlex Elder }
46558b8fb99cSAlex Elder 
46568b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
46578b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
46588b8fb99cSAlex Elder {
46598b8fb99cSAlex Elder 	if (spec)
46608b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
46618b8fb99cSAlex Elder }
46628b8fb99cSAlex Elder 
46638b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
46648b8fb99cSAlex Elder {
46658b8fb99cSAlex Elder 	struct rbd_spec *spec;
46668b8fb99cSAlex Elder 
46678b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
46688b8fb99cSAlex Elder 	if (!spec)
46698b8fb99cSAlex Elder 		return NULL;
467004077599SIlya Dryomov 
467104077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
467204077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
46738b8fb99cSAlex Elder 	kref_init(&spec->kref);
46748b8fb99cSAlex Elder 
46758b8fb99cSAlex Elder 	return spec;
46768b8fb99cSAlex Elder }
46778b8fb99cSAlex Elder 
46788b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
46798b8fb99cSAlex Elder {
46808b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
46818b8fb99cSAlex Elder 
46828b8fb99cSAlex Elder 	kfree(spec->pool_name);
46838b8fb99cSAlex Elder 	kfree(spec->image_id);
46848b8fb99cSAlex Elder 	kfree(spec->image_name);
46858b8fb99cSAlex Elder 	kfree(spec->snap_name);
46868b8fb99cSAlex Elder 	kfree(spec);
46878b8fb99cSAlex Elder }
46888b8fb99cSAlex Elder 
46891643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4690dd5ac32dSIlya Dryomov {
469199d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4692ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4693dd5ac32dSIlya Dryomov 
4694c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
46956b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
46960d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4697c41d13a3SIlya Dryomov 
4698dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4699dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4700dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4701dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
47021643dfa4SIlya Dryomov }
47031643dfa4SIlya Dryomov 
47041643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
47051643dfa4SIlya Dryomov {
47061643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
47071643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
47081643dfa4SIlya Dryomov 
47091643dfa4SIlya Dryomov 	if (need_put) {
47101643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
47111643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
47121643dfa4SIlya Dryomov 	}
47131643dfa4SIlya Dryomov 
47141643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4715dd5ac32dSIlya Dryomov 
4716dd5ac32dSIlya Dryomov 	/*
4717dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4718dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4719dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4720dd5ac32dSIlya Dryomov 	 */
4721dd5ac32dSIlya Dryomov 	if (need_put)
4722dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4723dd5ac32dSIlya Dryomov }
4724dd5ac32dSIlya Dryomov 
47251643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
47261643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4727c53d5893SAlex Elder {
4728c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4729c53d5893SAlex Elder 
4730c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4731c53d5893SAlex Elder 	if (!rbd_dev)
4732c53d5893SAlex Elder 		return NULL;
4733c53d5893SAlex Elder 
4734c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4735c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4736c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4737c53d5893SAlex Elder 
47387e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4739c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4740431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4741c41d13a3SIlya Dryomov 
474299d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
474399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
474499d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
474599d16943SIlya Dryomov 
4746ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4747ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4748ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4749ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4750ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4751ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4752ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4753ed95b21aSIlya Dryomov 
4754dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4755dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4756dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4757dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4758dd5ac32dSIlya Dryomov 
4759c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4760d147543dSIlya Dryomov 	rbd_dev->spec = spec;
47610903e875SAlex Elder 
47621643dfa4SIlya Dryomov 	return rbd_dev;
47631643dfa4SIlya Dryomov }
47641643dfa4SIlya Dryomov 
4765dd5ac32dSIlya Dryomov /*
47661643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4767dd5ac32dSIlya Dryomov  */
47681643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
47691643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
47701643dfa4SIlya Dryomov 					 struct rbd_options *opts)
47711643dfa4SIlya Dryomov {
47721643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
47731643dfa4SIlya Dryomov 
47741643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
47751643dfa4SIlya Dryomov 	if (!rbd_dev)
47761643dfa4SIlya Dryomov 		return NULL;
47771643dfa4SIlya Dryomov 
47781643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
47791643dfa4SIlya Dryomov 
47801643dfa4SIlya Dryomov 	/* get an id and fill in device name */
47811643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
47821643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
47831643dfa4SIlya Dryomov 					 GFP_KERNEL);
47841643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
47851643dfa4SIlya Dryomov 		goto fail_rbd_dev;
47861643dfa4SIlya Dryomov 
47871643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
47881643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
47891643dfa4SIlya Dryomov 						   rbd_dev->name);
47901643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
47911643dfa4SIlya Dryomov 		goto fail_dev_id;
47921643dfa4SIlya Dryomov 
47931643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4794dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4795dd5ac32dSIlya Dryomov 
47961643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4797c53d5893SAlex Elder 	return rbd_dev;
47981643dfa4SIlya Dryomov 
47991643dfa4SIlya Dryomov fail_dev_id:
48001643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
48011643dfa4SIlya Dryomov fail_rbd_dev:
48021643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
48031643dfa4SIlya Dryomov 	return NULL;
4804c53d5893SAlex Elder }
4805c53d5893SAlex Elder 
4806c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4807c53d5893SAlex Elder {
4808dd5ac32dSIlya Dryomov 	if (rbd_dev)
4809dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4810c53d5893SAlex Elder }
4811c53d5893SAlex Elder 
4812dfc5606dSYehuda Sadeh /*
48139d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
48149d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
48159d475de5SAlex Elder  * image.
48169d475de5SAlex Elder  */
48179d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
48189d475de5SAlex Elder 				u8 *order, u64 *snap_size)
48199d475de5SAlex Elder {
48209d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
48219d475de5SAlex Elder 	int ret;
48229d475de5SAlex Elder 	struct {
48239d475de5SAlex Elder 		u8 order;
48249d475de5SAlex Elder 		__le64 size;
48259d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
48269d475de5SAlex Elder 
4827ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4828ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
48294157976bSAlex Elder 				  &snapid, sizeof(snapid),
4830e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
483136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
48329d475de5SAlex Elder 	if (ret < 0)
48339d475de5SAlex Elder 		return ret;
483457385b51SAlex Elder 	if (ret < sizeof (size_buf))
483557385b51SAlex Elder 		return -ERANGE;
48369d475de5SAlex Elder 
4837c3545579SJosh Durgin 	if (order) {
48389d475de5SAlex Elder 		*order = size_buf.order;
4839c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4840c3545579SJosh Durgin 	}
48419d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
48429d475de5SAlex Elder 
4843c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4844c3545579SJosh Durgin 		(unsigned long long)snap_id,
48459d475de5SAlex Elder 		(unsigned long long)*snap_size);
48469d475de5SAlex Elder 
48479d475de5SAlex Elder 	return 0;
48489d475de5SAlex Elder }
48499d475de5SAlex Elder 
48509d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
48519d475de5SAlex Elder {
48529d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
48539d475de5SAlex Elder 					&rbd_dev->header.obj_order,
48549d475de5SAlex Elder 					&rbd_dev->header.image_size);
48559d475de5SAlex Elder }
48569d475de5SAlex Elder 
48571e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
48581e130199SAlex Elder {
48591e130199SAlex Elder 	void *reply_buf;
48601e130199SAlex Elder 	int ret;
48611e130199SAlex Elder 	void *p;
48621e130199SAlex Elder 
48631e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
48641e130199SAlex Elder 	if (!reply_buf)
48651e130199SAlex Elder 		return -ENOMEM;
48661e130199SAlex Elder 
4867ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4868ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4869ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
487036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
48711e130199SAlex Elder 	if (ret < 0)
48721e130199SAlex Elder 		goto out;
48731e130199SAlex Elder 
48741e130199SAlex Elder 	p = reply_buf;
48751e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
487657385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
487757385b51SAlex Elder 	ret = 0;
48781e130199SAlex Elder 
48791e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
48801e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
48811e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
48821e130199SAlex Elder 	} else {
48831e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
48841e130199SAlex Elder 	}
48851e130199SAlex Elder out:
48861e130199SAlex Elder 	kfree(reply_buf);
48871e130199SAlex Elder 
48881e130199SAlex Elder 	return ret;
48891e130199SAlex Elder }
48901e130199SAlex Elder 
4891b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4892b1b5402aSAlex Elder 		u64 *snap_features)
4893b1b5402aSAlex Elder {
4894b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4895b1b5402aSAlex Elder 	struct {
4896b1b5402aSAlex Elder 		__le64 features;
4897b1b5402aSAlex Elder 		__le64 incompat;
48984157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4899d3767f0fSIlya Dryomov 	u64 unsup;
4900b1b5402aSAlex Elder 	int ret;
4901b1b5402aSAlex Elder 
4902ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4903ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
49044157976bSAlex Elder 				  &snapid, sizeof(snapid),
4905e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
490636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4907b1b5402aSAlex Elder 	if (ret < 0)
4908b1b5402aSAlex Elder 		return ret;
490957385b51SAlex Elder 	if (ret < sizeof (features_buf))
491057385b51SAlex Elder 		return -ERANGE;
4911d889140cSAlex Elder 
4912d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4913d3767f0fSIlya Dryomov 	if (unsup) {
4914d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4915d3767f0fSIlya Dryomov 			 unsup);
4916b8f5c6edSAlex Elder 		return -ENXIO;
4917d3767f0fSIlya Dryomov 	}
4918d889140cSAlex Elder 
4919b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4920b1b5402aSAlex Elder 
4921b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4922b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4923b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4924b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4925b1b5402aSAlex Elder 
4926b1b5402aSAlex Elder 	return 0;
4927b1b5402aSAlex Elder }
4928b1b5402aSAlex Elder 
4929b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4930b1b5402aSAlex Elder {
4931b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4932b1b5402aSAlex Elder 						&rbd_dev->header.features);
4933b1b5402aSAlex Elder }
4934b1b5402aSAlex Elder 
493586b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
493686b00e0dSAlex Elder {
493786b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
493886b00e0dSAlex Elder 	size_t size;
493986b00e0dSAlex Elder 	void *reply_buf = NULL;
494086b00e0dSAlex Elder 	__le64 snapid;
494186b00e0dSAlex Elder 	void *p;
494286b00e0dSAlex Elder 	void *end;
4943642a2537SAlex Elder 	u64 pool_id;
494486b00e0dSAlex Elder 	char *image_id;
49453b5cf2a2SAlex Elder 	u64 snap_id;
494686b00e0dSAlex Elder 	u64 overlap;
494786b00e0dSAlex Elder 	int ret;
494886b00e0dSAlex Elder 
494986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
495086b00e0dSAlex Elder 	if (!parent_spec)
495186b00e0dSAlex Elder 		return -ENOMEM;
495286b00e0dSAlex Elder 
495386b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
495486b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
495586b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
495686b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
495786b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
495886b00e0dSAlex Elder 	if (!reply_buf) {
495986b00e0dSAlex Elder 		ret = -ENOMEM;
496086b00e0dSAlex Elder 		goto out_err;
496186b00e0dSAlex Elder 	}
496286b00e0dSAlex Elder 
49634d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
4964ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4965ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_parent",
4966ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
496736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
496886b00e0dSAlex Elder 	if (ret < 0)
496986b00e0dSAlex Elder 		goto out_err;
497086b00e0dSAlex Elder 
497186b00e0dSAlex Elder 	p = reply_buf;
497257385b51SAlex Elder 	end = reply_buf + ret;
497357385b51SAlex Elder 	ret = -ERANGE;
4974642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4975392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4976392a9dadSAlex Elder 		/*
4977392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4978392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4979392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4980392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4981392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4982392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4983392a9dadSAlex Elder 		 * parent.
4984392a9dadSAlex Elder 		 */
4985392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4986392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4987392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4988392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4989392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4990392a9dadSAlex Elder 		}
4991392a9dadSAlex Elder 
499286b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4993392a9dadSAlex Elder 	}
499486b00e0dSAlex Elder 
49950903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49960903e875SAlex Elder 
49970903e875SAlex Elder 	ret = -EIO;
4998642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
49999584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5000642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
500157385b51SAlex Elder 		goto out_err;
5002c0cd10dbSAlex Elder 	}
50030903e875SAlex Elder 
5004979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
500586b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
500686b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
500786b00e0dSAlex Elder 		goto out_err;
500886b00e0dSAlex Elder 	}
50093b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
501086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
501186b00e0dSAlex Elder 
50123b5cf2a2SAlex Elder 	/*
50133b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
50143b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
50153b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
50163b5cf2a2SAlex Elder 	 */
50173b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
50183b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
50193b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
50203b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
502186b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
502286b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
5023fbba11b3SIlya Dryomov 	} else {
5024fbba11b3SIlya Dryomov 		kfree(image_id);
50253b5cf2a2SAlex Elder 	}
50263b5cf2a2SAlex Elder 
50273b5cf2a2SAlex Elder 	/*
5028cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5029cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
50303b5cf2a2SAlex Elder 	 */
50313b5cf2a2SAlex Elder 	if (!overlap) {
50323b5cf2a2SAlex Elder 		if (parent_spec) {
5033cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5034cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5035cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5036cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
503770cf49cfSAlex Elder 		} else {
5038cf32bd9cSIlya Dryomov 			/* initial probe */
5039cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
50403b5cf2a2SAlex Elder 		}
504170cf49cfSAlex Elder 	}
5042cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
5043cf32bd9cSIlya Dryomov 
504486b00e0dSAlex Elder out:
504586b00e0dSAlex Elder 	ret = 0;
504686b00e0dSAlex Elder out_err:
504786b00e0dSAlex Elder 	kfree(reply_buf);
504886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
504986b00e0dSAlex Elder 
505086b00e0dSAlex Elder 	return ret;
505186b00e0dSAlex Elder }
505286b00e0dSAlex Elder 
5053cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5054cc070d59SAlex Elder {
5055cc070d59SAlex Elder 	struct {
5056cc070d59SAlex Elder 		__le64 stripe_unit;
5057cc070d59SAlex Elder 		__le64 stripe_count;
5058cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5059cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5060cc070d59SAlex Elder 	void *p;
5061cc070d59SAlex Elder 	u64 obj_size;
5062cc070d59SAlex Elder 	u64 stripe_unit;
5063cc070d59SAlex Elder 	u64 stripe_count;
5064cc070d59SAlex Elder 	int ret;
5065cc070d59SAlex Elder 
5066ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5067ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5068ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5069cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5070cc070d59SAlex Elder 	if (ret < 0)
5071cc070d59SAlex Elder 		return ret;
5072cc070d59SAlex Elder 	if (ret < size)
5073cc070d59SAlex Elder 		return -ERANGE;
5074cc070d59SAlex Elder 
5075cc070d59SAlex Elder 	/*
5076cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
5077cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
5078cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
5079cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
5080cc070d59SAlex Elder 	 */
5081cc070d59SAlex Elder 	ret = -EINVAL;
50825bc3fb17SIlya Dryomov 	obj_size = rbd_obj_bytes(&rbd_dev->header);
5083cc070d59SAlex Elder 	p = &striping_info_buf;
5084cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
5085cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
5086cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
5087cc070d59SAlex Elder 				"(got %llu want %llu)",
5088cc070d59SAlex Elder 				stripe_unit, obj_size);
5089cc070d59SAlex Elder 		return -EINVAL;
5090cc070d59SAlex Elder 	}
5091cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
5092cc070d59SAlex Elder 	if (stripe_count != 1) {
5093cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
5094cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
5095cc070d59SAlex Elder 		return -EINVAL;
5096cc070d59SAlex Elder 	}
5097500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
5098500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
5099cc070d59SAlex Elder 
5100cc070d59SAlex Elder 	return 0;
5101cc070d59SAlex Elder }
5102cc070d59SAlex Elder 
51037e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
51047e97332eSIlya Dryomov {
51057e97332eSIlya Dryomov 	__le64 data_pool_id;
51067e97332eSIlya Dryomov 	int ret;
51077e97332eSIlya Dryomov 
51087e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
51097e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
51107e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
51117e97332eSIlya Dryomov 	if (ret < 0)
51127e97332eSIlya Dryomov 		return ret;
51137e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
51147e97332eSIlya Dryomov 		return -EBADMSG;
51157e97332eSIlya Dryomov 
51167e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
51177e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
51187e97332eSIlya Dryomov 	return 0;
51197e97332eSIlya Dryomov }
51207e97332eSIlya Dryomov 
51219e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
51229e15b77dSAlex Elder {
5123ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
51249e15b77dSAlex Elder 	size_t image_id_size;
51259e15b77dSAlex Elder 	char *image_id;
51269e15b77dSAlex Elder 	void *p;
51279e15b77dSAlex Elder 	void *end;
51289e15b77dSAlex Elder 	size_t size;
51299e15b77dSAlex Elder 	void *reply_buf = NULL;
51309e15b77dSAlex Elder 	size_t len = 0;
51319e15b77dSAlex Elder 	char *image_name = NULL;
51329e15b77dSAlex Elder 	int ret;
51339e15b77dSAlex Elder 
51349e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
51359e15b77dSAlex Elder 
513669e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
513769e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
51389e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
51399e15b77dSAlex Elder 	if (!image_id)
51409e15b77dSAlex Elder 		return NULL;
51419e15b77dSAlex Elder 
51429e15b77dSAlex Elder 	p = image_id;
51434157976bSAlex Elder 	end = image_id + image_id_size;
514469e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
51459e15b77dSAlex Elder 
51469e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
51479e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
51489e15b77dSAlex Elder 	if (!reply_buf)
51499e15b77dSAlex Elder 		goto out;
51509e15b77dSAlex Elder 
5151ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5152ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5153ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5154e2a58ee5SAlex Elder 				  reply_buf, size);
51559e15b77dSAlex Elder 	if (ret < 0)
51569e15b77dSAlex Elder 		goto out;
51579e15b77dSAlex Elder 	p = reply_buf;
5158f40eb349SAlex Elder 	end = reply_buf + ret;
5159f40eb349SAlex Elder 
51609e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
51619e15b77dSAlex Elder 	if (IS_ERR(image_name))
51629e15b77dSAlex Elder 		image_name = NULL;
51639e15b77dSAlex Elder 	else
51649e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
51659e15b77dSAlex Elder out:
51669e15b77dSAlex Elder 	kfree(reply_buf);
51679e15b77dSAlex Elder 	kfree(image_id);
51689e15b77dSAlex Elder 
51699e15b77dSAlex Elder 	return image_name;
51709e15b77dSAlex Elder }
51719e15b77dSAlex Elder 
51722ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51732ad3d716SAlex Elder {
51742ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51752ad3d716SAlex Elder 	const char *snap_name;
51762ad3d716SAlex Elder 	u32 which = 0;
51772ad3d716SAlex Elder 
51782ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
51792ad3d716SAlex Elder 
51802ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
51812ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
51822ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
51832ad3d716SAlex Elder 			return snapc->snaps[which];
51842ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
51852ad3d716SAlex Elder 		which++;
51862ad3d716SAlex Elder 	}
51872ad3d716SAlex Elder 	return CEPH_NOSNAP;
51882ad3d716SAlex Elder }
51892ad3d716SAlex Elder 
51902ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51912ad3d716SAlex Elder {
51922ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51932ad3d716SAlex Elder 	u32 which;
51942ad3d716SAlex Elder 	bool found = false;
51952ad3d716SAlex Elder 	u64 snap_id;
51962ad3d716SAlex Elder 
51972ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
51982ad3d716SAlex Elder 		const char *snap_name;
51992ad3d716SAlex Elder 
52002ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
52012ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5202efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5203efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5204efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5205efadc98aSJosh Durgin 				continue;
5206efadc98aSJosh Durgin 			else
52072ad3d716SAlex Elder 				break;
5208efadc98aSJosh Durgin 		}
52092ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
52102ad3d716SAlex Elder 		kfree(snap_name);
52112ad3d716SAlex Elder 	}
52122ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
52132ad3d716SAlex Elder }
52142ad3d716SAlex Elder 
52152ad3d716SAlex Elder /*
52162ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
52172ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
52182ad3d716SAlex Elder  */
52192ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52202ad3d716SAlex Elder {
52212ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
52222ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
52232ad3d716SAlex Elder 
52242ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
52252ad3d716SAlex Elder }
52262ad3d716SAlex Elder 
52279e15b77dSAlex Elder /*
522804077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
52299e15b77dSAlex Elder  */
523004077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
523104077599SIlya Dryomov {
523204077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
523304077599SIlya Dryomov 
523404077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
523504077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
523604077599SIlya Dryomov 	rbd_assert(spec->snap_name);
523704077599SIlya Dryomov 
523804077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
523904077599SIlya Dryomov 		u64 snap_id;
524004077599SIlya Dryomov 
524104077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
524204077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
524304077599SIlya Dryomov 			return -ENOENT;
524404077599SIlya Dryomov 
524504077599SIlya Dryomov 		spec->snap_id = snap_id;
524604077599SIlya Dryomov 	} else {
524704077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
524804077599SIlya Dryomov 	}
524904077599SIlya Dryomov 
525004077599SIlya Dryomov 	return 0;
525104077599SIlya Dryomov }
525204077599SIlya Dryomov 
525304077599SIlya Dryomov /*
525404077599SIlya Dryomov  * A parent image will have all ids but none of the names.
525504077599SIlya Dryomov  *
525604077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
525704077599SIlya Dryomov  * can't figure out the name for an image id.
525804077599SIlya Dryomov  */
525904077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
52609e15b77dSAlex Elder {
52612e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
52622e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
52632e9f7f1cSAlex Elder 	const char *pool_name;
52642e9f7f1cSAlex Elder 	const char *image_name;
52652e9f7f1cSAlex Elder 	const char *snap_name;
52669e15b77dSAlex Elder 	int ret;
52679e15b77dSAlex Elder 
526804077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
526904077599SIlya Dryomov 	rbd_assert(spec->image_id);
527004077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
52719e15b77dSAlex Elder 
52722e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
52739e15b77dSAlex Elder 
52742e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
52752e9f7f1cSAlex Elder 	if (!pool_name) {
52762e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5277935dc89fSAlex Elder 		return -EIO;
5278935dc89fSAlex Elder 	}
52792e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
52802e9f7f1cSAlex Elder 	if (!pool_name)
52819e15b77dSAlex Elder 		return -ENOMEM;
52829e15b77dSAlex Elder 
52839e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
52849e15b77dSAlex Elder 
52852e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
52862e9f7f1cSAlex Elder 	if (!image_name)
528706ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
52889e15b77dSAlex Elder 
528904077599SIlya Dryomov 	/* Fetch the snapshot name */
52909e15b77dSAlex Elder 
52912e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5292da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5293da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
52949e15b77dSAlex Elder 		goto out_err;
52952e9f7f1cSAlex Elder 	}
52962e9f7f1cSAlex Elder 
52972e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
52982e9f7f1cSAlex Elder 	spec->image_name = image_name;
52992e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
53009e15b77dSAlex Elder 
53019e15b77dSAlex Elder 	return 0;
530204077599SIlya Dryomov 
53039e15b77dSAlex Elder out_err:
53042e9f7f1cSAlex Elder 	kfree(image_name);
53052e9f7f1cSAlex Elder 	kfree(pool_name);
53069e15b77dSAlex Elder 	return ret;
53079e15b77dSAlex Elder }
53089e15b77dSAlex Elder 
5309cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
531035d489f9SAlex Elder {
531135d489f9SAlex Elder 	size_t size;
531235d489f9SAlex Elder 	int ret;
531335d489f9SAlex Elder 	void *reply_buf;
531435d489f9SAlex Elder 	void *p;
531535d489f9SAlex Elder 	void *end;
531635d489f9SAlex Elder 	u64 seq;
531735d489f9SAlex Elder 	u32 snap_count;
531835d489f9SAlex Elder 	struct ceph_snap_context *snapc;
531935d489f9SAlex Elder 	u32 i;
532035d489f9SAlex Elder 
532135d489f9SAlex Elder 	/*
532235d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
532335d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
532435d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
532535d489f9SAlex Elder 	 * prepared to receive.
532635d489f9SAlex Elder 	 */
532735d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
532835d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
532935d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
533035d489f9SAlex Elder 	if (!reply_buf)
533135d489f9SAlex Elder 		return -ENOMEM;
533235d489f9SAlex Elder 
5333ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5334ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
5335ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
533636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
533735d489f9SAlex Elder 	if (ret < 0)
533835d489f9SAlex Elder 		goto out;
533935d489f9SAlex Elder 
534035d489f9SAlex Elder 	p = reply_buf;
534157385b51SAlex Elder 	end = reply_buf + ret;
534257385b51SAlex Elder 	ret = -ERANGE;
534335d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
534435d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
534535d489f9SAlex Elder 
534635d489f9SAlex Elder 	/*
534735d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
534835d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
534935d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
535035d489f9SAlex Elder 	 * allocate is representable in a size_t.
535135d489f9SAlex Elder 	 */
535235d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
535335d489f9SAlex Elder 				 / sizeof (u64)) {
535435d489f9SAlex Elder 		ret = -EINVAL;
535535d489f9SAlex Elder 		goto out;
535635d489f9SAlex Elder 	}
535735d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
535835d489f9SAlex Elder 		goto out;
5359468521c1SAlex Elder 	ret = 0;
536035d489f9SAlex Elder 
5361812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
536235d489f9SAlex Elder 	if (!snapc) {
536335d489f9SAlex Elder 		ret = -ENOMEM;
536435d489f9SAlex Elder 		goto out;
536535d489f9SAlex Elder 	}
536635d489f9SAlex Elder 	snapc->seq = seq;
536735d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
536835d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
536935d489f9SAlex Elder 
537049ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
537135d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
537235d489f9SAlex Elder 
537335d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
537435d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
537535d489f9SAlex Elder out:
537635d489f9SAlex Elder 	kfree(reply_buf);
537735d489f9SAlex Elder 
537857385b51SAlex Elder 	return ret;
537935d489f9SAlex Elder }
538035d489f9SAlex Elder 
538154cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
538254cac61fSAlex Elder 					u64 snap_id)
5383b8b1e2dbSAlex Elder {
5384b8b1e2dbSAlex Elder 	size_t size;
5385b8b1e2dbSAlex Elder 	void *reply_buf;
538654cac61fSAlex Elder 	__le64 snapid;
5387b8b1e2dbSAlex Elder 	int ret;
5388b8b1e2dbSAlex Elder 	void *p;
5389b8b1e2dbSAlex Elder 	void *end;
5390b8b1e2dbSAlex Elder 	char *snap_name;
5391b8b1e2dbSAlex Elder 
5392b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5393b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5394b8b1e2dbSAlex Elder 	if (!reply_buf)
5395b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5396b8b1e2dbSAlex Elder 
539754cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5398ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5399ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5400ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
540136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5402f40eb349SAlex Elder 	if (ret < 0) {
5403f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5404b8b1e2dbSAlex Elder 		goto out;
5405f40eb349SAlex Elder 	}
5406b8b1e2dbSAlex Elder 
5407b8b1e2dbSAlex Elder 	p = reply_buf;
5408f40eb349SAlex Elder 	end = reply_buf + ret;
5409e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5410f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5411b8b1e2dbSAlex Elder 		goto out;
5412f40eb349SAlex Elder 
5413b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
541454cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5415b8b1e2dbSAlex Elder out:
5416b8b1e2dbSAlex Elder 	kfree(reply_buf);
5417b8b1e2dbSAlex Elder 
5418f40eb349SAlex Elder 	return snap_name;
5419b8b1e2dbSAlex Elder }
5420b8b1e2dbSAlex Elder 
54212df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5422117973fbSAlex Elder {
54232df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5424117973fbSAlex Elder 	int ret;
5425117973fbSAlex Elder 
54261617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
54271617e40cSJosh Durgin 	if (ret)
5428cfbf6377SAlex Elder 		return ret;
54291617e40cSJosh Durgin 
54302df3fac7SAlex Elder 	if (first_time) {
54312df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
54322df3fac7SAlex Elder 		if (ret)
5433cfbf6377SAlex Elder 			return ret;
54342df3fac7SAlex Elder 	}
54352df3fac7SAlex Elder 
5436cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5437d194cd1dSIlya Dryomov 	if (ret && first_time) {
5438d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5439d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5440d194cd1dSIlya Dryomov 	}
5441117973fbSAlex Elder 
5442117973fbSAlex Elder 	return ret;
5443117973fbSAlex Elder }
5444117973fbSAlex Elder 
5445a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5446a720ae09SIlya Dryomov {
5447a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5448a720ae09SIlya Dryomov 
5449a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5450a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5451a720ae09SIlya Dryomov 
5452a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5453a720ae09SIlya Dryomov }
5454a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token found there, i.e. the
 * number of consecutive non-white-space characters.
 *
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *const whitespace = " \f\n\r\t\v";
	const char *token = *buf + strspn(*buf, whitespace);

	*buf = token;				/* Start of token */

	return strcspn(token, whitespace);	/* Token length */
}
5473e28fff26SAlex Elder 
5474e28fff26SAlex Elder /*
5475ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5476ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5477ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5478ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5479ea3352f4SAlex Elder  *
5480ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5481ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5482ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5483ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5484ea3352f4SAlex Elder  *
5485ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5486ea3352f4SAlex Elder  * the end of the found token.
5487ea3352f4SAlex Elder  *
5488ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5489ea3352f4SAlex Elder  */
5490ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5491ea3352f4SAlex Elder {
5492ea3352f4SAlex Elder 	char *dup;
5493ea3352f4SAlex Elder 	size_t len;
5494ea3352f4SAlex Elder 
5495ea3352f4SAlex Elder 	len = next_token(buf);
54964caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5497ea3352f4SAlex Elder 	if (!dup)
5498ea3352f4SAlex Elder 		return NULL;
5499ea3352f4SAlex Elder 	*(dup + len) = '\0';
5500ea3352f4SAlex Elder 	*buf += len;
5501ea3352f4SAlex Elder 
5502ea3352f4SAlex Elder 	if (lenp)
5503ea3352f4SAlex Elder 		*lenp = len;
5504ea3352f4SAlex Elder 
5505ea3352f4SAlex Elder 	return dup;
5506ea3352f4SAlex Elder }
5507ea3352f4SAlex Elder 
5508ea3352f4SAlex Elder /*
5509859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5510859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5511859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5512859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5513d22f76e7SAlex Elder  *
5514859c31dfSAlex Elder  * The information extracted from these options is recorded in
5515859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5516859c31dfSAlex Elder  * structures:
5517859c31dfSAlex Elder  *  ceph_opts
5518859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5519859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5520859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5521859c31dfSAlex Elder  *  rbd_opts
5522859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5523859c31dfSAlex Elder  *	this function; caller must release with kfree().
5524859c31dfSAlex Elder  *  spec
5525859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5526859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5527859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5528859c31dfSAlex Elder  *
5529859c31dfSAlex Elder  * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
5548a725f65eSAlex Elder  */
5549859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5550dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5551859c31dfSAlex Elder 				struct rbd_options **opts,
5552859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5553a725f65eSAlex Elder {
5554e28fff26SAlex Elder 	size_t len;
5555859c31dfSAlex Elder 	char *options;
55560ddebc0cSAlex Elder 	const char *mon_addrs;
5557ecb4dc22SAlex Elder 	char *snap_name;
55580ddebc0cSAlex Elder 	size_t mon_addrs_size;
5559859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
55604e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5561859c31dfSAlex Elder 	struct ceph_options *copts;
5562dc79b113SAlex Elder 	int ret;
5563e28fff26SAlex Elder 
5564e28fff26SAlex Elder 	/* The first four tokens are required */
5565e28fff26SAlex Elder 
55667ef3214aSAlex Elder 	len = next_token(&buf);
55674fb5d671SAlex Elder 	if (!len) {
55684fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
55694fb5d671SAlex Elder 		return -EINVAL;
55704fb5d671SAlex Elder 	}
55710ddebc0cSAlex Elder 	mon_addrs = buf;
5572f28e565aSAlex Elder 	mon_addrs_size = len + 1;
55737ef3214aSAlex Elder 	buf += len;
5574a725f65eSAlex Elder 
5575dc79b113SAlex Elder 	ret = -EINVAL;
5576f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5577f28e565aSAlex Elder 	if (!options)
5578dc79b113SAlex Elder 		return -ENOMEM;
55794fb5d671SAlex Elder 	if (!*options) {
55804fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
55814fb5d671SAlex Elder 		goto out_err;
55824fb5d671SAlex Elder 	}
5583a725f65eSAlex Elder 
5584859c31dfSAlex Elder 	spec = rbd_spec_alloc();
5585859c31dfSAlex Elder 	if (!spec)
5586f28e565aSAlex Elder 		goto out_mem;
5587859c31dfSAlex Elder 
5588859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
5589859c31dfSAlex Elder 	if (!spec->pool_name)
5590859c31dfSAlex Elder 		goto out_mem;
55914fb5d671SAlex Elder 	if (!*spec->pool_name) {
55924fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
55934fb5d671SAlex Elder 		goto out_err;
55944fb5d671SAlex Elder 	}
5595e28fff26SAlex Elder 
559669e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
5597859c31dfSAlex Elder 	if (!spec->image_name)
5598f28e565aSAlex Elder 		goto out_mem;
55994fb5d671SAlex Elder 	if (!*spec->image_name) {
56004fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
56014fb5d671SAlex Elder 		goto out_err;
56024fb5d671SAlex Elder 	}
5603e28fff26SAlex Elder 
5604f28e565aSAlex Elder 	/*
5605f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5606f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5607f28e565aSAlex Elder 	 */
56083feeb894SAlex Elder 	len = next_token(&buf);
5609820a5f3eSAlex Elder 	if (!len) {
56103feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
56113feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5612f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5613dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5614f28e565aSAlex Elder 		goto out_err;
5615849b4260SAlex Elder 	}
5616ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5617ecb4dc22SAlex Elder 	if (!snap_name)
5618f28e565aSAlex Elder 		goto out_mem;
5619ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5620ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
5621e5c35534SAlex Elder 
56220ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5623e28fff26SAlex Elder 
56244e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
56254e9afebaSAlex Elder 	if (!rbd_opts)
56264e9afebaSAlex Elder 		goto out_mem;
56274e9afebaSAlex Elder 
56284e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5629b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
563080de1912SIlya Dryomov 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5631d22f76e7SAlex Elder 
5632859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
56330ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
56344e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
5635859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5636859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5637dc79b113SAlex Elder 		goto out_err;
5638dc79b113SAlex Elder 	}
5639859c31dfSAlex Elder 	kfree(options);
5640859c31dfSAlex Elder 
5641859c31dfSAlex Elder 	*ceph_opts = copts;
56424e9afebaSAlex Elder 	*opts = rbd_opts;
5643859c31dfSAlex Elder 	*rbd_spec = spec;
56440ddebc0cSAlex Elder 
5645dc79b113SAlex Elder 	return 0;
5646f28e565aSAlex Elder out_mem:
5647dc79b113SAlex Elder 	ret = -ENOMEM;
5648d22f76e7SAlex Elder out_err:
5649859c31dfSAlex Elder 	kfree(rbd_opts);
5650859c31dfSAlex Elder 	rbd_spec_put(spec);
5651f28e565aSAlex Elder 	kfree(options);
5652d22f76e7SAlex Elder 
5653dc79b113SAlex Elder 	return ret;
5654a725f65eSAlex Elder }
5655a725f65eSAlex Elder 
5656589d30e0SAlex Elder /*
565730ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
565830ba1f02SIlya Dryomov  */
565930ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
566030ba1f02SIlya Dryomov {
5661a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
566230ba1f02SIlya Dryomov 	u64 newest_epoch;
566330ba1f02SIlya Dryomov 	int tries = 0;
566430ba1f02SIlya Dryomov 	int ret;
566530ba1f02SIlya Dryomov 
566630ba1f02SIlya Dryomov again:
566730ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
566830ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
5669d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
567030ba1f02SIlya Dryomov 					    &newest_epoch);
567130ba1f02SIlya Dryomov 		if (ret < 0)
567230ba1f02SIlya Dryomov 			return ret;
567330ba1f02SIlya Dryomov 
567430ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
56757cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
567630ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5677a319bf56SIlya Dryomov 						     newest_epoch,
5678a319bf56SIlya Dryomov 						     opts->mount_timeout);
567930ba1f02SIlya Dryomov 			goto again;
568030ba1f02SIlya Dryomov 		} else {
568130ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
568230ba1f02SIlya Dryomov 			return -ENOENT;
568330ba1f02SIlya Dryomov 		}
568430ba1f02SIlya Dryomov 	}
568530ba1f02SIlya Dryomov 
568630ba1f02SIlya Dryomov 	return ret;
568730ba1f02SIlya Dryomov }
568830ba1f02SIlya Dryomov 
568930ba1f02SIlya Dryomov /*
5690589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5691589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5692589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5693589d30e0SAlex Elder  *
5694589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5695589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5696589d30e0SAlex Elder  * with the supplied name.
5697589d30e0SAlex Elder  *
5698589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5699589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5700589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5701589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5702589d30e0SAlex Elder  */
5703589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5704589d30e0SAlex Elder {
5705589d30e0SAlex Elder 	int ret;
5706589d30e0SAlex Elder 	size_t size;
5707ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5708589d30e0SAlex Elder 	void *response;
5709c0fba368SAlex Elder 	char *image_id;
57102f82ee54SAlex Elder 
5711589d30e0SAlex Elder 	/*
57122c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
57132c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5714c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5715c0fba368SAlex Elder 	 * do still need to set the image format though.
57162c0d0a10SAlex Elder 	 */
5717c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5718c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5719c0fba368SAlex Elder 
57202c0d0a10SAlex Elder 		return 0;
5721c0fba368SAlex Elder 	}
57222c0d0a10SAlex Elder 
57232c0d0a10SAlex Elder 	/*
5724589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5725589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5726589d30e0SAlex Elder 	 */
5727ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5728ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5729ecd4a68aSIlya Dryomov 	if (ret)
5730ecd4a68aSIlya Dryomov 		return ret;
5731ecd4a68aSIlya Dryomov 
5732ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5733589d30e0SAlex Elder 
5734589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5735589d30e0SAlex Elder 
5736589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5737589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5738589d30e0SAlex Elder 	if (!response) {
5739589d30e0SAlex Elder 		ret = -ENOMEM;
5740589d30e0SAlex Elder 		goto out;
5741589d30e0SAlex Elder 	}
5742589d30e0SAlex Elder 
5743c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5744c0fba368SAlex Elder 
5745ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5746ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5747e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
574836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5749c0fba368SAlex Elder 	if (ret == -ENOENT) {
5750c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5751c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5752c0fba368SAlex Elder 		if (!ret)
5753c0fba368SAlex Elder 			rbd_dev->image_format = 1;
57547dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5755c0fba368SAlex Elder 		void *p = response;
5756589d30e0SAlex Elder 
5757c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5758979ed480SAlex Elder 						NULL, GFP_NOIO);
5759461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5760c0fba368SAlex Elder 		if (!ret)
5761c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5762c0fba368SAlex Elder 	}
5763c0fba368SAlex Elder 
5764c0fba368SAlex Elder 	if (!ret) {
5765c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5766c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5767589d30e0SAlex Elder 	}
5768589d30e0SAlex Elder out:
5769589d30e0SAlex Elder 	kfree(response);
5770ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5771589d30e0SAlex Elder 	return ret;
5772589d30e0SAlex Elder }
5773589d30e0SAlex Elder 
57743abef3b3SAlex Elder /*
57753abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
57763abef3b3SAlex Elder  * call.
57773abef3b3SAlex Elder  */
57786fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
57796fd48b3bSAlex Elder {
57806fd48b3bSAlex Elder 	struct rbd_image_header	*header;
57816fd48b3bSAlex Elder 
5782a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
57836fd48b3bSAlex Elder 
57846fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
57856fd48b3bSAlex Elder 
57866fd48b3bSAlex Elder 	header = &rbd_dev->header;
5787812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
57886fd48b3bSAlex Elder 	kfree(header->snap_sizes);
57896fd48b3bSAlex Elder 	kfree(header->snap_names);
57906fd48b3bSAlex Elder 	kfree(header->object_prefix);
57916fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
57926fd48b3bSAlex Elder }
57936fd48b3bSAlex Elder 
57942df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5795a30b71b9SAlex Elder {
5796a30b71b9SAlex Elder 	int ret;
5797a30b71b9SAlex Elder 
57981e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
579957385b51SAlex Elder 	if (ret)
58001e130199SAlex Elder 		goto out_err;
5801b1b5402aSAlex Elder 
58022df3fac7SAlex Elder 	/*
58032df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
58042df3fac7SAlex Elder 	 * features are assumed to never change.
58052df3fac7SAlex Elder 	 */
5806b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
580757385b51SAlex Elder 	if (ret)
5808b1b5402aSAlex Elder 		goto out_err;
580935d489f9SAlex Elder 
5810cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5811cc070d59SAlex Elder 
5812cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5813cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5814cc070d59SAlex Elder 		if (ret < 0)
5815cc070d59SAlex Elder 			goto out_err;
5816cc070d59SAlex Elder 	}
5817a30b71b9SAlex Elder 
58187e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
58197e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
58207e97332eSIlya Dryomov 		if (ret)
58217e97332eSIlya Dryomov 			goto out_err;
58227e97332eSIlya Dryomov 	}
58237e97332eSIlya Dryomov 
5824263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
582535152979SAlex Elder 	return 0;
5826263423f8SIlya Dryomov 
58279d475de5SAlex Elder out_err:
5828642a2537SAlex Elder 	rbd_dev->header.features = 0;
58291e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
58301e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
58319d475de5SAlex Elder 	return ret;
5832a30b71b9SAlex Elder }
5833a30b71b9SAlex Elder 
58346d69bb53SIlya Dryomov /*
58356d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
58366d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
58376d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
58386d69bb53SIlya Dryomov  */
58396d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
584083a06263SAlex Elder {
58412f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5842124afba2SAlex Elder 	int ret;
5843124afba2SAlex Elder 
5844124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5845124afba2SAlex Elder 		return 0;
5846124afba2SAlex Elder 
58476d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
58486d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
58496d69bb53SIlya Dryomov 		ret = -EINVAL;
58506d69bb53SIlya Dryomov 		goto out_err;
58516d69bb53SIlya Dryomov 	}
58526d69bb53SIlya Dryomov 
58531643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
58541f2c6651SIlya Dryomov 	if (!parent) {
5855124afba2SAlex Elder 		ret = -ENOMEM;
5856124afba2SAlex Elder 		goto out_err;
58571f2c6651SIlya Dryomov 	}
58581f2c6651SIlya Dryomov 
58591f2c6651SIlya Dryomov 	/*
58601f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
58611f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
58621f2c6651SIlya Dryomov 	 */
58631f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
58641f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5865124afba2SAlex Elder 
58666d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5867124afba2SAlex Elder 	if (ret < 0)
5868124afba2SAlex Elder 		goto out_err;
58691f2c6651SIlya Dryomov 
5870124afba2SAlex Elder 	rbd_dev->parent = parent;
5871a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5872124afba2SAlex Elder 	return 0;
5873124afba2SAlex Elder 
58741f2c6651SIlya Dryomov out_err:
58751f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
58761f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5877124afba2SAlex Elder 	return ret;
5878124afba2SAlex Elder }
5879124afba2SAlex Elder 
58805769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
58815769ed0cSIlya Dryomov {
58825769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
58835769ed0cSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
58845769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
58855769ed0cSIlya Dryomov 	if (!single_major)
58865769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
58875769ed0cSIlya Dryomov }
58885769ed0cSIlya Dryomov 
5889811c6688SIlya Dryomov /*
5890811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5891811c6688SIlya Dryomov  * upon return.
5892811c6688SIlya Dryomov  */
5893200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5894124afba2SAlex Elder {
589583a06263SAlex Elder 	int ret;
589683a06263SAlex Elder 
58979b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
589883a06263SAlex Elder 
58999b60e70bSIlya Dryomov 	if (!single_major) {
590083a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
590183a06263SAlex Elder 		if (ret < 0)
59021643dfa4SIlya Dryomov 			goto err_out_unlock;
59039b60e70bSIlya Dryomov 
590483a06263SAlex Elder 		rbd_dev->major = ret;
5905dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
59069b60e70bSIlya Dryomov 	} else {
59079b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
59089b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
59099b60e70bSIlya Dryomov 	}
591083a06263SAlex Elder 
591183a06263SAlex Elder 	/* Set up the blkdev mapping. */
591283a06263SAlex Elder 
591383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
591483a06263SAlex Elder 	if (ret)
591583a06263SAlex Elder 		goto err_out_blkdev;
591683a06263SAlex Elder 
5917f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
591883a06263SAlex Elder 	if (ret)
591983a06263SAlex Elder 		goto err_out_disk;
5920bc1ecc65SIlya Dryomov 
5921f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
592222001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5923f35a4deeSAlex Elder 
59245769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5925f35a4deeSAlex Elder 	if (ret)
5926f5ee37bdSIlya Dryomov 		goto err_out_mapping;
592783a06263SAlex Elder 
5928129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5929811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
59305769ed0cSIlya Dryomov 	return 0;
59312f82ee54SAlex Elder 
5932f35a4deeSAlex Elder err_out_mapping:
5933f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
593483a06263SAlex Elder err_out_disk:
593583a06263SAlex Elder 	rbd_free_disk(rbd_dev);
593683a06263SAlex Elder err_out_blkdev:
59379b60e70bSIlya Dryomov 	if (!single_major)
593883a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5939811c6688SIlya Dryomov err_out_unlock:
5940811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
594183a06263SAlex Elder 	return ret;
594283a06263SAlex Elder }
594383a06263SAlex Elder 
5944332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5945332bb12dSAlex Elder {
5946332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5947c41d13a3SIlya Dryomov 	int ret;
5948332bb12dSAlex Elder 
5949332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5950332bb12dSAlex Elder 
5951332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5952332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5953c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5954332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5955332bb12dSAlex Elder 	else
5956c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5957332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5958c41d13a3SIlya Dryomov 
5959c41d13a3SIlya Dryomov 	return ret;
5960332bb12dSAlex Elder }
5961332bb12dSAlex Elder 
5962200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5963200a6a8bSAlex Elder {
59646fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5965fd22aef8SIlya Dryomov 	if (rbd_dev->opts)
5966fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
59676fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
59686fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
59696fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
5970200a6a8bSAlex Elder }
5971200a6a8bSAlex Elder 
5972a30b71b9SAlex Elder /*
5973a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
59741f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
59751f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
59761f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5977a30b71b9SAlex Elder  */
59786d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5979a30b71b9SAlex Elder {
5980a30b71b9SAlex Elder 	int ret;
5981a30b71b9SAlex Elder 
5982a30b71b9SAlex Elder 	/*
59833abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
59843abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
59853abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
59863abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5987a30b71b9SAlex Elder 	 */
5988a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5989a30b71b9SAlex Elder 	if (ret)
5990c0fba368SAlex Elder 		return ret;
5991c0fba368SAlex Elder 
5992332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5993332bb12dSAlex Elder 	if (ret)
5994332bb12dSAlex Elder 		goto err_out_format;
5995332bb12dSAlex Elder 
59966d69bb53SIlya Dryomov 	if (!depth) {
599799d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
59981fe48023SIlya Dryomov 		if (ret) {
59991fe48023SIlya Dryomov 			if (ret == -ENOENT)
60001fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
60011fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
60021fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6003c41d13a3SIlya Dryomov 			goto err_out_format;
60041f3ef788SAlex Elder 		}
60051fe48023SIlya Dryomov 	}
6006b644de2bSAlex Elder 
6007a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
60085655c4d9SAlex Elder 	if (ret)
6009b644de2bSAlex Elder 		goto err_out_watch;
6010a30b71b9SAlex Elder 
601104077599SIlya Dryomov 	/*
601204077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
601304077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
601404077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
601504077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
601604077599SIlya Dryomov 	 */
60176d69bb53SIlya Dryomov 	if (!depth)
601804077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
601904077599SIlya Dryomov 	else
602004077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
60211fe48023SIlya Dryomov 	if (ret) {
60221fe48023SIlya Dryomov 		if (ret == -ENOENT)
60231fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
60241fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
60251fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
60261fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
602733dca39fSAlex Elder 		goto err_out_probe;
60281fe48023SIlya Dryomov 	}
60299bb81c9bSAlex Elder 
6030e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6031e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6032e8f59b59SIlya Dryomov 		if (ret)
6033e8f59b59SIlya Dryomov 			goto err_out_probe;
6034e8f59b59SIlya Dryomov 
6035e8f59b59SIlya Dryomov 		/*
6036e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
6037e8f59b59SIlya Dryomov 		 * mapped and has a parent.
6038e8f59b59SIlya Dryomov 		 */
60396d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
6040e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
6041e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
6042e8f59b59SIlya Dryomov 	}
6043e8f59b59SIlya Dryomov 
60446d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
604530d60ba2SAlex Elder 	if (ret)
604630d60ba2SAlex Elder 		goto err_out_probe;
604783a06263SAlex Elder 
604830d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6049c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
605030d60ba2SAlex Elder 	return 0;
6051e8f59b59SIlya Dryomov 
60526fd48b3bSAlex Elder err_out_probe:
60536fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6054b644de2bSAlex Elder err_out_watch:
60556d69bb53SIlya Dryomov 	if (!depth)
605699d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6057332bb12dSAlex Elder err_out_format:
6058332bb12dSAlex Elder 	rbd_dev->image_format = 0;
60595655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
60605655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
60615655c4d9SAlex Elder 	return ret;
606283a06263SAlex Elder }
606383a06263SAlex Elder 
60649b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
606559c2be1eSYehuda Sadeh 			  const char *buf,
606659c2be1eSYehuda Sadeh 			  size_t count)
6067602adf40SYehuda Sadeh {
6068cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6069dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
60704e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6071859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
60729d3997fdSAlex Elder 	struct rbd_client *rbdc;
607351344a38SAlex Elder 	bool read_only;
6074b51c83c2SIlya Dryomov 	int rc;
6075602adf40SYehuda Sadeh 
6076602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6077602adf40SYehuda Sadeh 		return -ENODEV;
6078602adf40SYehuda Sadeh 
6079a725f65eSAlex Elder 	/* parse add command */
6080859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6081dc79b113SAlex Elder 	if (rc < 0)
6082dd5ac32dSIlya Dryomov 		goto out;
6083a725f65eSAlex Elder 
60849d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
60859d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
60869d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
60870ddebc0cSAlex Elder 		goto err_out_args;
60889d3997fdSAlex Elder 	}
6089602adf40SYehuda Sadeh 
6090602adf40SYehuda Sadeh 	/* pick the pool */
609130ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
60921fe48023SIlya Dryomov 	if (rc < 0) {
60931fe48023SIlya Dryomov 		if (rc == -ENOENT)
60941fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6095602adf40SYehuda Sadeh 		goto err_out_client;
60961fe48023SIlya Dryomov 	}
6097859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6098859c31dfSAlex Elder 
6099d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6100b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6101b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6102bd4ba655SAlex Elder 		goto err_out_client;
6103b51c83c2SIlya Dryomov 	}
6104c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6105c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6106d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6107602adf40SYehuda Sadeh 
61080d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
61090d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
61100d6d1e9cSMike Christie 		rc = -ENOMEM;
61110d6d1e9cSMike Christie 		goto err_out_rbd_dev;
61120d6d1e9cSMike Christie 	}
61130d6d1e9cSMike Christie 
6114811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
61156d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
61160d6d1e9cSMike Christie 	if (rc < 0) {
61170d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6118c53d5893SAlex Elder 		goto err_out_rbd_dev;
61190d6d1e9cSMike Christie 	}
612005fd6f6fSAlex Elder 
61217ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
61227ce4eef7SAlex Elder 
6123d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
61247ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
61257ce4eef7SAlex Elder 		read_only = true;
61267ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
61277ce4eef7SAlex Elder 
6128b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
6129fd22aef8SIlya Dryomov 	if (rc)
61308b679ec5SIlya Dryomov 		goto err_out_image_probe;
61313abef3b3SAlex Elder 
61325769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
61335769ed0cSIlya Dryomov 
61345769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
61355769ed0cSIlya Dryomov 	if (rc)
61365769ed0cSIlya Dryomov 		goto err_out_device_setup;
61375769ed0cSIlya Dryomov 
61385769ed0cSIlya Dryomov 	add_disk(rbd_dev->disk);
61395769ed0cSIlya Dryomov 	/* see rbd_init_disk() */
61405769ed0cSIlya Dryomov 	blk_put_queue(rbd_dev->disk->queue);
61415769ed0cSIlya Dryomov 
61425769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
61435769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
61445769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
61455769ed0cSIlya Dryomov 
61465769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
61475769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
61485769ed0cSIlya Dryomov 		rbd_dev->header.features);
6149dd5ac32dSIlya Dryomov 	rc = count;
6150dd5ac32dSIlya Dryomov out:
6151dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6152dd5ac32dSIlya Dryomov 	return rc;
6153b536f69aSAlex Elder 
61545769ed0cSIlya Dryomov err_out_device_setup:
61555769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
61568b679ec5SIlya Dryomov err_out_image_probe:
61578b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
6158c53d5893SAlex Elder err_out_rbd_dev:
6159c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6160bd4ba655SAlex Elder err_out_client:
61619d3997fdSAlex Elder 	rbd_put_client(rbdc);
61620ddebc0cSAlex Elder err_out_args:
6163859c31dfSAlex Elder 	rbd_spec_put(spec);
6164d147543dSIlya Dryomov 	kfree(rbd_opts);
6165dd5ac32dSIlya Dryomov 	goto out;
6166602adf40SYehuda Sadeh }
6167602adf40SYehuda Sadeh 
61689b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
61699b60e70bSIlya Dryomov 		       const char *buf,
61709b60e70bSIlya Dryomov 		       size_t count)
61719b60e70bSIlya Dryomov {
61729b60e70bSIlya Dryomov 	if (single_major)
61739b60e70bSIlya Dryomov 		return -EINVAL;
61749b60e70bSIlya Dryomov 
61759b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61769b60e70bSIlya Dryomov }
61779b60e70bSIlya Dryomov 
61789b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
61799b60e70bSIlya Dryomov 				    const char *buf,
61809b60e70bSIlya Dryomov 				    size_t count)
61819b60e70bSIlya Dryomov {
61829b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61839b60e70bSIlya Dryomov }
61849b60e70bSIlya Dryomov 
618505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
618605a46afdSAlex Elder {
6187ad945fc1SAlex Elder 	while (rbd_dev->parent) {
618805a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
618905a46afdSAlex Elder 		struct rbd_device *second = first->parent;
619005a46afdSAlex Elder 		struct rbd_device *third;
619105a46afdSAlex Elder 
619205a46afdSAlex Elder 		/*
619305a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
619405a46afdSAlex Elder 		 * remove it.
619505a46afdSAlex Elder 		 */
619605a46afdSAlex Elder 		while (second && (third = second->parent)) {
619705a46afdSAlex Elder 			first = second;
619805a46afdSAlex Elder 			second = third;
619905a46afdSAlex Elder 		}
6200ad945fc1SAlex Elder 		rbd_assert(second);
62018ad42cd0SAlex Elder 		rbd_dev_image_release(second);
62028b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
6203ad945fc1SAlex Elder 		first->parent = NULL;
6204ad945fc1SAlex Elder 		first->parent_overlap = 0;
6205ad945fc1SAlex Elder 
6206ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
620705a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
620805a46afdSAlex Elder 		first->parent_spec = NULL;
620905a46afdSAlex Elder 	}
621005a46afdSAlex Elder }
621105a46afdSAlex Elder 
62129b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6213602adf40SYehuda Sadeh 			     const char *buf,
6214602adf40SYehuda Sadeh 			     size_t count)
6215602adf40SYehuda Sadeh {
6216602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6217751cc0e3SAlex Elder 	struct list_head *tmp;
6218751cc0e3SAlex Elder 	int dev_id;
62190276dca6SMike Christie 	char opt_buf[6];
622082a442d2SAlex Elder 	bool already = false;
62210276dca6SMike Christie 	bool force = false;
62220d8189e1SAlex Elder 	int ret;
6223602adf40SYehuda Sadeh 
62240276dca6SMike Christie 	dev_id = -1;
62250276dca6SMike Christie 	opt_buf[0] = '\0';
62260276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
62270276dca6SMike Christie 	if (dev_id < 0) {
62280276dca6SMike Christie 		pr_err("dev_id out of range\n");
6229602adf40SYehuda Sadeh 		return -EINVAL;
62300276dca6SMike Christie 	}
62310276dca6SMike Christie 	if (opt_buf[0] != '\0') {
62320276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
62330276dca6SMike Christie 			force = true;
62340276dca6SMike Christie 		} else {
62350276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
62360276dca6SMike Christie 			return -EINVAL;
62370276dca6SMike Christie 		}
62380276dca6SMike Christie 	}
6239602adf40SYehuda Sadeh 
6240602adf40SYehuda Sadeh 	ret = -ENOENT;
6241751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6242751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6243751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6244751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6245751cc0e3SAlex Elder 			ret = 0;
6246751cc0e3SAlex Elder 			break;
6247602adf40SYehuda Sadeh 		}
6248751cc0e3SAlex Elder 	}
6249751cc0e3SAlex Elder 	if (!ret) {
6250a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
62510276dca6SMike Christie 		if (rbd_dev->open_count && !force)
625242382b70SAlex Elder 			ret = -EBUSY;
6253b82d167bSAlex Elder 		else
625482a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
625582a442d2SAlex Elder 							&rbd_dev->flags);
6256a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6257751cc0e3SAlex Elder 	}
6258751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
625982a442d2SAlex Elder 	if (ret < 0 || already)
62601ba0f1e7SAlex Elder 		return ret;
6261751cc0e3SAlex Elder 
62620276dca6SMike Christie 	if (force) {
62630276dca6SMike Christie 		/*
62640276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
62650276dca6SMike Christie 		 * IO to complete/fail.
62660276dca6SMike Christie 		 */
62670276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
62680276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
62690276dca6SMike Christie 	}
62700276dca6SMike Christie 
62715769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
62725769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62735769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
62745769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62755769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
62765769ed0cSIlya Dryomov 
6277ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6278ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6279ed95b21aSIlya Dryomov 		rbd_unlock(rbd_dev);
6280ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
6281fca27065SIlya Dryomov 
6282dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
62838ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
62848b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
62851ba0f1e7SAlex Elder 	return count;
6286602adf40SYehuda Sadeh }
6287602adf40SYehuda Sadeh 
62889b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
62899b60e70bSIlya Dryomov 			  const char *buf,
62909b60e70bSIlya Dryomov 			  size_t count)
62919b60e70bSIlya Dryomov {
62929b60e70bSIlya Dryomov 	if (single_major)
62939b60e70bSIlya Dryomov 		return -EINVAL;
62949b60e70bSIlya Dryomov 
62959b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
62969b60e70bSIlya Dryomov }
62979b60e70bSIlya Dryomov 
/*
 * sysfs "remove" handler for the single-major case: always forwards to
 * do_rbd_remove(), with no check of single_major.
 */
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}
63049b60e70bSIlya Dryomov 
6305602adf40SYehuda Sadeh /*
6306602adf40SYehuda Sadeh  * create control files in sysfs
6307dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6308602adf40SYehuda Sadeh  */
6309602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
6310602adf40SYehuda Sadeh {
6311dfc5606dSYehuda Sadeh 	int ret;
6312602adf40SYehuda Sadeh 
6313fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6314dfc5606dSYehuda Sadeh 	if (ret < 0)
6315dfc5606dSYehuda Sadeh 		return ret;
6316602adf40SYehuda Sadeh 
6317fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6318fed4c143SAlex Elder 	if (ret < 0)
6319fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6320602adf40SYehuda Sadeh 
6321602adf40SYehuda Sadeh 	return ret;
6322602adf40SYehuda Sadeh }
6323602adf40SYehuda Sadeh 
6324602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
6325602adf40SYehuda Sadeh {
6326dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6327fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6328602adf40SYehuda Sadeh }
6329602adf40SYehuda Sadeh 
63301c2a9dfeSAlex Elder static int rbd_slab_init(void)
63311c2a9dfeSAlex Elder {
63321c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
633303d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6334868311b1SAlex Elder 	if (!rbd_img_request_cache)
6335868311b1SAlex Elder 		return -ENOMEM;
6336868311b1SAlex Elder 
6337868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
633803d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
633978c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
634078c2a44aSAlex Elder 		goto out_err;
634178c2a44aSAlex Elder 
63421c2a9dfeSAlex Elder 	return 0;
63431c2a9dfeSAlex Elder 
63446c696d85SIlya Dryomov out_err:
6345868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6346868311b1SAlex Elder 	rbd_img_request_cache = NULL;
63471c2a9dfeSAlex Elder 	return -ENOMEM;
63481c2a9dfeSAlex Elder }
63491c2a9dfeSAlex Elder 
63501c2a9dfeSAlex Elder static void rbd_slab_exit(void)
63511c2a9dfeSAlex Elder {
6352868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6353868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6354868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6355868311b1SAlex Elder 
63561c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
63571c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
63581c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
63591c2a9dfeSAlex Elder }
63601c2a9dfeSAlex Elder 
6361cc344fa1SAlex Elder static int __init rbd_init(void)
6362602adf40SYehuda Sadeh {
6363602adf40SYehuda Sadeh 	int rc;
6364602adf40SYehuda Sadeh 
63651e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
63661e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
63671e32d34cSAlex Elder 		return -EINVAL;
63681e32d34cSAlex Elder 	}
6369e1b4d96dSIlya Dryomov 
63701c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6371602adf40SYehuda Sadeh 	if (rc)
6372602adf40SYehuda Sadeh 		return rc;
6373e1b4d96dSIlya Dryomov 
6374f5ee37bdSIlya Dryomov 	/*
6375f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6376f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6377f5ee37bdSIlya Dryomov 	 */
6378f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6379f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6380f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6381f5ee37bdSIlya Dryomov 		goto err_out_slab;
6382f5ee37bdSIlya Dryomov 	}
6383f5ee37bdSIlya Dryomov 
63849b60e70bSIlya Dryomov 	if (single_major) {
63859b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
63869b60e70bSIlya Dryomov 		if (rbd_major < 0) {
63879b60e70bSIlya Dryomov 			rc = rbd_major;
6388f5ee37bdSIlya Dryomov 			goto err_out_wq;
63899b60e70bSIlya Dryomov 		}
63909b60e70bSIlya Dryomov 	}
63919b60e70bSIlya Dryomov 
63921c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
63931c2a9dfeSAlex Elder 	if (rc)
63949b60e70bSIlya Dryomov 		goto err_out_blkdev;
63951c2a9dfeSAlex Elder 
63969b60e70bSIlya Dryomov 	if (single_major)
63979b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
63989b60e70bSIlya Dryomov 	else
6399e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
64009b60e70bSIlya Dryomov 
6401e1b4d96dSIlya Dryomov 	return 0;
6402e1b4d96dSIlya Dryomov 
64039b60e70bSIlya Dryomov err_out_blkdev:
64049b60e70bSIlya Dryomov 	if (single_major)
64059b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6406f5ee37bdSIlya Dryomov err_out_wq:
6407f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6408e1b4d96dSIlya Dryomov err_out_slab:
6409e1b4d96dSIlya Dryomov 	rbd_slab_exit();
64101c2a9dfeSAlex Elder 	return rc;
6411602adf40SYehuda Sadeh }
6412602adf40SYehuda Sadeh 
6413cc344fa1SAlex Elder static void __exit rbd_exit(void)
6414602adf40SYehuda Sadeh {
6415ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6416602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
64179b60e70bSIlya Dryomov 	if (single_major)
64189b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6419f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
64201c2a9dfeSAlex Elder 	rbd_slab_exit();
6421602adf40SYehuda Sadeh }
6422602adf40SYehuda Sadeh 
6423602adf40SYehuda Sadeh module_init(rbd_init);
6424602adf40SYehuda Sadeh module_exit(rbd_exit);
6425602adf40SYehuda Sadeh 
6426d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6427602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6428602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6429602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6430602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6431602adf40SYehuda Sadeh 
643290da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6433602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6434