xref: /openbmc/linux/drivers/block/rbd.c (revision 67e2b652)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
35602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3659c2be1eSYehuda Sadeh #include <linux/parser.h>
3730d1cff8SAlex Elder #include <linux/bsearch.h>
38602adf40SYehuda Sadeh 
39602adf40SYehuda Sadeh #include <linux/kernel.h>
40602adf40SYehuda Sadeh #include <linux/device.h>
41602adf40SYehuda Sadeh #include <linux/module.h>
427ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
43602adf40SYehuda Sadeh #include <linux/fs.h>
44602adf40SYehuda Sadeh #include <linux/blkdev.h>
451c2a9dfeSAlex Elder #include <linux/slab.h>
46f8a22fc2SIlya Dryomov #include <linux/idr.h>
47bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
48602adf40SYehuda Sadeh 
49602adf40SYehuda Sadeh #include "rbd_types.h"
50602adf40SYehuda Sadeh 
51aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
52aafb230eSAlex Elder 
53593a9e7bSAlex Elder /*
54593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
55593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
56593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
57593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
58593a9e7bSAlex Elder  */
59593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
60593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
61593a9e7bSAlex Elder 
62a2acd00eSAlex Elder /*
63a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
64a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
65a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
66a2acd00eSAlex Elder  * -EINVAL without updating it.
67a2acd00eSAlex Elder  */
68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
69a2acd00eSAlex Elder {
70a2acd00eSAlex Elder 	unsigned int counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
73a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
74a2acd00eSAlex Elder 		return (int)counter;
75a2acd00eSAlex Elder 
76a2acd00eSAlex Elder 	atomic_dec(v);
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	return -EINVAL;
79a2acd00eSAlex Elder }
80a2acd00eSAlex Elder 
81a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
83a2acd00eSAlex Elder {
84a2acd00eSAlex Elder 	int counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
87a2acd00eSAlex Elder 	if (counter >= 0)
88a2acd00eSAlex Elder 		return counter;
89a2acd00eSAlex Elder 
90a2acd00eSAlex Elder 	atomic_inc(v);
91a2acd00eSAlex Elder 
92a2acd00eSAlex Elder 	return -EINVAL;
93a2acd00eSAlex Elder }
94a2acd00eSAlex Elder 
95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
96602adf40SYehuda Sadeh 
977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR		256
987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT	4
99602adf40SYehuda Sadeh 
1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN	16
1016d69bb53SIlya Dryomov 
102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
104d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
105d4b125e9SAlex Elder 
10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
107602adf40SYehuda Sadeh 
108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1119682fc6dSAlex Elder 
1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1159e15b77dSAlex Elder 
1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
117589d30e0SAlex Elder 
118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
11999d16943SIlya Dryomov #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
12099d16943SIlya Dryomov 
121d889140cSAlex Elder /* Feature bits */
122d889140cSAlex Elder 
1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
1267e97332eSIlya Dryomov #define RBD_FEATURE_DATA_POOL (1<<7)
127ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
128ed95b21aSIlya Dryomov 				 RBD_FEATURE_STRIPINGV2 |	\
1297e97332eSIlya Dryomov 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
1307e97332eSIlya Dryomov 				 RBD_FEATURE_DATA_POOL)
131d889140cSAlex Elder 
132d889140cSAlex Elder /* Features supported by this (client software) implementation. */
133d889140cSAlex Elder 
134770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
135d889140cSAlex Elder 
13681a89793SAlex Elder /*
13781a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
13881a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
13981a89793SAlex Elder  */
140602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
141602adf40SYehuda Sadeh 
142602adf40SYehuda Sadeh /*
143602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
144602adf40SYehuda Sadeh  */
145602adf40SYehuda Sadeh struct rbd_image_header {
146f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
147849b4260SAlex Elder 	char *object_prefix;
148602adf40SYehuda Sadeh 	__u8 obj_order;
149f35a4deeSAlex Elder 	u64 stripe_unit;
150f35a4deeSAlex Elder 	u64 stripe_count;
1517e97332eSIlya Dryomov 	s64 data_pool_id;
152f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
153602adf40SYehuda Sadeh 
154f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
155f84344f3SAlex Elder 	u64 image_size;
156f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
157f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
158f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15959c2be1eSYehuda Sadeh };
16059c2be1eSYehuda Sadeh 
1610d7dbfceSAlex Elder /*
1620d7dbfceSAlex Elder  * An rbd image specification.
1630d7dbfceSAlex Elder  *
1640d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
165c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
166c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
167c66c6e0cSAlex Elder  *
168c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
169c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
170c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
171c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
172c66c6e0cSAlex Elder  *
173c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
174c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
175c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
176c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
177c66c6e0cSAlex Elder  * is shared between the parent and child).
178c66c6e0cSAlex Elder  *
179c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
180c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
181c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
182c66c6e0cSAlex Elder  *
183c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
184c66c6e0cSAlex Elder  * could be a null pointer).
1850d7dbfceSAlex Elder  */
1860d7dbfceSAlex Elder struct rbd_spec {
1870d7dbfceSAlex Elder 	u64		pool_id;
188ecb4dc22SAlex Elder 	const char	*pool_name;
1890d7dbfceSAlex Elder 
190ecb4dc22SAlex Elder 	const char	*image_id;
191ecb4dc22SAlex Elder 	const char	*image_name;
1920d7dbfceSAlex Elder 
1930d7dbfceSAlex Elder 	u64		snap_id;
194ecb4dc22SAlex Elder 	const char	*snap_name;
1950d7dbfceSAlex Elder 
1960d7dbfceSAlex Elder 	struct kref	kref;
1970d7dbfceSAlex Elder };
1980d7dbfceSAlex Elder 
199602adf40SYehuda Sadeh /*
200f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
201602adf40SYehuda Sadeh  */
202602adf40SYehuda Sadeh struct rbd_client {
203602adf40SYehuda Sadeh 	struct ceph_client	*client;
204602adf40SYehuda Sadeh 	struct kref		kref;
205602adf40SYehuda Sadeh 	struct list_head	node;
206602adf40SYehuda Sadeh };
207602adf40SYehuda Sadeh 
208bf0d5f50SAlex Elder struct rbd_img_request;
209bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
210bf0d5f50SAlex Elder 
211bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
212bf0d5f50SAlex Elder 
213bf0d5f50SAlex Elder struct rbd_obj_request;
214bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
215bf0d5f50SAlex Elder 
/* Kind of data attached to an object request (see rbd_obj_request.type) */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
219bf0d5f50SAlex Elder 
/* Operation types; numeric order (write, read, discard) is preserved */
enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2256d2940c8SGuangliang Zhao 
/* Per-object-request flags; each comment gives the 0 / 1 meaning */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* usage: 0 = standalone, 1 = image */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* target exists: 0 = no, 1 = yes */
};
232926f9b3fSAlex Elder 
233bf0d5f50SAlex Elder struct rbd_obj_request {
234bf0d5f50SAlex Elder 	const char		*object_name;
235bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
236bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
237926f9b3fSAlex Elder 	unsigned long		flags;
238bf0d5f50SAlex Elder 
239c5b5ef6cSAlex Elder 	/*
240c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
241c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
242c5b5ef6cSAlex Elder 	 *
243c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
244c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
245c5b5ef6cSAlex Elder 	 *
246c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
247c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
248c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
249c5b5ef6cSAlex Elder 	 *
250c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
251c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
252c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
253c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
254c5b5ef6cSAlex Elder 	 */
255c5b5ef6cSAlex Elder 	union {
256c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
257c5b5ef6cSAlex Elder 		struct {
258bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
259c5b5ef6cSAlex Elder 			u64			img_offset;
260c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
261c5b5ef6cSAlex Elder 			struct list_head	links;
262c5b5ef6cSAlex Elder 		};
263c5b5ef6cSAlex Elder 	};
264bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
265bf0d5f50SAlex Elder 
266bf0d5f50SAlex Elder 	enum obj_request_type	type;
267788e2df3SAlex Elder 	union {
268bf0d5f50SAlex Elder 		struct bio	*bio_list;
269788e2df3SAlex Elder 		struct {
270788e2df3SAlex Elder 			struct page	**pages;
271788e2df3SAlex Elder 			u32		page_count;
272788e2df3SAlex Elder 		};
273788e2df3SAlex Elder 	};
2740eefd470SAlex Elder 	struct page		**copyup_pages;
275ebda6408SAlex Elder 	u32			copyup_page_count;
276bf0d5f50SAlex Elder 
277bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
278bf0d5f50SAlex Elder 
279bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2801b83bef2SSage Weil 	int			result;
281bf0d5f50SAlex Elder 
282bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
283788e2df3SAlex Elder 	struct completion	completion;
284bf0d5f50SAlex Elder 
285bf0d5f50SAlex Elder 	struct kref		kref;
286bf0d5f50SAlex Elder };
287bf0d5f50SAlex Elder 
/* Per-image-request flags; each comment gives the 0 / 1 meaning */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
	IMG_REQ_DISCARD,	/* discard: 0 = normal, 1 = discard request */
};
2940c425248SAlex Elder 
295bf0d5f50SAlex Elder struct rbd_img_request {
296bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
297bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
298bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2990c425248SAlex Elder 	unsigned long		flags;
300bf0d5f50SAlex Elder 	union {
301bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3029849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3039849e986SAlex Elder 	};
3049849e986SAlex Elder 	union {
3059849e986SAlex Elder 		struct request		*rq;		/* block request */
3069849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
307bf0d5f50SAlex Elder 	};
3083d7efd18SAlex Elder 	struct page		**copyup_pages;
309ebda6408SAlex Elder 	u32			copyup_page_count;
310bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
311bf0d5f50SAlex Elder 	u32			next_completion;
312bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
31355f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
314a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
315bf0d5f50SAlex Elder 
316bf0d5f50SAlex Elder 	u32			obj_request_count;
317bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
318bf0d5f50SAlex Elder 
319bf0d5f50SAlex Elder 	struct kref		kref;
320bf0d5f50SAlex Elder };
321bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked into an image request's
 * obj_requests list through each object request's "links" member.
 *
 * Note that for_each_obj_request_safe() is built on
 * list_for_each_entry_safe_reverse(), i.e. it traverses the list in
 * reverse, using n as the temporary cursor.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
328bf0d5f50SAlex Elder 
/* Values stored in rbd_device.watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
33499d16943SIlya Dryomov 
/* Values stored in rbd_device.lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
340ed95b21aSIlya Dryomov 
341ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
342ed95b21aSIlya Dryomov struct rbd_client_id {
343ed95b21aSIlya Dryomov 	u64 gid;
344ed95b21aSIlya Dryomov 	u64 handle;
345ed95b21aSIlya Dryomov };
346ed95b21aSIlya Dryomov 
347f84344f3SAlex Elder struct rbd_mapping {
34899c1f08fSAlex Elder 	u64                     size;
34934b13184SAlex Elder 	u64                     features;
350f84344f3SAlex Elder 	bool			read_only;
351f84344f3SAlex Elder };
352f84344f3SAlex Elder 
353602adf40SYehuda Sadeh /*
354602adf40SYehuda Sadeh  * a single device
355602adf40SYehuda Sadeh  */
356602adf40SYehuda Sadeh struct rbd_device {
357de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
358602adf40SYehuda Sadeh 
359602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
360dd82fff1SIlya Dryomov 	int			minor;
361602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
362602adf40SYehuda Sadeh 
363a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
364602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
365602adf40SYehuda Sadeh 
366602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
367602adf40SYehuda Sadeh 
368b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
369602adf40SYehuda Sadeh 
370602adf40SYehuda Sadeh 	struct rbd_image_header	header;
371b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3720d7dbfceSAlex Elder 	struct rbd_spec		*spec;
373d147543dSIlya Dryomov 	struct rbd_options	*opts;
3740d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
375602adf40SYehuda Sadeh 
376c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
377922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
378971f839aSAlex Elder 
3791643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3800903e875SAlex Elder 
38199d16943SIlya Dryomov 	struct mutex		watch_mutex;
38299d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
383922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
38499d16943SIlya Dryomov 	u64			watch_cookie;
38599d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
38659c2be1eSYehuda Sadeh 
387ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
388ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
389ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
390ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
391ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
392ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
393ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
394ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
395ed95b21aSIlya Dryomov 
3961643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
397602adf40SYehuda Sadeh 
39886b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
39986b00e0dSAlex Elder 	u64			parent_overlap;
400a2acd00eSAlex Elder 	atomic_t		parent_ref;
4012f82ee54SAlex Elder 	struct rbd_device	*parent;
40286b00e0dSAlex Elder 
4037ad18afaSChristoph Hellwig 	/* Block layer tags. */
4047ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4057ad18afaSChristoph Hellwig 
406c666601aSJosh Durgin 	/* protects updating the header */
407c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
408f84344f3SAlex Elder 
409f84344f3SAlex Elder 	struct rbd_mapping	mapping;
410602adf40SYehuda Sadeh 
411602adf40SYehuda Sadeh 	struct list_head	node;
412dfc5606dSYehuda Sadeh 
413dfc5606dSYehuda Sadeh 	/* sysfs related */
414dfc5606dSYehuda Sadeh 	struct device		dev;
415b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
416dfc5606dSYehuda Sadeh };
417dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Locking rules:
 * - REMOVING, which goes hand in hand with rbd_dev->open_count, is
 *   protected by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
4296d292906SAlex Elder 
430cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
431e124a82fSAlex Elder 
432602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
433e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
434e124a82fSAlex Elder 
435602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
436432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
437602adf40SYehuda Sadeh 
43878c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
43978c2a44aSAlex Elder 
4401c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
441868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
44278c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
4431c2a9dfeSAlex Elder 
4449b60e70bSIlya Dryomov static int rbd_major;
445f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
446f8a22fc2SIlya Dryomov 
447f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
448f5ee37bdSIlya Dryomov 
4499b60e70bSIlya Dryomov /*
4509b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4519b60e70bSIlya Dryomov  * userspace rbd utility.
4529b60e70bSIlya Dryomov  */
4539b60e70bSIlya Dryomov static bool single_major = false;
4549b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4559b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4569b60e70bSIlya Dryomov 
4573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4583d7efd18SAlex Elder 
459f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
460f0f8cef5SAlex Elder 		       size_t count);
461f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
462f0f8cef5SAlex Elder 			  size_t count);
4639b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4649b60e70bSIlya Dryomov 				    size_t count);
4659b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4669b60e70bSIlya Dryomov 				       size_t count);
4676d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
468a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
469f0f8cef5SAlex Elder 
4709b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4719b60e70bSIlya Dryomov {
4727e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4739b60e70bSIlya Dryomov }
4749b60e70bSIlya Dryomov 
4759b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4769b60e70bSIlya Dryomov {
4777e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4789b60e70bSIlya Dryomov }
4799b60e70bSIlya Dryomov 
480ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
481ed95b21aSIlya Dryomov {
482ed95b21aSIlya Dryomov 	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
483ed95b21aSIlya Dryomov 	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
484ed95b21aSIlya Dryomov 	       !rbd_dev->mapping.read_only;
485ed95b21aSIlya Dryomov }
486ed95b21aSIlya Dryomov 
487ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
488ed95b21aSIlya Dryomov {
489ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
490ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
491ed95b21aSIlya Dryomov }
492ed95b21aSIlya Dryomov 
493ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
494ed95b21aSIlya Dryomov {
495ed95b21aSIlya Dryomov 	bool is_lock_owner;
496ed95b21aSIlya Dryomov 
497ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
498ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
499ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
500ed95b21aSIlya Dryomov 	return is_lock_owner;
501ed95b21aSIlya Dryomov }
502ed95b21aSIlya Dryomov 
503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
504b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
5059b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
5069b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
507b15a21ddSGreg Kroah-Hartman 
508b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
509b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
510b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5119b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5129b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
513b15a21ddSGreg Kroah-Hartman 	NULL,
514f0f8cef5SAlex Elder };
51592c76dc0SIlya Dryomov 
51692c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
51792c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
51892c76dc0SIlya Dryomov {
5199b60e70bSIlya Dryomov 	if (!single_major &&
5209b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5219b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5229b60e70bSIlya Dryomov 		return 0;
5239b60e70bSIlya Dryomov 
52492c76dc0SIlya Dryomov 	return attr->mode;
52592c76dc0SIlya Dryomov }
52692c76dc0SIlya Dryomov 
52792c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
52892c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
52992c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
53092c76dc0SIlya Dryomov };
53192c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
532f0f8cef5SAlex Elder 
533f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
534f0f8cef5SAlex Elder 	.name		= "rbd",
535b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
536f0f8cef5SAlex Elder };
537f0f8cef5SAlex Elder 
/* Release callback for the statically defined rbd_root_dev: nothing to do */
static void rbd_root_dev_release(struct device *dev)
{
}
541f0f8cef5SAlex Elder 
542f0f8cef5SAlex Elder static struct device rbd_root_dev = {
543f0f8cef5SAlex Elder 	.init_name =    "rbd",
544f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
545f0f8cef5SAlex Elder };
546f0f8cef5SAlex Elder 
54706ecc6cbSAlex Elder static __printf(2, 3)
54806ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
54906ecc6cbSAlex Elder {
55006ecc6cbSAlex Elder 	struct va_format vaf;
55106ecc6cbSAlex Elder 	va_list args;
55206ecc6cbSAlex Elder 
55306ecc6cbSAlex Elder 	va_start(args, fmt);
55406ecc6cbSAlex Elder 	vaf.fmt = fmt;
55506ecc6cbSAlex Elder 	vaf.va = &args;
55606ecc6cbSAlex Elder 
55706ecc6cbSAlex Elder 	if (!rbd_dev)
55806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
55906ecc6cbSAlex Elder 	else if (rbd_dev->disk)
56006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
56106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
56206ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
56306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
56406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
56506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
56606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
56706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
56806ecc6cbSAlex Elder 	else	/* punt */
56906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
57006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
57106ecc6cbSAlex Elder 	va_end(args);
57206ecc6cbSAlex Elder }
57306ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, a false expr logs the
 * failed expression together with the function name and line number
 * and then calls BUG().  Without RBD_DEBUG it expands to a no-op and
 * expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it is a single
 * statement: a bare "if { }" followed by the caller's semicolon breaks
 * when the macro is used as the body of an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
586dfc5606dSYehuda Sadeh 
5872761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
588b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
58905a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
59005a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5918b3e1a56SAlex Elder 
592cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5932df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
594a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
595e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
59654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
59754cac61fSAlex Elder 					u64 snap_id);
5982ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5992ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
6002ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
6012ad3d716SAlex Elder 		u64 *snap_features);
60259c2be1eSYehuda Sadeh 
603602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
604602adf40SYehuda Sadeh {
605f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
606b82d167bSAlex Elder 	bool removing = false;
607602adf40SYehuda Sadeh 
608f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
609602adf40SYehuda Sadeh 		return -EROFS;
610602adf40SYehuda Sadeh 
611a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
612b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
613b82d167bSAlex Elder 		removing = true;
614b82d167bSAlex Elder 	else
615b82d167bSAlex Elder 		rbd_dev->open_count++;
616a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
617b82d167bSAlex Elder 	if (removing)
618b82d167bSAlex Elder 		return -ENOENT;
619b82d167bSAlex Elder 
620c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
621340c7a2bSAlex Elder 
622602adf40SYehuda Sadeh 	return 0;
623602adf40SYehuda Sadeh }
624602adf40SYehuda Sadeh 
625db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
626dfc5606dSYehuda Sadeh {
627dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
628b82d167bSAlex Elder 	unsigned long open_count_before;
629b82d167bSAlex Elder 
630a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
631b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
632a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
633b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
634dfc5606dSYehuda Sadeh 
635c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
636dfc5606dSYehuda Sadeh }
637dfc5606dSYehuda Sadeh 
638131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
639131fd9f6SGuangliang Zhao {
64077f33c03SJosh Durgin 	int ret = 0;
641131fd9f6SGuangliang Zhao 	int val;
642131fd9f6SGuangliang Zhao 	bool ro;
64377f33c03SJosh Durgin 	bool ro_changed = false;
644131fd9f6SGuangliang Zhao 
64577f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
646131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
647131fd9f6SGuangliang Zhao 		return -EFAULT;
648131fd9f6SGuangliang Zhao 
649131fd9f6SGuangliang Zhao 	ro = val ? true : false;
650131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
651131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
652131fd9f6SGuangliang Zhao 		return -EROFS;
653131fd9f6SGuangliang Zhao 
65477f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
65577f33c03SJosh Durgin 	/* prevent others open this device */
65677f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
65777f33c03SJosh Durgin 		ret = -EBUSY;
65877f33c03SJosh Durgin 		goto out;
659131fd9f6SGuangliang Zhao 	}
660131fd9f6SGuangliang Zhao 
66177f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
66277f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
66377f33c03SJosh Durgin 		ro_changed = true;
66477f33c03SJosh Durgin 	}
66577f33c03SJosh Durgin 
66677f33c03SJosh Durgin out:
66777f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
66877f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
66977f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
67077f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
67177f33c03SJosh Durgin 
67277f33c03SJosh Durgin 	return ret;
673131fd9f6SGuangliang Zhao }
674131fd9f6SGuangliang Zhao 
675131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
676131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
677131fd9f6SGuangliang Zhao {
678131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
679131fd9f6SGuangliang Zhao 	int ret = 0;
680131fd9f6SGuangliang Zhao 
681131fd9f6SGuangliang Zhao 	switch (cmd) {
682131fd9f6SGuangliang Zhao 	case BLKROSET:
683131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
684131fd9f6SGuangliang Zhao 		break;
685131fd9f6SGuangliang Zhao 	default:
686131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
687131fd9f6SGuangliang Zhao 	}
688131fd9f6SGuangliang Zhao 
689131fd9f6SGuangliang Zhao 	return ret;
690131fd9f6SGuangliang Zhao }
691131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl method: hands every argument unchanged to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
699131fd9f6SGuangliang Zhao 
700602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
701602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
702602adf40SYehuda Sadeh 	.open			= rbd_open,
703dfc5606dSYehuda Sadeh 	.release		= rbd_release,
704131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
705131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
706131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
707131fd9f6SGuangliang Zhao #endif
708602adf40SYehuda Sadeh };
709602adf40SYehuda Sadeh 
710602adf40SYehuda Sadeh /*
7117262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
712cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
713602adf40SYehuda Sadeh  */
714f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
715602adf40SYehuda Sadeh {
716602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
717602adf40SYehuda Sadeh 	int ret = -ENOMEM;
718602adf40SYehuda Sadeh 
71937206ee5SAlex Elder 	dout("%s:\n", __func__);
720602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
721602adf40SYehuda Sadeh 	if (!rbdc)
722602adf40SYehuda Sadeh 		goto out_opt;
723602adf40SYehuda Sadeh 
724602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
725602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
726602adf40SYehuda Sadeh 
72743ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
728602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
72908f75463SAlex Elder 		goto out_rbdc;
73043ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
731602adf40SYehuda Sadeh 
732602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
733602adf40SYehuda Sadeh 	if (ret < 0)
73408f75463SAlex Elder 		goto out_client;
735602adf40SYehuda Sadeh 
736432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
737602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
738432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
739602adf40SYehuda Sadeh 
74037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
741bc534d86SAlex Elder 
742602adf40SYehuda Sadeh 	return rbdc;
74308f75463SAlex Elder out_client:
744602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
74508f75463SAlex Elder out_rbdc:
746602adf40SYehuda Sadeh 	kfree(rbdc);
747602adf40SYehuda Sadeh out_opt:
74843ae4701SAlex Elder 	if (ceph_opts)
74943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
75037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
75137206ee5SAlex Elder 
75228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
753602adf40SYehuda Sadeh }
754602adf40SYehuda Sadeh 
7552f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7562f82ee54SAlex Elder {
7572f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7582f82ee54SAlex Elder 
7592f82ee54SAlex Elder 	return rbdc;
7602f82ee54SAlex Elder }
7612f82ee54SAlex Elder 
762602adf40SYehuda Sadeh /*
7631f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7641f7ba331SAlex Elder  * found, bump its reference count.
765602adf40SYehuda Sadeh  */
7661f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
767602adf40SYehuda Sadeh {
768602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7691f7ba331SAlex Elder 	bool found = false;
770602adf40SYehuda Sadeh 
77143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
772602adf40SYehuda Sadeh 		return NULL;
773602adf40SYehuda Sadeh 
7741f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7751f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7761f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7772f82ee54SAlex Elder 			__rbd_get_client(client_node);
7782f82ee54SAlex Elder 
7791f7ba331SAlex Elder 			found = true;
7801f7ba331SAlex Elder 			break;
7811f7ba331SAlex Elder 		}
7821f7ba331SAlex Elder 	}
7831f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7841f7ba331SAlex Elder 
7851f7ba331SAlex Elder 	return found ? client_node : NULL;
786602adf40SYehuda Sadeh }
787602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Tokens are grouped by argument type; parse_rbd_opts_token() uses the
 * Opt_last_int and Opt_last_string markers to tell the groups apart,
 * so the relative order of the entries matters.
 */
enum {
	/* tokens taking an int argument */
	Opt_queue_depth,
	Opt_last_int,
	/* tokens taking a string argument (none defined) */
	Opt_last_string,
	/* tokens taking no argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_err
};
80259c2be1eSYehuda Sadeh 
80343ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
804b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
80559c2be1eSYehuda Sadeh 	/* int args above */
80659c2be1eSYehuda Sadeh 	/* string args above */
807be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
808cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
809cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
810cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
81180de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
812210c104cSIlya Dryomov 	{Opt_err, NULL}
81359c2be1eSYehuda Sadeh };
81459c2be1eSYehuda Sadeh 
81598571b5aSAlex Elder struct rbd_options {
816b5584180SIlya Dryomov 	int	queue_depth;
81798571b5aSAlex Elder 	bool	read_only;
81880de1912SIlya Dryomov 	bool	lock_on_read;
81998571b5aSAlex Elder };
82098571b5aSAlex Elder 
821b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
82298571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
82380de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false
82498571b5aSAlex Elder 
82559c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
82659c2be1eSYehuda Sadeh {
82743ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
82859c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
82959c2be1eSYehuda Sadeh 	int token, intval, ret;
83059c2be1eSYehuda Sadeh 
83143ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
83259c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
83359c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
83459c2be1eSYehuda Sadeh 		if (ret < 0) {
835210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
83659c2be1eSYehuda Sadeh 			return ret;
83759c2be1eSYehuda Sadeh 		}
83859c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
83959c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
840210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
84159c2be1eSYehuda Sadeh 	} else {
84259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
84359c2be1eSYehuda Sadeh 	}
84459c2be1eSYehuda Sadeh 
84559c2be1eSYehuda Sadeh 	switch (token) {
846b5584180SIlya Dryomov 	case Opt_queue_depth:
847b5584180SIlya Dryomov 		if (intval < 1) {
848b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
849b5584180SIlya Dryomov 			return -EINVAL;
850b5584180SIlya Dryomov 		}
851b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
852b5584180SIlya Dryomov 		break;
853cc0538b6SAlex Elder 	case Opt_read_only:
854cc0538b6SAlex Elder 		rbd_opts->read_only = true;
855cc0538b6SAlex Elder 		break;
856cc0538b6SAlex Elder 	case Opt_read_write:
857cc0538b6SAlex Elder 		rbd_opts->read_only = false;
858cc0538b6SAlex Elder 		break;
85980de1912SIlya Dryomov 	case Opt_lock_on_read:
86080de1912SIlya Dryomov 		rbd_opts->lock_on_read = true;
86180de1912SIlya Dryomov 		break;
86259c2be1eSYehuda Sadeh 	default:
863210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
864210c104cSIlya Dryomov 		return -EINVAL;
86559c2be1eSYehuda Sadeh 	}
866210c104cSIlya Dryomov 
86759c2be1eSYehuda Sadeh 	return 0;
86859c2be1eSYehuda Sadeh }
86959c2be1eSYehuda Sadeh 
8706d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8716d2940c8SGuangliang Zhao {
8726d2940c8SGuangliang Zhao 	switch (op_type) {
8736d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8746d2940c8SGuangliang Zhao 		return "read";
8756d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8766d2940c8SGuangliang Zhao 		return "write";
87790e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
87890e98c52SGuangliang Zhao 		return "discard";
8796d2940c8SGuangliang Zhao 	default:
8806d2940c8SGuangliang Zhao 		return "???";
8816d2940c8SGuangliang Zhao 	}
8826d2940c8SGuangliang Zhao }
8836d2940c8SGuangliang Zhao 
88459c2be1eSYehuda Sadeh /*
885602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8867262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8877262cfcaSAlex Elder  * function.
888602adf40SYehuda Sadeh  */
8899d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
890602adf40SYehuda Sadeh {
891f8c38929SAlex Elder 	struct rbd_client *rbdc;
89259c2be1eSYehuda Sadeh 
893cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8941f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8959d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
89643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8979d3997fdSAlex Elder 	else
898f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
899cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
900d720bcb0SAlex Elder 
9019d3997fdSAlex Elder 	return rbdc;
902602adf40SYehuda Sadeh }
903602adf40SYehuda Sadeh 
904602adf40SYehuda Sadeh /*
905602adf40SYehuda Sadeh  * Destroy ceph client
906d23a4b3fSAlex Elder  *
907432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
908602adf40SYehuda Sadeh  */
909602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
910602adf40SYehuda Sadeh {
911602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
912602adf40SYehuda Sadeh 
91337206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
914cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
915602adf40SYehuda Sadeh 	list_del(&rbdc->node);
916cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
917602adf40SYehuda Sadeh 
918602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
919602adf40SYehuda Sadeh 	kfree(rbdc);
920602adf40SYehuda Sadeh }
921602adf40SYehuda Sadeh 
922602adf40SYehuda Sadeh /*
923602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
924602adf40SYehuda Sadeh  * it.
925602adf40SYehuda Sadeh  */
9269d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
927602adf40SYehuda Sadeh {
928c53d5893SAlex Elder 	if (rbdc)
9299d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
930602adf40SYehuda Sadeh }
931602adf40SYehuda Sadeh 
/* Only image formats 1 and 2 are accepted. */
static bool rbd_image_format_valid(u32 image_format)
{
	switch (image_format) {
	case 1:
	case 2:
		return true;
	default:
		return false;
	}
}
936a30b71b9SAlex Elder 
9378e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9388e94af8eSAlex Elder {
939103a150fSAlex Elder 	size_t size;
940103a150fSAlex Elder 	u32 snap_count;
941103a150fSAlex Elder 
942103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
943103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
944103a150fSAlex Elder 		return false;
945103a150fSAlex Elder 
946db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
947db2388b6SAlex Elder 
948db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
949db2388b6SAlex Elder 		return false;
950db2388b6SAlex Elder 
951db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
952db2388b6SAlex Elder 
953db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
954db2388b6SAlex Elder 		return false;
955db2388b6SAlex Elder 
956103a150fSAlex Elder 	/*
957103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
958103a150fSAlex Elder 	 * that limits the number of snapshots.
959103a150fSAlex Elder 	 */
960103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
961103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
962103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
963103a150fSAlex Elder 		return false;
964103a150fSAlex Elder 
965103a150fSAlex Elder 	/*
966103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
967103a150fSAlex Elder 	 * header must also be representable in a size_t.
968103a150fSAlex Elder 	 */
969103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
970103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
971103a150fSAlex Elder 		return false;
972103a150fSAlex Elder 
973103a150fSAlex Elder 	return true;
9748e94af8eSAlex Elder }
9758e94af8eSAlex Elder 
976602adf40SYehuda Sadeh /*
9775bc3fb17SIlya Dryomov  * returns the size of an object in the image
9785bc3fb17SIlya Dryomov  */
9795bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9805bc3fb17SIlya Dryomov {
9815bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
9825bc3fb17SIlya Dryomov }
9835bc3fb17SIlya Dryomov 
984263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
985263423f8SIlya Dryomov {
986263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
987263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
988263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
989263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
990263423f8SIlya Dryomov 	}
991263423f8SIlya Dryomov 
992263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
993263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
994263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
9957e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
9967e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
997263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
998263423f8SIlya Dryomov }
999263423f8SIlya Dryomov 
10005bc3fb17SIlya Dryomov /*
1001bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1002bb23e37aSAlex Elder  * on-disk header.
1003602adf40SYehuda Sadeh  */
1004662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10054156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1006602adf40SYehuda Sadeh {
1007662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1008bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1009bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1010bb23e37aSAlex Elder 	char *object_prefix = NULL;
1011bb23e37aSAlex Elder 	char *snap_names = NULL;
1012bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1013ccece235SAlex Elder 	u32 snap_count;
1014bb23e37aSAlex Elder 	int ret = -ENOMEM;
1015621901d6SAlex Elder 	u32 i;
1016602adf40SYehuda Sadeh 
1017bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1018103a150fSAlex Elder 
1019bb23e37aSAlex Elder 	if (first_time) {
1020848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1021848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1022848d796cSIlya Dryomov 					 GFP_KERNEL);
1023bb23e37aSAlex Elder 		if (!object_prefix)
1024602adf40SYehuda Sadeh 			return -ENOMEM;
1025bb23e37aSAlex Elder 	}
102600f1f36fSAlex Elder 
1027bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1028d2bb24e5SAlex Elder 
1029602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1030bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1031bb23e37aSAlex Elder 	if (!snapc)
1032bb23e37aSAlex Elder 		goto out_err;
1033bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1034602adf40SYehuda Sadeh 	if (snap_count) {
1035bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1036f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1037f785cc1dSAlex Elder 
1038bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1039621901d6SAlex Elder 
1040f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1041bb23e37aSAlex Elder 			goto out_2big;
1042bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1043bb23e37aSAlex Elder 		if (!snap_names)
1044602adf40SYehuda Sadeh 			goto out_err;
1045bb23e37aSAlex Elder 
1046bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
104788a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
104888a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
104988a25a5fSMarkus Elfring 					   GFP_KERNEL);
1050bb23e37aSAlex Elder 		if (!snap_sizes)
1051bb23e37aSAlex Elder 			goto out_err;
1052bb23e37aSAlex Elder 
1053f785cc1dSAlex Elder 		/*
1054bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1055bb23e37aSAlex Elder 		 * and size.
1056bb23e37aSAlex Elder 		 *
105799a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1058bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1059f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1060f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1061f785cc1dSAlex Elder 		 */
1062bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1063bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1064bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1065bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1066bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1067bb23e37aSAlex Elder 		}
1068602adf40SYehuda Sadeh 	}
1069849b4260SAlex Elder 
1070bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1071bb23e37aSAlex Elder 
1072bb23e37aSAlex Elder 	if (first_time) {
1073bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1074602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1075263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1076662518b1SAlex Elder 	} else {
1077662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1078662518b1SAlex Elder 		kfree(header->snap_names);
1079662518b1SAlex Elder 		kfree(header->snap_sizes);
1080bb23e37aSAlex Elder 	}
10816a52325fSAlex Elder 
1082bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1083621901d6SAlex Elder 
1084f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1085bb23e37aSAlex Elder 	header->snapc = snapc;
1086bb23e37aSAlex Elder 	header->snap_names = snap_names;
1087bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1088468521c1SAlex Elder 
1089602adf40SYehuda Sadeh 	return 0;
1090bb23e37aSAlex Elder out_2big:
1091bb23e37aSAlex Elder 	ret = -EIO;
10926a52325fSAlex Elder out_err:
1093bb23e37aSAlex Elder 	kfree(snap_sizes);
1094bb23e37aSAlex Elder 	kfree(snap_names);
1095bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1096bb23e37aSAlex Elder 	kfree(object_prefix);
1097ccece235SAlex Elder 
1098bb23e37aSAlex Elder 	return ret;
1099602adf40SYehuda Sadeh }
1100602adf40SYehuda Sadeh 
11019682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11029682fc6dSAlex Elder {
11039682fc6dSAlex Elder 	const char *snap_name;
11049682fc6dSAlex Elder 
11059682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11069682fc6dSAlex Elder 
11079682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11089682fc6dSAlex Elder 
11099682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11109682fc6dSAlex Elder 	while (which--)
11119682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11129682fc6dSAlex Elder 
11139682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11149682fc6dSAlex Elder }
11159682fc6dSAlex Elder 
/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * The result orders ids *descending*: 1 when the first id is smaller
 * than the second, -1 when it is larger, 0 when they are equal.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 first = *(u64 *)s1;
	u64 second = *(u64 *)s2;

	if (first == second)
		return 0;

	return first < second ? 1 : -1;
}
112930d1cff8SAlex Elder 
113030d1cff8SAlex Elder /*
113130d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
113230d1cff8SAlex Elder  * present.
113330d1cff8SAlex Elder  *
113430d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
113530d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
113630d1cff8SAlex Elder  *
113730d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
113830d1cff8SAlex Elder  * reverse order, highest snapshot id first.
113930d1cff8SAlex Elder  */
11409682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11419682fc6dSAlex Elder {
11429682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
114330d1cff8SAlex Elder 	u64 *found;
11449682fc6dSAlex Elder 
114530d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
114630d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11479682fc6dSAlex Elder 
114830d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11499682fc6dSAlex Elder }
11509682fc6dSAlex Elder 
11512ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11522ad3d716SAlex Elder 					u64 snap_id)
115354cac61fSAlex Elder {
115454cac61fSAlex Elder 	u32 which;
1155da6a6b63SJosh Durgin 	const char *snap_name;
115654cac61fSAlex Elder 
115754cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
115854cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1159da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
116054cac61fSAlex Elder 
1161da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1162da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
116354cac61fSAlex Elder }
116454cac61fSAlex Elder 
11659e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11669e15b77dSAlex Elder {
11679e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11689e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11699e15b77dSAlex Elder 
117054cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
117154cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
117254cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11739e15b77dSAlex Elder 
117454cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11759e15b77dSAlex Elder }
11769e15b77dSAlex Elder 
11772ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11782ad3d716SAlex Elder 				u64 *snap_size)
1179602adf40SYehuda Sadeh {
11802ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11812ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11822ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11832ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11842ad3d716SAlex Elder 		u32 which;
118500f1f36fSAlex Elder 
11862ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11872ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11882ad3d716SAlex Elder 			return -ENOENT;
118900f1f36fSAlex Elder 
11902ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11912ad3d716SAlex Elder 	} else {
11922ad3d716SAlex Elder 		u64 size = 0;
11932ad3d716SAlex Elder 		int ret;
11942ad3d716SAlex Elder 
11952ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11962ad3d716SAlex Elder 		if (ret)
11972ad3d716SAlex Elder 			return ret;
11982ad3d716SAlex Elder 
11992ad3d716SAlex Elder 		*snap_size = size;
12002ad3d716SAlex Elder 	}
12012ad3d716SAlex Elder 	return 0;
12022ad3d716SAlex Elder }
12032ad3d716SAlex Elder 
12042ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12052ad3d716SAlex Elder 			u64 *snap_features)
12062ad3d716SAlex Elder {
12072ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12082ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12092ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12102ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12112ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12122ad3d716SAlex Elder 	} else {
12132ad3d716SAlex Elder 		u64 features = 0;
12142ad3d716SAlex Elder 		int ret;
12152ad3d716SAlex Elder 
12162ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12172ad3d716SAlex Elder 		if (ret)
12182ad3d716SAlex Elder 			return ret;
12192ad3d716SAlex Elder 
12202ad3d716SAlex Elder 		*snap_features = features;
12212ad3d716SAlex Elder 	}
12222ad3d716SAlex Elder 	return 0;
122300f1f36fSAlex Elder }
1224602adf40SYehuda Sadeh 
1225d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1226602adf40SYehuda Sadeh {
12278f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12282ad3d716SAlex Elder 	u64 size = 0;
12292ad3d716SAlex Elder 	u64 features = 0;
12302ad3d716SAlex Elder 	int ret;
12318b0241f8SAlex Elder 
12322ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12332ad3d716SAlex Elder 	if (ret)
12342ad3d716SAlex Elder 		return ret;
12352ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12362ad3d716SAlex Elder 	if (ret)
12372ad3d716SAlex Elder 		return ret;
12382ad3d716SAlex Elder 
12392ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12402ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12412ad3d716SAlex Elder 
12428b0241f8SAlex Elder 	return 0;
1243602adf40SYehuda Sadeh }
1244602adf40SYehuda Sadeh 
1245d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1246d1cf5788SAlex Elder {
1247d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1248d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1249200a6a8bSAlex Elder }
1250200a6a8bSAlex Elder 
12517d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
12527d5079aaSHimangi Saraogi {
12537d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
12547d5079aaSHimangi Saraogi 
12557d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
12567d5079aaSHimangi Saraogi }
12577d5079aaSHimangi Saraogi 
125898571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1259602adf40SYehuda Sadeh {
126065ccfe21SAlex Elder 	char *name;
126165ccfe21SAlex Elder 	u64 segment;
126265ccfe21SAlex Elder 	int ret;
12633a96d5cdSJosh Durgin 	char *name_format;
1264602adf40SYehuda Sadeh 
126578c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
126665ccfe21SAlex Elder 	if (!name)
126765ccfe21SAlex Elder 		return NULL;
126865ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
12693a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
12703a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
12713a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
12722d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
127365ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
12742d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
127565ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
127665ccfe21SAlex Elder 			segment, ret);
12777d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
127865ccfe21SAlex Elder 		name = NULL;
127965ccfe21SAlex Elder 	}
1280602adf40SYehuda Sadeh 
128165ccfe21SAlex Elder 	return name;
128265ccfe21SAlex Elder }
1283602adf40SYehuda Sadeh 
128465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
128565ccfe21SAlex Elder {
12865bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
1287602adf40SYehuda Sadeh 
128865ccfe21SAlex Elder 	return offset & (segment_size - 1);
128965ccfe21SAlex Elder }
129065ccfe21SAlex Elder 
129165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
129265ccfe21SAlex Elder 				u64 offset, u64 length)
129365ccfe21SAlex Elder {
12945bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
129565ccfe21SAlex Elder 
129665ccfe21SAlex Elder 	offset &= segment_size - 1;
129765ccfe21SAlex Elder 
1298aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
129965ccfe21SAlex Elder 	if (offset + length > segment_size)
130065ccfe21SAlex Elder 		length = segment_size - offset;
130165ccfe21SAlex Elder 
130265ccfe21SAlex Elder 	return length;
1303602adf40SYehuda Sadeh }
1304602adf40SYehuda Sadeh 
1305602adf40SYehuda Sadeh /*
1306602adf40SYehuda Sadeh  * bio helpers
1307602adf40SYehuda Sadeh  */
1308602adf40SYehuda Sadeh 
1309602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1310602adf40SYehuda Sadeh {
1311602adf40SYehuda Sadeh 	struct bio *tmp;
1312602adf40SYehuda Sadeh 
1313602adf40SYehuda Sadeh 	while (chain) {
1314602adf40SYehuda Sadeh 		tmp = chain;
1315602adf40SYehuda Sadeh 		chain = chain->bi_next;
1316602adf40SYehuda Sadeh 		bio_put(tmp);
1317602adf40SYehuda Sadeh 	}
1318602adf40SYehuda Sadeh }
1319602adf40SYehuda Sadeh 
1320602adf40SYehuda Sadeh /*
1321602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1322602adf40SYehuda Sadeh  */
1323602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1324602adf40SYehuda Sadeh {
13257988613bSKent Overstreet 	struct bio_vec bv;
13267988613bSKent Overstreet 	struct bvec_iter iter;
1327602adf40SYehuda Sadeh 	unsigned long flags;
1328602adf40SYehuda Sadeh 	void *buf;
1329602adf40SYehuda Sadeh 	int pos = 0;
1330602adf40SYehuda Sadeh 
1331602adf40SYehuda Sadeh 	while (chain) {
13327988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
13337988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1334602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
13357988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1336602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
13377988613bSKent Overstreet 				       bv.bv_len - remainder);
13387988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
133985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1340602adf40SYehuda Sadeh 			}
13417988613bSKent Overstreet 			pos += bv.bv_len;
1342602adf40SYehuda Sadeh 		}
1343602adf40SYehuda Sadeh 
1344602adf40SYehuda Sadeh 		chain = chain->bi_next;
1345602adf40SYehuda Sadeh 	}
1346602adf40SYehuda Sadeh }
1347602adf40SYehuda Sadeh 
1348602adf40SYehuda Sadeh /*
1349b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1350b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1351b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1352b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1353b9434c5bSAlex Elder  */
1354b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1355b9434c5bSAlex Elder {
1356b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1357b9434c5bSAlex Elder 
1358b9434c5bSAlex Elder 	rbd_assert(end > offset);
1359b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1360b9434c5bSAlex Elder 	while (offset < end) {
1361b9434c5bSAlex Elder 		size_t page_offset;
1362b9434c5bSAlex Elder 		size_t length;
1363b9434c5bSAlex Elder 		unsigned long flags;
1364b9434c5bSAlex Elder 		void *kaddr;
1365b9434c5bSAlex Elder 
1366491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1367491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1368b9434c5bSAlex Elder 		local_irq_save(flags);
1369b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1370b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1371e2156054SAlex Elder 		flush_dcache_page(*page);
1372b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1373b9434c5bSAlex Elder 		local_irq_restore(flags);
1374b9434c5bSAlex Elder 
1375b9434c5bSAlex Elder 		offset += length;
1376b9434c5bSAlex Elder 		page++;
1377b9434c5bSAlex Elder 	}
1378b9434c5bSAlex Elder }
1379b9434c5bSAlex Elder 
1380b9434c5bSAlex Elder /*
1381f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1382f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1383602adf40SYehuda Sadeh  */
1384f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1385f7760dadSAlex Elder 					unsigned int offset,
1386f7760dadSAlex Elder 					unsigned int len,
1387f7760dadSAlex Elder 					gfp_t gfpmask)
1388602adf40SYehuda Sadeh {
1389f7760dadSAlex Elder 	struct bio *bio;
1390602adf40SYehuda Sadeh 
13915341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1392f7760dadSAlex Elder 	if (!bio)
1393f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1394f7760dadSAlex Elder 
13955341a627SKent Overstreet 	bio_advance(bio, offset);
13964f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1397602adf40SYehuda Sadeh 
1398f7760dadSAlex Elder 	return bio;
1399602adf40SYehuda Sadeh }
1400602adf40SYehuda Sadeh 
1401f7760dadSAlex Elder /*
1402f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1403f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1404f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1405f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1406f7760dadSAlex Elder  *
1407f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1408f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1409f7760dadSAlex Elder  * the start of data to be cloned is located.
1410f7760dadSAlex Elder  *
1411f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1412f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1413f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1414f7760dadSAlex Elder  */
1415f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1416f7760dadSAlex Elder 					unsigned int *offset,
1417f7760dadSAlex Elder 					unsigned int len,
1418f7760dadSAlex Elder 					gfp_t gfpmask)
1419f7760dadSAlex Elder {
1420f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1421f7760dadSAlex Elder 	unsigned int off = *offset;
1422f7760dadSAlex Elder 	struct bio *chain = NULL;
1423f7760dadSAlex Elder 	struct bio **end;
1424602adf40SYehuda Sadeh 
1425f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1426602adf40SYehuda Sadeh 
14274f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1428f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1429602adf40SYehuda Sadeh 
1430f7760dadSAlex Elder 	end = &chain;
1431f7760dadSAlex Elder 	while (len) {
1432f7760dadSAlex Elder 		unsigned int bi_size;
1433f7760dadSAlex Elder 		struct bio *bio;
1434f7760dadSAlex Elder 
1435f5400b7aSAlex Elder 		if (!bi) {
1436f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1437f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1438f5400b7aSAlex Elder 		}
14394f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1440f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1441f7760dadSAlex Elder 		if (!bio)
1442f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1443f7760dadSAlex Elder 
1444f7760dadSAlex Elder 		*end = bio;
1445f7760dadSAlex Elder 		end = &bio->bi_next;
1446f7760dadSAlex Elder 
1447f7760dadSAlex Elder 		off += bi_size;
14484f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1449f7760dadSAlex Elder 			bi = bi->bi_next;
1450f7760dadSAlex Elder 			off = 0;
1451f7760dadSAlex Elder 		}
1452f7760dadSAlex Elder 		len -= bi_size;
1453f7760dadSAlex Elder 	}
1454f7760dadSAlex Elder 	*bio_src = bi;
1455f7760dadSAlex Elder 	*offset = off;
1456f7760dadSAlex Elder 
1457f7760dadSAlex Elder 	return chain;
1458f7760dadSAlex Elder out_err:
1459f7760dadSAlex Elder 	bio_chain_put(chain);
1460f7760dadSAlex Elder 
1461602adf40SYehuda Sadeh 	return NULL;
1462602adf40SYehuda Sadeh }
1463602adf40SYehuda Sadeh 
1464926f9b3fSAlex Elder /*
1465926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1466926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1467926f9b3fSAlex Elder  * again.
1468926f9b3fSAlex Elder  */
14696365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
14706365d33aSAlex Elder {
14716365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14726365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14736365d33aSAlex Elder 
147457acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14759584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14766365d33aSAlex Elder 			obj_request);
14776365d33aSAlex Elder 	}
14786365d33aSAlex Elder }
14796365d33aSAlex Elder 
14806365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14816365d33aSAlex Elder {
14826365d33aSAlex Elder 	smp_mb();
14836365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14846365d33aSAlex Elder }
14856365d33aSAlex Elder 
148657acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
148757acbaa7SAlex Elder {
148857acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
148957acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
149057acbaa7SAlex Elder 
149157acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
149257acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14939584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
149457acbaa7SAlex Elder 			obj_request);
149557acbaa7SAlex Elder 	}
149657acbaa7SAlex Elder }
149757acbaa7SAlex Elder 
149857acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
149957acbaa7SAlex Elder {
150057acbaa7SAlex Elder 	smp_mb();
150157acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
150257acbaa7SAlex Elder }
150357acbaa7SAlex Elder 
15045679c59fSAlex Elder /*
15055679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
15065679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
15075679c59fSAlex Elder  *
15085679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
15095679c59fSAlex Elder  * away again.  It's possible that the response from two existence
15105679c59fSAlex Elder  * checks are separated by the creation of the target object, and
15115679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
15125679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
15135679c59fSAlex Elder  */
15145679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
15155679c59fSAlex Elder 				bool exists)
15165679c59fSAlex Elder {
15175679c59fSAlex Elder 	if (exists)
15185679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
15195679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
15205679c59fSAlex Elder 	smp_mb();
15215679c59fSAlex Elder }
15225679c59fSAlex Elder 
15235679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
15245679c59fSAlex Elder {
15255679c59fSAlex Elder 	smp_mb();
15265679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
15275679c59fSAlex Elder }
15285679c59fSAlex Elder 
15295679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
15305679c59fSAlex Elder {
15315679c59fSAlex Elder 	smp_mb();
15325679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
15335679c59fSAlex Elder }
15345679c59fSAlex Elder 
15359638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
15369638556aSIlya Dryomov {
15379638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
15389638556aSIlya Dryomov 
15399638556aSIlya Dryomov 	return obj_request->img_offset <
15409638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
15419638556aSIlya Dryomov }
15429638556aSIlya Dryomov 
1543bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1544bf0d5f50SAlex Elder {
154537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
154637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1547bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1548bf0d5f50SAlex Elder }
1549bf0d5f50SAlex Elder 
1550bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1551bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1552bf0d5f50SAlex Elder {
1553bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
155437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
155537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1556bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1557bf0d5f50SAlex Elder }
1558bf0d5f50SAlex Elder 
15590f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
15600f2d5be7SAlex Elder {
15610f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15620f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
15630f2d5be7SAlex Elder 	kref_get(&img_request->kref);
15640f2d5be7SAlex Elder }
15650f2d5be7SAlex Elder 
1566e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1567e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1568bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1569bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1570bf0d5f50SAlex Elder {
1571bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
157237206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
157337206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1574e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1575e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1576e93f3152SAlex Elder 	else
1577bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1578bf0d5f50SAlex Elder }
1579bf0d5f50SAlex Elder 
1580bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1581bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1582bf0d5f50SAlex Elder {
158325dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
158425dcf954SAlex Elder 
1585b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1586bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
158725dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15886365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15896365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1590bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
159125dcf954SAlex Elder 	img_request->obj_request_count++;
159225dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
159337206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
159437206ee5SAlex Elder 		obj_request->which);
1595bf0d5f50SAlex Elder }
1596bf0d5f50SAlex Elder 
1597bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1598bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1599bf0d5f50SAlex Elder {
1600bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
160125dcf954SAlex Elder 
160237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
160337206ee5SAlex Elder 		obj_request->which);
1604bf0d5f50SAlex Elder 	list_del(&obj_request->links);
160525dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
160625dcf954SAlex Elder 	img_request->obj_request_count--;
160725dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
160825dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
16096365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1610bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1611bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
161225dcf954SAlex Elder 	obj_request->callback = NULL;
1613bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1614bf0d5f50SAlex Elder }
1615bf0d5f50SAlex Elder 
1616bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1617bf0d5f50SAlex Elder {
1618bf0d5f50SAlex Elder 	switch (type) {
16199969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1620bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1621788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1622bf0d5f50SAlex Elder 		return true;
1623bf0d5f50SAlex Elder 	default:
1624bf0d5f50SAlex Elder 		return false;
1625bf0d5f50SAlex Elder 	}
1626bf0d5f50SAlex Elder }
1627bf0d5f50SAlex Elder 
16284a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
16294a17dadcSIlya Dryomov 
1630980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1631bf0d5f50SAlex Elder {
1632980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1633980917fcSIlya Dryomov 
163467e2b652SIlya Dryomov 	dout("%s %p \"%s\" %llu~%llu osd_req %p\n", __func__,
163567e2b652SIlya Dryomov 	     obj_request, obj_request->object_name, obj_request->offset,
163667e2b652SIlya Dryomov 	     obj_request->length, osd_req);
16374a17dadcSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
16384a17dadcSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
16394a17dadcSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
16404a17dadcSIlya Dryomov 	}
1641980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1642bf0d5f50SAlex Elder }
1643bf0d5f50SAlex Elder 
1644bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1645bf0d5f50SAlex Elder {
164655f27e09SAlex Elder 
164737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
164855f27e09SAlex Elder 
164955f27e09SAlex Elder 	/*
165055f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
165155f27e09SAlex Elder 	 * count for the image request.  We could instead use
165255f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
165355f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
165455f27e09SAlex Elder 	 */
165555f27e09SAlex Elder 	if (!img_request->result) {
165655f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
165755f27e09SAlex Elder 		u64 xferred = 0;
165855f27e09SAlex Elder 
165955f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
166055f27e09SAlex Elder 			xferred += obj_request->xferred;
166155f27e09SAlex Elder 		img_request->xferred = xferred;
166255f27e09SAlex Elder 	}
166355f27e09SAlex Elder 
1664bf0d5f50SAlex Elder 	if (img_request->callback)
1665bf0d5f50SAlex Elder 		img_request->callback(img_request);
1666bf0d5f50SAlex Elder 	else
1667bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1668bf0d5f50SAlex Elder }
1669bf0d5f50SAlex Elder 
16700c425248SAlex Elder /*
16710c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16720c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16730c425248SAlex Elder  * and currently never change thereafter.
16740c425248SAlex Elder  */
16750c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16760c425248SAlex Elder {
16770c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16780c425248SAlex Elder 	smp_mb();
16790c425248SAlex Elder }
16800c425248SAlex Elder 
16810c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16820c425248SAlex Elder {
16830c425248SAlex Elder 	smp_mb();
16840c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16850c425248SAlex Elder }
16860c425248SAlex Elder 
168790e98c52SGuangliang Zhao /*
168890e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
168990e98c52SGuangliang Zhao  */
169090e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
169190e98c52SGuangliang Zhao {
169290e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
169390e98c52SGuangliang Zhao 	smp_mb();
169490e98c52SGuangliang Zhao }
169590e98c52SGuangliang Zhao 
169690e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
169790e98c52SGuangliang Zhao {
169890e98c52SGuangliang Zhao 	smp_mb();
169990e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
170090e98c52SGuangliang Zhao }
170190e98c52SGuangliang Zhao 
17029849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
17039849e986SAlex Elder {
17049849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
17059849e986SAlex Elder 	smp_mb();
17069849e986SAlex Elder }
17079849e986SAlex Elder 
1708e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1709e93f3152SAlex Elder {
1710e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1711e93f3152SAlex Elder 	smp_mb();
1712e93f3152SAlex Elder }
1713e93f3152SAlex Elder 
17149849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
17159849e986SAlex Elder {
17169849e986SAlex Elder 	smp_mb();
17179849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
17189849e986SAlex Elder }
17199849e986SAlex Elder 
1720d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1721d0b2e944SAlex Elder {
1722d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1723d0b2e944SAlex Elder 	smp_mb();
1724d0b2e944SAlex Elder }
1725d0b2e944SAlex Elder 
1726a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1727a2acd00eSAlex Elder {
1728a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1729a2acd00eSAlex Elder 	smp_mb();
1730a2acd00eSAlex Elder }
1731a2acd00eSAlex Elder 
1732d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1733d0b2e944SAlex Elder {
1734d0b2e944SAlex Elder 	smp_mb();
1735d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1736d0b2e944SAlex Elder }
1737d0b2e944SAlex Elder 
17383b434a2aSJosh Durgin static enum obj_operation_type
17393b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17403b434a2aSJosh Durgin {
17413b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17423b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17433b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17443b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17453b434a2aSJosh Durgin 	else
17463b434a2aSJosh Durgin 		return OBJ_OP_READ;
17473b434a2aSJosh Durgin }
17483b434a2aSJosh Durgin 
17496e2a4505SAlex Elder static void
17506e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17516e2a4505SAlex Elder {
1752b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1753b9434c5bSAlex Elder 	u64 length = obj_request->length;
1754b9434c5bSAlex Elder 
17556e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17566e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1757b9434c5bSAlex Elder 		xferred, length);
17586e2a4505SAlex Elder 	/*
175917c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
176017c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
176117c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
176217c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
176317c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
176417c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17656e2a4505SAlex Elder 	 */
1766b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17676e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1768b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17696e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1770b9434c5bSAlex Elder 		else
1771b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17726e2a4505SAlex Elder 		obj_request->result = 0;
1773b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1774b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1775b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1776b9434c5bSAlex Elder 		else
1777b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17786e2a4505SAlex Elder 	}
177917c1cc1dSJosh Durgin 	obj_request->xferred = length;
17806e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17816e2a4505SAlex Elder }
17826e2a4505SAlex Elder 
1783bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1784bf0d5f50SAlex Elder {
178537206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
178637206ee5SAlex Elder 		obj_request->callback);
1787bf0d5f50SAlex Elder 	if (obj_request->callback)
1788bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1789788e2df3SAlex Elder 	else
1790788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1791bf0d5f50SAlex Elder }
1792bf0d5f50SAlex Elder 
17930dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
17940dcc685eSIlya Dryomov {
17950dcc685eSIlya Dryomov 	obj_request->result = err;
17960dcc685eSIlya Dryomov 	obj_request->xferred = 0;
17970dcc685eSIlya Dryomov 	/*
17980dcc685eSIlya Dryomov 	 * kludge - mirror rbd_obj_request_submit() to match a put in
17990dcc685eSIlya Dryomov 	 * rbd_img_obj_callback()
18000dcc685eSIlya Dryomov 	 */
18010dcc685eSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
18020dcc685eSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
18030dcc685eSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
18040dcc685eSIlya Dryomov 	}
18050dcc685eSIlya Dryomov 	obj_request_done_set(obj_request);
18060dcc685eSIlya Dryomov 	rbd_obj_request_complete(obj_request);
18070dcc685eSIlya Dryomov }
18080dcc685eSIlya Dryomov 
1809c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1810bf0d5f50SAlex Elder {
181157acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1812a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
181357acbaa7SAlex Elder 	bool layered = false;
181457acbaa7SAlex Elder 
181557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
181657acbaa7SAlex Elder 		img_request = obj_request->img_request;
181757acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1818a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
181957acbaa7SAlex Elder 	}
18208b3e1a56SAlex Elder 
18218b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
18228b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
18238b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1824a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1825a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
18268b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
18278b3e1a56SAlex Elder 	else if (img_request)
18286e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
18296e2a4505SAlex Elder 	else
183007741308SAlex Elder 		obj_request_done_set(obj_request);
1831bf0d5f50SAlex Elder }
1832bf0d5f50SAlex Elder 
1833c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1834bf0d5f50SAlex Elder {
18351b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
18361b83bef2SSage Weil 		obj_request->result, obj_request->length);
18371b83bef2SSage Weil 	/*
18388b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
18398b3e1a56SAlex Elder 	 * it to our originally-requested length.
18401b83bef2SSage Weil 	 */
18411b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
184207741308SAlex Elder 	obj_request_done_set(obj_request);
1843bf0d5f50SAlex Elder }
1844bf0d5f50SAlex Elder 
184590e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
184690e98c52SGuangliang Zhao {
184790e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
184890e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
184990e98c52SGuangliang Zhao 	/*
185090e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
185190e98c52SGuangliang Zhao 	 * it to our originally-requested length.
185290e98c52SGuangliang Zhao 	 */
185390e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1854d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1855d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1856d0265de7SJosh Durgin 		obj_request->result = 0;
185790e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
185890e98c52SGuangliang Zhao }
185990e98c52SGuangliang Zhao 
/*
 * Handle completion of an OSD stat.  For a simple stat call there is
 * nothing to do beyond marking the request done.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1869fbfab539SAlex Elder 
/*
 * Handle completion of an OSD class method call.  An image data
 * request is passed to rbd_osd_copyup_callback(); any other request
 * is simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18792761713dSIlya Dryomov 
188085e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1881bf0d5f50SAlex Elder {
1882bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1883bf0d5f50SAlex Elder 	u16 opcode;
1884bf0d5f50SAlex Elder 
188585e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1886bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
188757acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
188857acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
188957acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
189057acbaa7SAlex Elder 	} else {
189157acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
189257acbaa7SAlex Elder 	}
1893bf0d5f50SAlex Elder 
18941b83bef2SSage Weil 	if (osd_req->r_result < 0)
18951b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1896bf0d5f50SAlex Elder 
1897c47f9371SAlex Elder 	/*
1898c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18997ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
19007ad18afaSChristoph Hellwig 	 * length field.
1901c47f9371SAlex Elder 	 */
19027665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1903c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
19040ccd5926SIlya Dryomov 
190579528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1906bf0d5f50SAlex Elder 	switch (opcode) {
1907bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1908c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1909bf0d5f50SAlex Elder 		break;
19100ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1911e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1912e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
19130ccd5926SIlya Dryomov 		/* fall through */
1914bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1915e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1916c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1917bf0d5f50SAlex Elder 		break;
1918fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1919c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1920fbfab539SAlex Elder 		break;
192190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
192290e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
192390e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
192490e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
192590e98c52SGuangliang Zhao 		break;
192636be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
19272761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
19282761713dSIlya Dryomov 		break;
1929bf0d5f50SAlex Elder 	default:
19309584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1931bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1932bf0d5f50SAlex Elder 		break;
1933bf0d5f50SAlex Elder 	}
1934bf0d5f50SAlex Elder 
193507741308SAlex Elder 	if (obj_request_done_test(obj_request))
1936bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1937bf0d5f50SAlex Elder }
1938bf0d5f50SAlex Elder 
19399d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1940430c28c3SAlex Elder {
19418c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1942430c28c3SAlex Elder 
19437c84883aSIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
19447c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
19459d4df01fSAlex Elder }
19469d4df01fSAlex Elder 
19479d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19489d4df01fSAlex Elder {
19499d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19509d4df01fSAlex Elder 
1951bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1952bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1953430c28c3SAlex Elder }
1954430c28c3SAlex Elder 
19550ccd5926SIlya Dryomov /*
19560ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19570ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19580ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19590ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19600ccd5926SIlya Dryomov  */
1961bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1962bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19636d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1964deb236b3SIlya Dryomov 					unsigned int num_ops,
1965430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1966bf0d5f50SAlex Elder {
1967bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1968bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1969bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1970bf0d5f50SAlex Elder 
197190e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
197290e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19736365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
197490e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19756d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
197690e98c52SGuangliang Zhao 		} else {
197790e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
197890e98c52SGuangliang Zhao 		}
1979bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1980bf0d5f50SAlex Elder 	}
1981bf0d5f50SAlex Elder 
19826d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1983deb236b3SIlya Dryomov 
1984deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1985bf0d5f50SAlex Elder 
1986bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1987deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
19882224d879SDavid Disseldorp 					  GFP_NOIO);
1989bf0d5f50SAlex Elder 	if (!osd_req)
199013d1ad16SIlya Dryomov 		goto fail;
1991bf0d5f50SAlex Elder 
199290e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1993bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1994430c28c3SAlex Elder 	else
1995bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1996bf0d5f50SAlex Elder 
1997bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1998bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1999bf0d5f50SAlex Elder 
20007627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2001d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2002d30291b9SIlya Dryomov 			     obj_request->object_name))
2003d30291b9SIlya Dryomov 		goto fail;
2004bf0d5f50SAlex Elder 
200513d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
200613d1ad16SIlya Dryomov 		goto fail;
200713d1ad16SIlya Dryomov 
2008bf0d5f50SAlex Elder 	return osd_req;
200913d1ad16SIlya Dryomov 
201013d1ad16SIlya Dryomov fail:
201113d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
201213d1ad16SIlya Dryomov 	return NULL;
2013bf0d5f50SAlex Elder }
2014bf0d5f50SAlex Elder 
20150eefd470SAlex Elder /*
2016d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
2017d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
2018d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
2019d3246fb0SJosh Durgin  * or zero op.
20200eefd470SAlex Elder  */
20210eefd470SAlex Elder static struct ceph_osd_request *
20220eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
20230eefd470SAlex Elder {
20240eefd470SAlex Elder 	struct rbd_img_request *img_request;
20250eefd470SAlex Elder 	struct ceph_snap_context *snapc;
20260eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20270eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20280eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
2029d3246fb0SJosh Durgin 	int num_osd_ops = 3;
20300eefd470SAlex Elder 
20310eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20320eefd470SAlex Elder 	img_request = obj_request->img_request;
20330eefd470SAlex Elder 	rbd_assert(img_request);
2034d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
2035d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
20360eefd470SAlex Elder 
2037d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2038d3246fb0SJosh Durgin 		num_osd_ops = 2;
2039d3246fb0SJosh Durgin 
2040d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
20410eefd470SAlex Elder 
20420eefd470SAlex Elder 	snapc = img_request->snapc;
20430eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20440eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2045d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
20462224d879SDavid Disseldorp 						false, GFP_NOIO);
20470eefd470SAlex Elder 	if (!osd_req)
204813d1ad16SIlya Dryomov 		goto fail;
20490eefd470SAlex Elder 
20500eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
20510eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
20520eefd470SAlex Elder 	osd_req->r_priv = obj_request;
20530eefd470SAlex Elder 
20547627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2055d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2056d30291b9SIlya Dryomov 			     obj_request->object_name))
2057d30291b9SIlya Dryomov 		goto fail;
20580eefd470SAlex Elder 
205913d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
206013d1ad16SIlya Dryomov 		goto fail;
206113d1ad16SIlya Dryomov 
20620eefd470SAlex Elder 	return osd_req;
206313d1ad16SIlya Dryomov 
206413d1ad16SIlya Dryomov fail:
206513d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
206613d1ad16SIlya Dryomov 	return NULL;
20670eefd470SAlex Elder }
20680eefd470SAlex Elder 
20690eefd470SAlex Elder 
/* Drop a reference to an osd request created for an object request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2074bf0d5f50SAlex Elder 
2075bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2076bf0d5f50SAlex Elder 
2077bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2078bf0d5f50SAlex Elder 						enum obj_request_type type)
2079bf0d5f50SAlex Elder {
2080bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2081bf0d5f50SAlex Elder 	size_t size;
2082bf0d5f50SAlex Elder 	char *name;
2083bf0d5f50SAlex Elder 
2084bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2085bf0d5f50SAlex Elder 
2086bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
20875a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2088f907ad55SAlex Elder 	if (!name)
2089bf0d5f50SAlex Elder 		return NULL;
2090bf0d5f50SAlex Elder 
20915a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2092f907ad55SAlex Elder 	if (!obj_request) {
2093f907ad55SAlex Elder 		kfree(name);
2094f907ad55SAlex Elder 		return NULL;
2095f907ad55SAlex Elder 	}
2096f907ad55SAlex Elder 
2097bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2098bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2099bf0d5f50SAlex Elder 	obj_request->type = type;
2100bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2101788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2102bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2103bf0d5f50SAlex Elder 
210467e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
2105bf0d5f50SAlex Elder 	return obj_request;
2106bf0d5f50SAlex Elder }
2107bf0d5f50SAlex Elder 
2108bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2109bf0d5f50SAlex Elder {
2110bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2111bf0d5f50SAlex Elder 
2112bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2113bf0d5f50SAlex Elder 
211437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
211537206ee5SAlex Elder 
2116bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2117bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2118bf0d5f50SAlex Elder 
2119bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2120bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2121bf0d5f50SAlex Elder 
2122bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2123bf0d5f50SAlex Elder 	switch (obj_request->type) {
21249969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
21259969ebc5SAlex Elder 		break;		/* Nothing to do */
2126bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2127bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2128bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2129bf0d5f50SAlex Elder 		break;
2130788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
213104dc923cSIlya Dryomov 		/* img_data requests don't own their page array */
213204dc923cSIlya Dryomov 		if (obj_request->pages &&
213304dc923cSIlya Dryomov 		    !obj_request_img_data_test(obj_request))
2134788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2135788e2df3SAlex Elder 						obj_request->page_count);
2136788e2df3SAlex Elder 		break;
2137bf0d5f50SAlex Elder 	}
2138bf0d5f50SAlex Elder 
2139f907ad55SAlex Elder 	kfree(obj_request->object_name);
2140868311b1SAlex Elder 	obj_request->object_name = NULL;
2141868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2142bf0d5f50SAlex Elder }
2143bf0d5f50SAlex Elder 
2144fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2145fb65d228SAlex Elder 
2146fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2147fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2148fb65d228SAlex Elder {
2149fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2150fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2151fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2152fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2153fb65d228SAlex Elder }
2154fb65d228SAlex Elder 
2155bf0d5f50SAlex Elder /*
2156a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2157a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2158a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2159a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2160a2acd00eSAlex Elder  */
2161a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2162a2acd00eSAlex Elder {
2163a2acd00eSAlex Elder 	int counter;
2164a2acd00eSAlex Elder 
2165a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2166a2acd00eSAlex Elder 		return;
2167a2acd00eSAlex Elder 
2168a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2169a2acd00eSAlex Elder 	if (counter > 0)
2170a2acd00eSAlex Elder 		return;
2171a2acd00eSAlex Elder 
2172a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2173a2acd00eSAlex Elder 
2174a2acd00eSAlex Elder 	if (!counter)
2175a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2176a2acd00eSAlex Elder 	else
21779584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2178a2acd00eSAlex Elder }
2179a2acd00eSAlex Elder 
2180a2acd00eSAlex Elder /*
2181a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2182a2acd00eSAlex Elder  * parent.
2183a2acd00eSAlex Elder  *
2184a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2185a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2186a2acd00eSAlex Elder  * false otherwise.
2187a2acd00eSAlex Elder  */
2188a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2189a2acd00eSAlex Elder {
2190ae43e9d0SIlya Dryomov 	int counter = 0;
2191a2acd00eSAlex Elder 
2192a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2193a2acd00eSAlex Elder 		return false;
2194a2acd00eSAlex Elder 
2195ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2196ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2197a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2198ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2199a2acd00eSAlex Elder 
2200a2acd00eSAlex Elder 	if (counter < 0)
22019584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2202a2acd00eSAlex Elder 
2203ae43e9d0SIlya Dryomov 	return counter > 0;
2204a2acd00eSAlex Elder }
2205a2acd00eSAlex Elder 
2206bf0d5f50SAlex Elder /*
2207bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2208bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2209bf0d5f50SAlex Elder  * (if there is one).
2210bf0d5f50SAlex Elder  */
2211cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2212cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2213bf0d5f50SAlex Elder 					u64 offset, u64 length,
22146d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
22154e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2216bf0d5f50SAlex Elder {
2217bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2218bf0d5f50SAlex Elder 
22197a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2220bf0d5f50SAlex Elder 	if (!img_request)
2221bf0d5f50SAlex Elder 		return NULL;
2222bf0d5f50SAlex Elder 
2223bf0d5f50SAlex Elder 	img_request->rq = NULL;
2224bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2225bf0d5f50SAlex Elder 	img_request->offset = offset;
2226bf0d5f50SAlex Elder 	img_request->length = length;
22270c425248SAlex Elder 	img_request->flags = 0;
222890e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
222990e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
223090e98c52SGuangliang Zhao 		img_request->snapc = snapc;
223190e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
22320c425248SAlex Elder 		img_request_write_set(img_request);
22334e752f0aSJosh Durgin 		img_request->snapc = snapc;
22340c425248SAlex Elder 	} else {
2235bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
22360c425248SAlex Elder 	}
2237a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2238d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2239bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2240bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2241bf0d5f50SAlex Elder 	img_request->callback = NULL;
2242a5a337d4SAlex Elder 	img_request->result = 0;
2243bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2244bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2245bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2246bf0d5f50SAlex Elder 
224737206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
22486d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
224937206ee5SAlex Elder 
2250bf0d5f50SAlex Elder 	return img_request;
2251bf0d5f50SAlex Elder }
2252bf0d5f50SAlex Elder 
2253bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2254bf0d5f50SAlex Elder {
2255bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2256bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2257bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2258bf0d5f50SAlex Elder 
2259bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2260bf0d5f50SAlex Elder 
226137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
226237206ee5SAlex Elder 
2263bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2264bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
226525dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2266bf0d5f50SAlex Elder 
2267a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2268a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2269a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2270a2acd00eSAlex Elder 	}
2271a2acd00eSAlex Elder 
2272bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2273bef95455SJosh Durgin 		img_request_discard_test(img_request))
2274812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2275bf0d5f50SAlex Elder 
22761c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2277bf0d5f50SAlex Elder }
2278bf0d5f50SAlex Elder 
2279e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2280e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2281e93f3152SAlex Elder 					u64 img_offset, u64 length)
2282e93f3152SAlex Elder {
2283e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2284e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2285e93f3152SAlex Elder 
2286e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2287e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2288e93f3152SAlex Elder 
22894e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22906d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2291e93f3152SAlex Elder 	if (!parent_request)
2292e93f3152SAlex Elder 		return NULL;
2293e93f3152SAlex Elder 
2294e93f3152SAlex Elder 	img_request_child_set(parent_request);
2295e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2296e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2297e93f3152SAlex Elder 
2298e93f3152SAlex Elder 	return parent_request;
2299e93f3152SAlex Elder }
2300e93f3152SAlex Elder 
2301e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2302e93f3152SAlex Elder {
2303e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2304e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2305e93f3152SAlex Elder 
2306e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2307e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2308e93f3152SAlex Elder 
2309e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2310e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2311e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2312e93f3152SAlex Elder 
2313e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2314e93f3152SAlex Elder }
2315e93f3152SAlex Elder 
23161217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
23171217857fSAlex Elder {
23186365d33aSAlex Elder 	struct rbd_img_request *img_request;
23191217857fSAlex Elder 	unsigned int xferred;
23201217857fSAlex Elder 	int result;
23218b3e1a56SAlex Elder 	bool more;
23221217857fSAlex Elder 
23236365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23246365d33aSAlex Elder 	img_request = obj_request->img_request;
23256365d33aSAlex Elder 
23261217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
23271217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
23281217857fSAlex Elder 	result = obj_request->result;
23291217857fSAlex Elder 	if (result) {
23301217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
23316d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
23326d2940c8SGuangliang Zhao 
233390e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
233490e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
233590e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
233690e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
233790e98c52SGuangliang Zhao 		else
233890e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
23391217857fSAlex Elder 
23409584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
23416d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
23426d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
23439584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
23441217857fSAlex Elder 			result, xferred);
23451217857fSAlex Elder 		if (!img_request->result)
23461217857fSAlex Elder 			img_request->result = result;
2347082a75daSIlya Dryomov 		/*
2348082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2349082a75daSIlya Dryomov 		 * bytes in case of error.
2350082a75daSIlya Dryomov 		 */
2351082a75daSIlya Dryomov 		xferred = obj_request->length;
23521217857fSAlex Elder 	}
23531217857fSAlex Elder 
23548b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23558b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23568b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23578b3e1a56SAlex Elder 	} else {
23588b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23597ad18afaSChristoph Hellwig 
23607ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23617ad18afaSChristoph Hellwig 		if (!more)
23627ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23638b3e1a56SAlex Elder 	}
23648b3e1a56SAlex Elder 
23658b3e1a56SAlex Elder 	return more;
23661217857fSAlex Elder }
23671217857fSAlex Elder 
23682169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23692169238dSAlex Elder {
23702169238dSAlex Elder 	struct rbd_img_request *img_request;
23712169238dSAlex Elder 	u32 which = obj_request->which;
23722169238dSAlex Elder 	bool more = true;
23732169238dSAlex Elder 
23746365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23752169238dSAlex Elder 	img_request = obj_request->img_request;
23762169238dSAlex Elder 
23772169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23782169238dSAlex Elder 	rbd_assert(img_request != NULL);
23792169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23802169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23812169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23822169238dSAlex Elder 
23832169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23842169238dSAlex Elder 	if (which != img_request->next_completion)
23852169238dSAlex Elder 		goto out;
23862169238dSAlex Elder 
23872169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23882169238dSAlex Elder 		rbd_assert(more);
23892169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23902169238dSAlex Elder 
23912169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23922169238dSAlex Elder 			break;
23931217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23942169238dSAlex Elder 		which++;
23952169238dSAlex Elder 	}
23962169238dSAlex Elder 
23972169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23982169238dSAlex Elder 	img_request->next_completion = which;
23992169238dSAlex Elder out:
24002169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
24010f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
24022169238dSAlex Elder 
24032169238dSAlex Elder 	if (!more)
24042169238dSAlex Elder 		rbd_img_request_complete(img_request);
24052169238dSAlex Elder }
24062169238dSAlex Elder 
2407f1a4739fSAlex Elder /*
24083b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
24093b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
24103b434a2aSJosh Durgin  * osd operations already to the object request.
24113b434a2aSJosh Durgin  */
24123b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
24133b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
24143b434a2aSJosh Durgin 				enum obj_operation_type op_type,
24153b434a2aSJosh Durgin 				unsigned int num_ops)
24163b434a2aSJosh Durgin {
24173b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
24183b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
24193b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
24203b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
24213b434a2aSJosh Durgin 	u64 length = obj_request->length;
24223b434a2aSJosh Durgin 	u64 img_end;
24233b434a2aSJosh Durgin 	u16 opcode;
24243b434a2aSJosh Durgin 
24253b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2426d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2427d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2428d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
24293b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
24303b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
24313b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
24323b434a2aSJosh Durgin 		} else {
24333b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
24343b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
24353b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
24363b434a2aSJosh Durgin 
24373b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
24383b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
24393b434a2aSJosh Durgin 			else
24403b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
24413b434a2aSJosh Durgin 		}
24423b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2443e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2444e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2445e30b7577SIlya Dryomov 		else
24463b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
24473b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
24483b434a2aSJosh Durgin 					object_size, object_size);
24493b434a2aSJosh Durgin 		num_ops++;
24503b434a2aSJosh Durgin 	} else {
24513b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24523b434a2aSJosh Durgin 	}
24533b434a2aSJosh Durgin 
24547e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2455144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24567e868b6eSIlya Dryomov 	else
24577e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24587e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24597e868b6eSIlya Dryomov 
24603b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24613b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24623b434a2aSJosh Durgin 					obj_request->bio_list, length);
24633b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24643b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24653b434a2aSJosh Durgin 					obj_request->pages, length,
24663b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24673b434a2aSJosh Durgin 
24683b434a2aSJosh Durgin 	/* Discards are also writes */
24693b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24703b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24713b434a2aSJosh Durgin 	else
24723b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24733b434a2aSJosh Durgin }
24743b434a2aSJosh Durgin 
24753b434a2aSJosh Durgin /*
2476f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2477f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2478f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2479f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2480f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2481f1a4739fSAlex Elder  * all data described by the image request.
2482f1a4739fSAlex Elder  */
2483f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2484f1a4739fSAlex Elder 					enum obj_request_type type,
2485f1a4739fSAlex Elder 					void *data_desc)
2486bf0d5f50SAlex Elder {
2487bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2488bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2489bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2490a158073cSJingoo Han 	struct bio *bio_list = NULL;
2491f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2492a158073cSJingoo Han 	struct page **pages = NULL;
24936d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24947da22d29SAlex Elder 	u64 img_offset;
2495bf0d5f50SAlex Elder 	u64 resid;
2496bf0d5f50SAlex Elder 
2497f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2498f1a4739fSAlex Elder 		(int)type, data_desc);
249937206ee5SAlex Elder 
25007da22d29SAlex Elder 	img_offset = img_request->offset;
2501bf0d5f50SAlex Elder 	resid = img_request->length;
25024dda41d3SAlex Elder 	rbd_assert(resid > 0);
25033b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2504f1a4739fSAlex Elder 
2505f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2506f1a4739fSAlex Elder 		bio_list = data_desc;
25074f024f37SKent Overstreet 		rbd_assert(img_offset ==
25084f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
250990e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2510f1a4739fSAlex Elder 		pages = data_desc;
2511f1a4739fSAlex Elder 	}
2512f1a4739fSAlex Elder 
2513bf0d5f50SAlex Elder 	while (resid) {
25142fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2515bf0d5f50SAlex Elder 		const char *object_name;
251667e2b652SIlya Dryomov 		u64 offset = rbd_segment_offset(rbd_dev, img_offset);
251767e2b652SIlya Dryomov 		u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
2518bf0d5f50SAlex Elder 
25197da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2520bf0d5f50SAlex Elder 		if (!object_name)
2521bf0d5f50SAlex Elder 			goto out_unwind;
252267e2b652SIlya Dryomov 		obj_request = rbd_obj_request_create(object_name, type);
252378c2a44aSAlex Elder 		/* object request has its own copy of the object name */
252478c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2525bf0d5f50SAlex Elder 		if (!obj_request)
2526bf0d5f50SAlex Elder 			goto out_unwind;
252762054da6SIlya Dryomov 
252867e2b652SIlya Dryomov 		obj_request->offset = offset;
252967e2b652SIlya Dryomov 		obj_request->length = length;
253067e2b652SIlya Dryomov 
253103507db6SJosh Durgin 		/*
253203507db6SJosh Durgin 		 * set obj_request->img_request before creating the
253303507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
253403507db6SJosh Durgin 		 */
253503507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2536bf0d5f50SAlex Elder 
2537f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2538f1a4739fSAlex Elder 			unsigned int clone_size;
2539f1a4739fSAlex Elder 
2540bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2541bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2542f1a4739fSAlex Elder 			obj_request->bio_list =
2543f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2544f1a4739fSAlex Elder 								&bio_offset,
2545f1a4739fSAlex Elder 								clone_size,
25462224d879SDavid Disseldorp 								GFP_NOIO);
2547bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
254862054da6SIlya Dryomov 				goto out_unwind;
254990e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2550f1a4739fSAlex Elder 			unsigned int page_count;
2551f1a4739fSAlex Elder 
2552f1a4739fSAlex Elder 			obj_request->pages = pages;
2553f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2554f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2555f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2556f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2557f1a4739fSAlex Elder 			pages += page_count;
2558f1a4739fSAlex Elder 		}
2559bf0d5f50SAlex Elder 
25606d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25616d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25622fa12320SAlex Elder 					obj_request);
25632fa12320SAlex Elder 		if (!osd_req)
256462054da6SIlya Dryomov 			goto out_unwind;
25653b434a2aSJosh Durgin 
25662fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25672169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25687da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2569bf0d5f50SAlex Elder 
25703b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25713b434a2aSJosh Durgin 
25727da22d29SAlex Elder 		img_offset += length;
2573bf0d5f50SAlex Elder 		resid -= length;
2574bf0d5f50SAlex Elder 	}
2575bf0d5f50SAlex Elder 
2576bf0d5f50SAlex Elder 	return 0;
2577bf0d5f50SAlex Elder 
2578bf0d5f50SAlex Elder out_unwind:
2579bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
258042dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2581bf0d5f50SAlex Elder 
2582bf0d5f50SAlex Elder 	return -ENOMEM;
2583bf0d5f50SAlex Elder }
2584bf0d5f50SAlex Elder 
25853d7efd18SAlex Elder static void
25862761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25870eefd470SAlex Elder {
25880eefd470SAlex Elder 	struct rbd_img_request *img_request;
25890eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2590ebda6408SAlex Elder 	struct page **pages;
25910eefd470SAlex Elder 	u32 page_count;
25920eefd470SAlex Elder 
25932761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25942761713dSIlya Dryomov 
2595d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2596d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25970eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25980eefd470SAlex Elder 	img_request = obj_request->img_request;
25990eefd470SAlex Elder 	rbd_assert(img_request);
26000eefd470SAlex Elder 
26010eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
26020eefd470SAlex Elder 	rbd_assert(rbd_dev);
26030eefd470SAlex Elder 
2604ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2605ebda6408SAlex Elder 	rbd_assert(pages != NULL);
26060eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2607ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2608ebda6408SAlex Elder 	rbd_assert(page_count);
2609ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2610ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
26110eefd470SAlex Elder 
26120eefd470SAlex Elder 	/*
26130eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
26140eefd470SAlex Elder 	 * original write request.  There is no such thing as a
26150eefd470SAlex Elder 	 * successful short write, so if the request was successful
26160eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
26170eefd470SAlex Elder 	 */
26180eefd470SAlex Elder 	if (!obj_request->result)
26190eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
26200eefd470SAlex Elder 
26212761713dSIlya Dryomov 	obj_request_done_set(obj_request);
26220eefd470SAlex Elder }
26230eefd470SAlex Elder 
26240eefd470SAlex Elder static void
26253d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
26263d7efd18SAlex Elder {
26273d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
26280eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
26290eefd470SAlex Elder 	struct rbd_device *rbd_dev;
26303d7efd18SAlex Elder 	struct page **pages;
2631d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2632ebda6408SAlex Elder 	u32 page_count;
2633bbea1c1aSAlex Elder 	int img_result;
2634ebda6408SAlex Elder 	u64 parent_length;
26353d7efd18SAlex Elder 
26363d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
26373d7efd18SAlex Elder 
26383d7efd18SAlex Elder 	/* First get what we need from the image request */
26393d7efd18SAlex Elder 
26403d7efd18SAlex Elder 	pages = img_request->copyup_pages;
26413d7efd18SAlex Elder 	rbd_assert(pages != NULL);
26423d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2643ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2644ebda6408SAlex Elder 	rbd_assert(page_count);
2645ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
26463d7efd18SAlex Elder 
26473d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26483d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2649b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2650bbea1c1aSAlex Elder 	img_result = img_request->result;
2651ebda6408SAlex Elder 	parent_length = img_request->length;
2652fa355112SIlya Dryomov 	rbd_assert(img_result || parent_length == img_request->xferred);
26533d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26543d7efd18SAlex Elder 
265591c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
265691c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26573d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26583d7efd18SAlex Elder 
2659bbea1c1aSAlex Elder 	/*
2660bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2661bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2662bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2663bbea1c1aSAlex Elder 	 */
2664bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2665bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2666980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2667bbea1c1aSAlex Elder 		return;
2668bbea1c1aSAlex Elder 	}
2669bbea1c1aSAlex Elder 
2670bbea1c1aSAlex Elder 	if (img_result)
26710eefd470SAlex Elder 		goto out_err;
26723d7efd18SAlex Elder 
26738785b1d4SAlex Elder 	/*
26748785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26750ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26768785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26778785b1d4SAlex Elder 	 * original request, and release the old one.
26788785b1d4SAlex Elder 	 */
2679bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26800eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26810eefd470SAlex Elder 	if (!osd_req)
26820eefd470SAlex Elder 		goto out_err;
26838785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26840eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26850eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2686ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26873d7efd18SAlex Elder 
26880eefd470SAlex Elder 	/* Initialize the copyup op */
26890eefd470SAlex Elder 
26900eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2691ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26920eefd470SAlex Elder 						false, false);
26930eefd470SAlex Elder 
2694d3246fb0SJosh Durgin 	/* Add the other op(s) */
26950ccd5926SIlya Dryomov 
2696d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2697d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26980eefd470SAlex Elder 
26990eefd470SAlex Elder 	/* All set, send it off. */
27000eefd470SAlex Elder 
2701980917fcSIlya Dryomov 	rbd_obj_request_submit(orig_request);
27020eefd470SAlex Elder 	return;
27030eefd470SAlex Elder 
27040eefd470SAlex Elder out_err:
2705fa355112SIlya Dryomov 	ceph_release_page_vector(pages, page_count);
27060dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, img_result);
27073d7efd18SAlex Elder }
27083d7efd18SAlex Elder 
27093d7efd18SAlex Elder /*
27103d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
27113d7efd18SAlex Elder  * entire target of the given object request.  This is used for
27123d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
27133d7efd18SAlex Elder  * object request from the image request does not exist.
27143d7efd18SAlex Elder  *
27153d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
27163d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
27173d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
27183d7efd18SAlex Elder  * the original object request for the copyup operation.
27193d7efd18SAlex Elder  *
2720c2e82414SIlya Dryomov  * If an error occurs, it is recorded as the result of the original
2721c2e82414SIlya Dryomov  * object request in rbd_img_obj_exists_callback().
27223d7efd18SAlex Elder  */
27233d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
27243d7efd18SAlex Elder {
2725058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
27263d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
27273d7efd18SAlex Elder 	u64 img_offset;
27283d7efd18SAlex Elder 	u64 length;
27293d7efd18SAlex Elder 	struct page **pages = NULL;
27303d7efd18SAlex Elder 	u32 page_count;
27313d7efd18SAlex Elder 	int result;
27323d7efd18SAlex Elder 
27333d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27343d7efd18SAlex Elder 
27353d7efd18SAlex Elder 	/*
27363d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27373d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27383d7efd18SAlex Elder 	 */
27393d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27405bc3fb17SIlya Dryomov 	length = rbd_obj_bytes(&rbd_dev->header);
27413d7efd18SAlex Elder 
27423d7efd18SAlex Elder 	/*
2743a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2744a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2745a9e8ba2cSAlex Elder 	 * necessary.
2746a9e8ba2cSAlex Elder 	 */
2747a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2748a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2749a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2750a9e8ba2cSAlex Elder 	}
2751a9e8ba2cSAlex Elder 
2752a9e8ba2cSAlex Elder 	/*
27533d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27543d7efd18SAlex Elder 	 * from the parent.
27553d7efd18SAlex Elder 	 */
27563d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27573d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27583d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27593d7efd18SAlex Elder 		result = PTR_ERR(pages);
27603d7efd18SAlex Elder 		pages = NULL;
27613d7efd18SAlex Elder 		goto out_err;
27623d7efd18SAlex Elder 	}
27633d7efd18SAlex Elder 
27643d7efd18SAlex Elder 	result = -ENOMEM;
2765e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2766e93f3152SAlex Elder 						img_offset, length);
27673d7efd18SAlex Elder 	if (!parent_request)
27683d7efd18SAlex Elder 		goto out_err;
27693d7efd18SAlex Elder 
27703d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27713d7efd18SAlex Elder 	if (result)
27723d7efd18SAlex Elder 		goto out_err;
2773058aa991SIlya Dryomov 
27743d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2775ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27763d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
2777058aa991SIlya Dryomov 
27783d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27793d7efd18SAlex Elder 	if (!result)
27803d7efd18SAlex Elder 		return 0;
27813d7efd18SAlex Elder 
27823d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2783ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27843d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27853d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27863d7efd18SAlex Elder out_err:
27873d7efd18SAlex Elder 	if (pages)
27883d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27893d7efd18SAlex Elder 	if (parent_request)
27903d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27913d7efd18SAlex Elder 	return result;
27923d7efd18SAlex Elder }
27933d7efd18SAlex Elder 
2794c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2795c5b5ef6cSAlex Elder {
2796c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2797638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2798c5b5ef6cSAlex Elder 	int result;
2799c5b5ef6cSAlex Elder 
2800c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2801c5b5ef6cSAlex Elder 
2802c5b5ef6cSAlex Elder 	/*
2803c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2804c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2805c5b5ef6cSAlex Elder 	 * we're done with the request.
2806c5b5ef6cSAlex Elder 	 */
2807c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2808c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2809912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2810c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2811c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2812c5b5ef6cSAlex Elder 
2813c5b5ef6cSAlex Elder 	result = obj_request->result;
2814c5b5ef6cSAlex Elder 	obj_request->result = 0;
2815c5b5ef6cSAlex Elder 
2816c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2817c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2818c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2819c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2820c5b5ef6cSAlex Elder 
2821638f5abeSAlex Elder 	/*
2822638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2823980917fcSIlya Dryomov 	 * image has been flattened) we need to re-submit the
2824980917fcSIlya Dryomov 	 * original request.
2825638f5abeSAlex Elder 	 */
2826638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2827638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2828980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2829638f5abeSAlex Elder 		return;
2830638f5abeSAlex Elder 	}
2831c5b5ef6cSAlex Elder 
2832c5b5ef6cSAlex Elder 	/*
2833c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2834c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2835c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2836c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2837c5b5ef6cSAlex Elder 	 */
2838c5b5ef6cSAlex Elder 	if (!result) {
2839c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2840c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2841c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2842c2e82414SIlya Dryomov 	} else {
2843c2e82414SIlya Dryomov 		goto fail_orig_request;
2844c5b5ef6cSAlex Elder 	}
2845c5b5ef6cSAlex Elder 
2846c5b5ef6cSAlex Elder 	/*
2847c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2848c5b5ef6cSAlex Elder 	 * whether the target object exists.
2849c5b5ef6cSAlex Elder 	 */
2850c2e82414SIlya Dryomov 	result = rbd_img_obj_request_submit(orig_request);
2851c2e82414SIlya Dryomov 	if (result)
2852c2e82414SIlya Dryomov 		goto fail_orig_request;
2853c2e82414SIlya Dryomov 
2854c2e82414SIlya Dryomov 	return;
2855c2e82414SIlya Dryomov 
2856c2e82414SIlya Dryomov fail_orig_request:
28570dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, result);
2858c5b5ef6cSAlex Elder }
2859c5b5ef6cSAlex Elder 
2860c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2861c5b5ef6cSAlex Elder {
2862058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
2863c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2864710214e3SIlya Dryomov 	struct page **pages;
2865c5b5ef6cSAlex Elder 	u32 page_count;
2866c5b5ef6cSAlex Elder 	size_t size;
2867c5b5ef6cSAlex Elder 	int ret;
2868c5b5ef6cSAlex Elder 
286967e2b652SIlya Dryomov 	stat_request = rbd_obj_request_create(obj_request->object_name,
2870710214e3SIlya Dryomov 					      OBJ_REQUEST_PAGES);
2871710214e3SIlya Dryomov 	if (!stat_request)
2872710214e3SIlya Dryomov 		return -ENOMEM;
2873710214e3SIlya Dryomov 
2874710214e3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2875710214e3SIlya Dryomov 						   stat_request);
2876710214e3SIlya Dryomov 	if (!stat_request->osd_req) {
2877710214e3SIlya Dryomov 		ret = -ENOMEM;
2878710214e3SIlya Dryomov 		goto fail_stat_request;
2879710214e3SIlya Dryomov 	}
2880710214e3SIlya Dryomov 
2881c5b5ef6cSAlex Elder 	/*
2882c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2883c5b5ef6cSAlex Elder 	 *     le64 length;
2884c5b5ef6cSAlex Elder 	 *     struct {
2885c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2886c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2887c5b5ef6cSAlex Elder 	 *     } mtime;
2888c5b5ef6cSAlex Elder 	 */
2889c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2890c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2891c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2892710214e3SIlya Dryomov 	if (IS_ERR(pages)) {
2893710214e3SIlya Dryomov 		ret = PTR_ERR(pages);
2894710214e3SIlya Dryomov 		goto fail_stat_request;
2895710214e3SIlya Dryomov 	}
2896c5b5ef6cSAlex Elder 
2897710214e3SIlya Dryomov 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2898710214e3SIlya Dryomov 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2899710214e3SIlya Dryomov 				     false, false);
2900c5b5ef6cSAlex Elder 
2901c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2902c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2903c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2904c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2905c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2906c5b5ef6cSAlex Elder 
2907980917fcSIlya Dryomov 	rbd_obj_request_submit(stat_request);
2908980917fcSIlya Dryomov 	return 0;
2909c5b5ef6cSAlex Elder 
2910710214e3SIlya Dryomov fail_stat_request:
2911710214e3SIlya Dryomov 	rbd_obj_request_put(stat_request);
2912c5b5ef6cSAlex Elder 	return ret;
2913c5b5ef6cSAlex Elder }
2914c5b5ef6cSAlex Elder 
291570d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2916b454e36dSAlex Elder {
2917058aa991SIlya Dryomov 	struct rbd_img_request *img_request = obj_request->img_request;
2918058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2919b454e36dSAlex Elder 
292070d045f6SIlya Dryomov 	/* Reads */
29211c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29221c220881SJosh Durgin 	    !img_request_discard_test(img_request))
292370d045f6SIlya Dryomov 		return true;
2924b454e36dSAlex Elder 
292570d045f6SIlya Dryomov 	/* Non-layered writes */
292670d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
292770d045f6SIlya Dryomov 		return true;
292870d045f6SIlya Dryomov 
292970d045f6SIlya Dryomov 	/*
293070d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
293170d045f6SIlya Dryomov 	 * share any data with the parent.
293270d045f6SIlya Dryomov 	 */
293370d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
293470d045f6SIlya Dryomov 		return true;
293570d045f6SIlya Dryomov 
293670d045f6SIlya Dryomov 	/*
2937c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2938c622d226SGuangliang Zhao 	 * parent data there is anyway.
2939c622d226SGuangliang Zhao 	 */
2940c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2941c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2942c622d226SGuangliang Zhao 		return true;
2943c622d226SGuangliang Zhao 
2944c622d226SGuangliang Zhao 	/*
294570d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
294670d045f6SIlya Dryomov 	 * already been copied.
294770d045f6SIlya Dryomov 	 */
294870d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
294970d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
295070d045f6SIlya Dryomov 		return true;
295170d045f6SIlya Dryomov 
295270d045f6SIlya Dryomov 	return false;
295370d045f6SIlya Dryomov }
295470d045f6SIlya Dryomov 
295570d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
295670d045f6SIlya Dryomov {
2957058aa991SIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
2958058aa991SIlya Dryomov 	rbd_assert(obj_request_type_valid(obj_request->type));
2959058aa991SIlya Dryomov 	rbd_assert(obj_request->img_request);
2960058aa991SIlya Dryomov 
296170d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2962980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2963980917fcSIlya Dryomov 		return 0;
2964b454e36dSAlex Elder 	}
2965b454e36dSAlex Elder 
2966b454e36dSAlex Elder 	/*
29673d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29683d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29693d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29703d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2971b454e36dSAlex Elder 	 */
297270d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29733d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29743d7efd18SAlex Elder 
29753d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2976b454e36dSAlex Elder 
2977b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2978b454e36dSAlex Elder }
2979b454e36dSAlex Elder 
2980bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2981bf0d5f50SAlex Elder {
2982bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
298346faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2984663ae2ccSIlya Dryomov 	int ret = 0;
2985bf0d5f50SAlex Elder 
298637206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2987bf0d5f50SAlex Elder 
2988663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2989663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2990b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2991bf0d5f50SAlex Elder 		if (ret)
2992663ae2ccSIlya Dryomov 			goto out_put_ireq;
2993bf0d5f50SAlex Elder 	}
2994bf0d5f50SAlex Elder 
2995663ae2ccSIlya Dryomov out_put_ireq:
2996663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2997663ae2ccSIlya Dryomov 	return ret;
2998bf0d5f50SAlex Elder }
2999bf0d5f50SAlex Elder 
30008b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
30018b3e1a56SAlex Elder {
30028b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
3003a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
3004a9e8ba2cSAlex Elder 	u64 obj_end;
300502c74fbaSAlex Elder 	u64 img_xferred;
300602c74fbaSAlex Elder 	int img_result;
30078b3e1a56SAlex Elder 
30088b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
30098b3e1a56SAlex Elder 
301002c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
301102c74fbaSAlex Elder 
30128b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
301302c74fbaSAlex Elder 	img_xferred = img_request->xferred;
301402c74fbaSAlex Elder 	img_result = img_request->result;
301502c74fbaSAlex Elder 	rbd_img_request_put(img_request);
301602c74fbaSAlex Elder 
301702c74fbaSAlex Elder 	/*
301802c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
301902c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
302002c74fbaSAlex Elder 	 * original request.
302102c74fbaSAlex Elder 	 */
3022a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3023a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
302402c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
302502c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
3026980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
302702c74fbaSAlex Elder 		return;
302802c74fbaSAlex Elder 	}
302902c74fbaSAlex Elder 
303002c74fbaSAlex Elder 	obj_request->result = img_result;
3031a9e8ba2cSAlex Elder 	if (obj_request->result)
3032a9e8ba2cSAlex Elder 		goto out;
3033a9e8ba2cSAlex Elder 
3034a9e8ba2cSAlex Elder 	/*
3035a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3036a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3037a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3038a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3039a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3040a9e8ba2cSAlex Elder 	 */
3041a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3042a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3043a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3044a9e8ba2cSAlex Elder 		u64 xferred = 0;
3045a9e8ba2cSAlex Elder 
3046a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3047a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3048a9e8ba2cSAlex Elder 					obj_request->img_offset;
3049a9e8ba2cSAlex Elder 
305002c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3051a9e8ba2cSAlex Elder 	} else {
305202c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3053a9e8ba2cSAlex Elder 	}
3054a9e8ba2cSAlex Elder out:
30558b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30568b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30578b3e1a56SAlex Elder }
30588b3e1a56SAlex Elder 
30598b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30608b3e1a56SAlex Elder {
30618b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30628b3e1a56SAlex Elder 	int result;
30638b3e1a56SAlex Elder 
30648b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30658b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30668b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30675b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30688b3e1a56SAlex Elder 
30698b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3070e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30718b3e1a56SAlex Elder 						obj_request->img_offset,
3072e93f3152SAlex Elder 						obj_request->length);
30738b3e1a56SAlex Elder 	result = -ENOMEM;
30748b3e1a56SAlex Elder 	if (!img_request)
30758b3e1a56SAlex Elder 		goto out_err;
30768b3e1a56SAlex Elder 
30775b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3078f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3079f1a4739fSAlex Elder 						obj_request->bio_list);
30805b2ab72dSAlex Elder 	else
30815b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30825b2ab72dSAlex Elder 						obj_request->pages);
30838b3e1a56SAlex Elder 	if (result)
30848b3e1a56SAlex Elder 		goto out_err;
30858b3e1a56SAlex Elder 
30868b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30878b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30888b3e1a56SAlex Elder 	if (result)
30898b3e1a56SAlex Elder 		goto out_err;
30908b3e1a56SAlex Elder 
30918b3e1a56SAlex Elder 	return;
30928b3e1a56SAlex Elder out_err:
30938b3e1a56SAlex Elder 	if (img_request)
30948b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30958b3e1a56SAlex Elder 	obj_request->result = result;
30968b3e1a56SAlex Elder 	obj_request->xferred = 0;
30978b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30988b3e1a56SAlex Elder }
30998b3e1a56SAlex Elder 
3100ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3101ed95b21aSIlya Dryomov 
3102ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3103ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3104ed95b21aSIlya Dryomov {
3105ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3106ed95b21aSIlya Dryomov }
3107ed95b21aSIlya Dryomov 
3108ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3109ed95b21aSIlya Dryomov {
3110ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3111ed95b21aSIlya Dryomov 
3112ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3113ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3114ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3115ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3116ed95b21aSIlya Dryomov 	return cid;
3117ed95b21aSIlya Dryomov }
3118ed95b21aSIlya Dryomov 
3119ed95b21aSIlya Dryomov /*
3120ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3121ed95b21aSIlya Dryomov  */
3122ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3123ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3124ed95b21aSIlya Dryomov {
3125ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3126ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3127ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3128ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3129ed95b21aSIlya Dryomov }
3130ed95b21aSIlya Dryomov 
3131ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3132ed95b21aSIlya Dryomov {
3133ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3134ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3135ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3136ed95b21aSIlya Dryomov }
3137ed95b21aSIlya Dryomov 
3138ed95b21aSIlya Dryomov /*
3139ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3140ed95b21aSIlya Dryomov  */
3141ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3142ed95b21aSIlya Dryomov {
3143ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3144ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3145ed95b21aSIlya Dryomov 	char cookie[32];
3146ed95b21aSIlya Dryomov 	int ret;
3147ed95b21aSIlya Dryomov 
3148ed95b21aSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev));
3149ed95b21aSIlya Dryomov 
3150ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3151ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3152ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3153ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3154ed95b21aSIlya Dryomov 	if (ret)
3155ed95b21aSIlya Dryomov 		return ret;
3156ed95b21aSIlya Dryomov 
3157ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3158ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &cid);
3159ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3160ed95b21aSIlya Dryomov 	return 0;
3161ed95b21aSIlya Dryomov }
3162ed95b21aSIlya Dryomov 
3163ed95b21aSIlya Dryomov /*
3164ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3165ed95b21aSIlya Dryomov  */
3166ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev)
3167ed95b21aSIlya Dryomov {
3168ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3169ed95b21aSIlya Dryomov 	char cookie[32];
3170ed95b21aSIlya Dryomov 	int ret;
3171ed95b21aSIlya Dryomov 
3172ed95b21aSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3173ed95b21aSIlya Dryomov 
3174ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3175ed95b21aSIlya Dryomov 
3176ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3177ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3178ed95b21aSIlya Dryomov 			      RBD_LOCK_NAME, cookie);
3179ed95b21aSIlya Dryomov 	if (ret && ret != -ENOENT) {
3180ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3181ed95b21aSIlya Dryomov 		return ret;
3182ed95b21aSIlya Dryomov 	}
3183ed95b21aSIlya Dryomov 
3184ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3185ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3186ed95b21aSIlya Dryomov 	return 0;
3187ed95b21aSIlya Dryomov }
3188ed95b21aSIlya Dryomov 
3189ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3190ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3191ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3192ed95b21aSIlya Dryomov 				size_t *preply_len)
3193ed95b21aSIlya Dryomov {
3194ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3195ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3196ed95b21aSIlya Dryomov 	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3197ed95b21aSIlya Dryomov 	char buf[buf_size];
3198ed95b21aSIlya Dryomov 	void *p = buf;
3199ed95b21aSIlya Dryomov 
3200ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3201ed95b21aSIlya Dryomov 
3202ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3203ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3204ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3205ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3206ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3207ed95b21aSIlya Dryomov 
3208ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3209ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3210ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3211ed95b21aSIlya Dryomov }
3212ed95b21aSIlya Dryomov 
3213ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3214ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3215ed95b21aSIlya Dryomov {
3216ed95b21aSIlya Dryomov 	struct page **reply_pages;
3217ed95b21aSIlya Dryomov 	size_t reply_len;
3218ed95b21aSIlya Dryomov 
3219ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3220ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3221ed95b21aSIlya Dryomov }
3222ed95b21aSIlya Dryomov 
3223ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3224ed95b21aSIlya Dryomov {
3225ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3226ed95b21aSIlya Dryomov 						  acquired_lock_work);
3227ed95b21aSIlya Dryomov 
3228ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3229ed95b21aSIlya Dryomov }
3230ed95b21aSIlya Dryomov 
3231ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3232ed95b21aSIlya Dryomov {
3233ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3234ed95b21aSIlya Dryomov 						  released_lock_work);
3235ed95b21aSIlya Dryomov 
3236ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3237ed95b21aSIlya Dryomov }
3238ed95b21aSIlya Dryomov 
3239ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3240ed95b21aSIlya Dryomov {
3241ed95b21aSIlya Dryomov 	struct page **reply_pages;
3242ed95b21aSIlya Dryomov 	size_t reply_len;
3243ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3244ed95b21aSIlya Dryomov 	int ret;
3245ed95b21aSIlya Dryomov 
3246ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3247ed95b21aSIlya Dryomov 
3248ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3249ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3250ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3251ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3252ed95b21aSIlya Dryomov 		goto out;
3253ed95b21aSIlya Dryomov 	}
3254ed95b21aSIlya Dryomov 
3255ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3256ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3257ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3258ed95b21aSIlya Dryomov 		u32 n;
3259ed95b21aSIlya Dryomov 
3260ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3261ed95b21aSIlya Dryomov 		while (n--) {
3262ed95b21aSIlya Dryomov 			u8 struct_v;
3263ed95b21aSIlya Dryomov 			u32 len;
3264ed95b21aSIlya Dryomov 
3265ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3266ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3267ed95b21aSIlya Dryomov 
3268ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3269ed95b21aSIlya Dryomov 			if (!len)
3270ed95b21aSIlya Dryomov 				continue;
3271ed95b21aSIlya Dryomov 
3272ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3273ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3274ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3275ed95b21aSIlya Dryomov 				ret = -EIO;
3276ed95b21aSIlya Dryomov 				goto out;
3277ed95b21aSIlya Dryomov 			}
3278ed95b21aSIlya Dryomov 
3279ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3280ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3281ed95b21aSIlya Dryomov 						  &struct_v, &len);
3282ed95b21aSIlya Dryomov 			if (ret) {
3283ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3284ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3285ed95b21aSIlya Dryomov 					 ret);
3286ed95b21aSIlya Dryomov 				goto e_inval;
3287ed95b21aSIlya Dryomov 			}
3288ed95b21aSIlya Dryomov 
3289ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3290ed95b21aSIlya Dryomov 		}
3291ed95b21aSIlya Dryomov 	}
3292ed95b21aSIlya Dryomov 
3293ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3294ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3295ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3296ed95b21aSIlya Dryomov 	}
3297ed95b21aSIlya Dryomov 
3298ed95b21aSIlya Dryomov out:
3299ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3300ed95b21aSIlya Dryomov 	return ret;
3301ed95b21aSIlya Dryomov 
3302ed95b21aSIlya Dryomov e_inval:
3303ed95b21aSIlya Dryomov 	ret = -EINVAL;
3304ed95b21aSIlya Dryomov 	goto out;
3305ed95b21aSIlya Dryomov }
3306ed95b21aSIlya Dryomov 
3307ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3308ed95b21aSIlya Dryomov {
3309ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3310ed95b21aSIlya Dryomov 
3311ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3312ed95b21aSIlya Dryomov 	if (wake_all)
3313ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3314ed95b21aSIlya Dryomov 	else
3315ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3316ed95b21aSIlya Dryomov }
3317ed95b21aSIlya Dryomov 
3318ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3319ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3320ed95b21aSIlya Dryomov {
3321ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3322ed95b21aSIlya Dryomov 	u8 lock_type;
3323ed95b21aSIlya Dryomov 	char *lock_tag;
3324ed95b21aSIlya Dryomov 	int ret;
3325ed95b21aSIlya Dryomov 
3326ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3327ed95b21aSIlya Dryomov 
3328ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3329ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3330ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3331ed95b21aSIlya Dryomov 	if (ret)
3332ed95b21aSIlya Dryomov 		return ret;
3333ed95b21aSIlya Dryomov 
3334ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3335ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3336ed95b21aSIlya Dryomov 		goto out;
3337ed95b21aSIlya Dryomov 	}
3338ed95b21aSIlya Dryomov 
3339ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3340ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3341ed95b21aSIlya Dryomov 			 lock_tag);
3342ed95b21aSIlya Dryomov 		ret = -EBUSY;
3343ed95b21aSIlya Dryomov 		goto out;
3344ed95b21aSIlya Dryomov 	}
3345ed95b21aSIlya Dryomov 
3346ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3347ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3348ed95b21aSIlya Dryomov 		ret = -EBUSY;
3349ed95b21aSIlya Dryomov 		goto out;
3350ed95b21aSIlya Dryomov 	}
3351ed95b21aSIlya Dryomov 
3352ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3353ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3354ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3355ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3356ed95b21aSIlya Dryomov 		ret = -EBUSY;
3357ed95b21aSIlya Dryomov 		goto out;
3358ed95b21aSIlya Dryomov 	}
3359ed95b21aSIlya Dryomov 
3360ed95b21aSIlya Dryomov out:
3361ed95b21aSIlya Dryomov 	kfree(lock_tag);
3362ed95b21aSIlya Dryomov 	return ret;
3363ed95b21aSIlya Dryomov }
3364ed95b21aSIlya Dryomov 
3365ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3366ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3367ed95b21aSIlya Dryomov {
3368ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3369ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3370ed95b21aSIlya Dryomov 	u32 num_watchers;
3371ed95b21aSIlya Dryomov 	u64 cookie;
3372ed95b21aSIlya Dryomov 	int i;
3373ed95b21aSIlya Dryomov 	int ret;
3374ed95b21aSIlya Dryomov 
3375ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3376ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3377ed95b21aSIlya Dryomov 				      &num_watchers);
3378ed95b21aSIlya Dryomov 	if (ret)
3379ed95b21aSIlya Dryomov 		return ret;
3380ed95b21aSIlya Dryomov 
3381ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3382ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3383ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3384ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3385ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3386ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3387ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3388ed95b21aSIlya Dryomov 				.handle = cookie,
3389ed95b21aSIlya Dryomov 			};
3390ed95b21aSIlya Dryomov 
3391ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3392ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3393ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3394ed95b21aSIlya Dryomov 			ret = 1;
3395ed95b21aSIlya Dryomov 			goto out;
3396ed95b21aSIlya Dryomov 		}
3397ed95b21aSIlya Dryomov 	}
3398ed95b21aSIlya Dryomov 
3399ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3400ed95b21aSIlya Dryomov 	ret = 0;
3401ed95b21aSIlya Dryomov out:
3402ed95b21aSIlya Dryomov 	kfree(watchers);
3403ed95b21aSIlya Dryomov 	return ret;
3404ed95b21aSIlya Dryomov }
3405ed95b21aSIlya Dryomov 
3406ed95b21aSIlya Dryomov /*
3407ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3408ed95b21aSIlya Dryomov  */
3409ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3410ed95b21aSIlya Dryomov {
3411ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3412ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3413ed95b21aSIlya Dryomov 	u32 num_lockers;
3414ed95b21aSIlya Dryomov 	int ret;
3415ed95b21aSIlya Dryomov 
3416ed95b21aSIlya Dryomov 	for (;;) {
3417ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3418ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3419ed95b21aSIlya Dryomov 			return ret;
3420ed95b21aSIlya Dryomov 
3421ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3422ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3423ed95b21aSIlya Dryomov 		if (ret)
3424ed95b21aSIlya Dryomov 			return ret;
3425ed95b21aSIlya Dryomov 
3426ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3427ed95b21aSIlya Dryomov 			goto again;
3428ed95b21aSIlya Dryomov 
3429ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3430ed95b21aSIlya Dryomov 		if (ret) {
3431ed95b21aSIlya Dryomov 			if (ret > 0)
3432ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3433ed95b21aSIlya Dryomov 			goto out;
3434ed95b21aSIlya Dryomov 		}
3435ed95b21aSIlya Dryomov 
3436ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3437ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3438ed95b21aSIlya Dryomov 
3439ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3440ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3441ed95b21aSIlya Dryomov 		if (ret) {
3442ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3443ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3444ed95b21aSIlya Dryomov 			goto out;
3445ed95b21aSIlya Dryomov 		}
3446ed95b21aSIlya Dryomov 
3447ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3448ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3449ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3450ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3451ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3452ed95b21aSIlya Dryomov 			goto out;
3453ed95b21aSIlya Dryomov 
3454ed95b21aSIlya Dryomov again:
3455ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3456ed95b21aSIlya Dryomov 	}
3457ed95b21aSIlya Dryomov 
3458ed95b21aSIlya Dryomov out:
3459ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3460ed95b21aSIlya Dryomov 	return ret;
3461ed95b21aSIlya Dryomov }
3462ed95b21aSIlya Dryomov 
3463ed95b21aSIlya Dryomov /*
3464ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3465ed95b21aSIlya Dryomov  */
3466ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3467ed95b21aSIlya Dryomov 						int *pret)
3468ed95b21aSIlya Dryomov {
3469ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3470ed95b21aSIlya Dryomov 
3471ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3472ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3473ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3474ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3475ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3476ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3477ed95b21aSIlya Dryomov 		return lock_state;
3478ed95b21aSIlya Dryomov 	}
3479ed95b21aSIlya Dryomov 
3480ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3481ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3482ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3483ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3484ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3485ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3486ed95b21aSIlya Dryomov 		if (*pret)
3487ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3488ed95b21aSIlya Dryomov 	}
3489ed95b21aSIlya Dryomov 
3490ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3491ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3492ed95b21aSIlya Dryomov 	return lock_state;
3493ed95b21aSIlya Dryomov }
3494ed95b21aSIlya Dryomov 
3495ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3496ed95b21aSIlya Dryomov {
3497ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3498ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3499ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3500ed95b21aSIlya Dryomov 	int ret;
3501ed95b21aSIlya Dryomov 
3502ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3503ed95b21aSIlya Dryomov again:
3504ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3505ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3506ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3507ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3508ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3509ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3510ed95b21aSIlya Dryomov 		return;
3511ed95b21aSIlya Dryomov 	}
3512ed95b21aSIlya Dryomov 
3513ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3514ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3515ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3516ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3517ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3518ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3519ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3520ed95b21aSIlya Dryomov 	} else {
3521ed95b21aSIlya Dryomov 		/*
3522ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3523ed95b21aSIlya Dryomov 		 * release the lock
3524ed95b21aSIlya Dryomov 		 */
3525ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3526ed95b21aSIlya Dryomov 		     rbd_dev);
3527ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3528ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3529ed95b21aSIlya Dryomov 	}
3530ed95b21aSIlya Dryomov }
3531ed95b21aSIlya Dryomov 
/*
 * Release the exclusive lock if it is currently held.
 *
 * lock_rwsem must be held for write on entry and is held for write again
 * on every return path.  In between it is downgraded to read for the
 * duration of the flush, dropped, and re-taken for write, so lock_state
 * is re-checked after re-acquiring it.
 *
 * Returns false if the lock was not in the LOCKED state on entry, or if
 * the state was no longer RELEASING once the rwsem was re-taken.  Returns
 * true otherwise, regardless of whether rbd_unlock() succeeded.
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	/* mark the release in progress before letting readers back in */
	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	/* the rwsem was dropped above, so the state may have changed */
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	if (!rbd_unlock(rbd_dev))
		/*
		 * Give others a chance to grab the lock - we would re-acquire
		 * almost immediately if we got new IO during ceph_osdc_sync()
		 * otherwise.  We need to ack our own notifications, so this
		 * lock_dwork will be requeued from rbd_wait_state_locked()
		 * after wake_requests() in rbd_handle_released_lock().
		 */
		cancel_delayed_work(&rbd_dev->lock_dwork);

	return true;
}
3571ed95b21aSIlya Dryomov 
3572ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3573ed95b21aSIlya Dryomov {
3574ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3575ed95b21aSIlya Dryomov 						  unlock_work);
3576ed95b21aSIlya Dryomov 
3577ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3578ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3579ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3580ed95b21aSIlya Dryomov }
3581ed95b21aSIlya Dryomov 
/*
 * Handle an "acquired lock" notification.
 *
 * @struct_v: version of the enclosing NotifyMessage; the client id is
 *            decoded from the payload only for versions >= 2
 * @p:        decode cursor into the payload, advanced past the client id
 *
 * If the notification carries a client id that differs from the recorded
 * owner, owner_cid is updated.  Unless the sender was already known to be
 * the owner, waiting requests are then woken via wake_requests() when this
 * client does not hold the lock itself.
 */
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	/*
	 * NOTE(review): decoded without a bounds check; presumably the
	 * caller has validated the payload length - confirm.
	 */
	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		/* owner_cid is about to change: need the write side */
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		/* keep the rwsem, but only as a reader from here on */
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		/* no client id in the message: nothing to update */
		down_read(&rbd_dev->lock_rwsem);
	}

	/* both branches above leave lock_rwsem held for read */
	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}
3615ed95b21aSIlya Dryomov 
/*
 * Handle a "released lock" notification.
 *
 * @struct_v: version of the enclosing NotifyMessage; the client id is
 *            decoded from the payload only for versions >= 2
 * @p:        decode cursor into the payload, advanced past the client id
 *
 * If the notification carries a client id, it must match the recorded
 * owner; otherwise the notification is ignored.  On a match owner_cid is
 * cleared.  Waiting requests are then woken via wake_requests() when this
 * client does not hold the lock itself.
 */
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	/*
	 * NOTE(review): decoded without a bounds check; presumably the
	 * caller has validated the payload length - confirm.
	 */
	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		/* owner_cid may be cleared below: need the write side */
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/* release from someone we do not consider the owner */
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		/* keep the rwsem, but only as a reader from here on */
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		/* no client id in the message: nothing to update */
		down_read(&rbd_dev->lock_rwsem);
	}

	/* both branches above leave lock_rwsem held for read */
	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}
3648ed95b21aSIlya Dryomov 
3649ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3650ed95b21aSIlya Dryomov 				    void **p)
3651ed95b21aSIlya Dryomov {
3652ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3653ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3654ed95b21aSIlya Dryomov 	bool need_to_send;
3655ed95b21aSIlya Dryomov 
3656ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3657ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3658ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3659ed95b21aSIlya Dryomov 	}
3660ed95b21aSIlya Dryomov 
3661ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3662ed95b21aSIlya Dryomov 	     cid.handle);
3663ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
3664ed95b21aSIlya Dryomov 		return false;
3665ed95b21aSIlya Dryomov 
3666ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3667ed95b21aSIlya Dryomov 	need_to_send = __rbd_is_lock_owner(rbd_dev);
3668ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3669ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3670ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3671ed95b21aSIlya Dryomov 			     rbd_dev);
3672ed95b21aSIlya Dryomov 			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3673ed95b21aSIlya Dryomov 		}
3674ed95b21aSIlya Dryomov 	}
3675ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3676ed95b21aSIlya Dryomov 	return need_to_send;
3677ed95b21aSIlya Dryomov }
3678ed95b21aSIlya Dryomov 
3679ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3680ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3681ed95b21aSIlya Dryomov {
3682ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3683ed95b21aSIlya Dryomov 	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3684ed95b21aSIlya Dryomov 	char buf[buf_size];
3685ed95b21aSIlya Dryomov 	int ret;
3686ed95b21aSIlya Dryomov 
3687ed95b21aSIlya Dryomov 	if (result) {
3688ed95b21aSIlya Dryomov 		void *p = buf;
3689ed95b21aSIlya Dryomov 
3690ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3691ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3692ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3693ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3694ed95b21aSIlya Dryomov 	} else {
3695ed95b21aSIlya Dryomov 		buf_size = 0;
3696ed95b21aSIlya Dryomov 	}
3697ed95b21aSIlya Dryomov 
3698ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3699ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3700ed95b21aSIlya Dryomov 				   buf, buf_size);
3701ed95b21aSIlya Dryomov 	if (ret)
3702ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3703ed95b21aSIlya Dryomov }
3704ed95b21aSIlya Dryomov 
3705ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3706ed95b21aSIlya Dryomov 				   u64 cookie)
3707ed95b21aSIlya Dryomov {
3708ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3709ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3710ed95b21aSIlya Dryomov }
3711ed95b21aSIlya Dryomov 
3712ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3713ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3714ed95b21aSIlya Dryomov {
3715ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3716ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3717ed95b21aSIlya Dryomov }
3718922dab61SIlya Dryomov 
3719922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3720922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3721b8d70035SAlex Elder {
3722922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3723ed95b21aSIlya Dryomov 	void *p = data;
3724ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3725d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3726ed95b21aSIlya Dryomov 	u32 len;
3727ed95b21aSIlya Dryomov 	u32 notify_op;
3728b8d70035SAlex Elder 	int ret;
3729b8d70035SAlex Elder 
3730ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3731ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3732ed95b21aSIlya Dryomov 	if (data_len) {
3733ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3734ed95b21aSIlya Dryomov 					  &struct_v, &len);
3735ed95b21aSIlya Dryomov 		if (ret) {
3736ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3737ed95b21aSIlya Dryomov 				 ret);
3738ed95b21aSIlya Dryomov 			return;
3739ed95b21aSIlya Dryomov 		}
374052bb1f9bSIlya Dryomov 
3741ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3742ed95b21aSIlya Dryomov 	} else {
3743ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3744ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3745ed95b21aSIlya Dryomov 		len = 0;
3746ed95b21aSIlya Dryomov 	}
3747ed95b21aSIlya Dryomov 
3748ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3749ed95b21aSIlya Dryomov 	switch (notify_op) {
3750ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3751ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3752ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3753ed95b21aSIlya Dryomov 		break;
3754ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3755ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3756ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3757ed95b21aSIlya Dryomov 		break;
3758ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
3759ed95b21aSIlya Dryomov 		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
376052bb1f9bSIlya Dryomov 			/*
3761ed95b21aSIlya Dryomov 			 * send ResponseMessage(0) back so the client
3762ed95b21aSIlya Dryomov 			 * can detect a missing owner
376352bb1f9bSIlya Dryomov 			 */
3764ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3765ed95b21aSIlya Dryomov 						      cookie, 0);
3766ed95b21aSIlya Dryomov 		else
3767ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3768ed95b21aSIlya Dryomov 		break;
3769ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3770e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3771e627db08SAlex Elder 		if (ret)
37729584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3773b8d70035SAlex Elder 
3774ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3775ed95b21aSIlya Dryomov 		break;
3776ed95b21aSIlya Dryomov 	default:
3777ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3778ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3779ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3780ed95b21aSIlya Dryomov 		else
3781ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3782ed95b21aSIlya Dryomov 		break;
3783b8d70035SAlex Elder 	}
3784b8d70035SAlex Elder }
3785b8d70035SAlex Elder 
378699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
37879969ebc5SAlex Elder 
3788922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3789bb040aa0SIlya Dryomov {
3790922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3791bb040aa0SIlya Dryomov 
3792922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3793bb040aa0SIlya Dryomov 
3794ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3795ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3796ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3797bb040aa0SIlya Dryomov 
379899d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
379999d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
380099d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
380199d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3802bb040aa0SIlya Dryomov 
380399d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3804bb040aa0SIlya Dryomov 	}
380599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3806bb040aa0SIlya Dryomov }
3807bb040aa0SIlya Dryomov 
3808bb040aa0SIlya Dryomov /*
380999d16943SIlya Dryomov  * watch_mutex must be locked
38109969ebc5SAlex Elder  */
381199d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
38129969ebc5SAlex Elder {
38139969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3814922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
38159969ebc5SAlex Elder 
3816922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
381799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
38189969ebc5SAlex Elder 
3819922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3820922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3821922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3822922dab61SIlya Dryomov 	if (IS_ERR(handle))
3823922dab61SIlya Dryomov 		return PTR_ERR(handle);
38249969ebc5SAlex Elder 
3825922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
38268eb87565SAlex Elder 	return 0;
38279969ebc5SAlex Elder }
38289969ebc5SAlex Elder 
382999d16943SIlya Dryomov /*
383099d16943SIlya Dryomov  * watch_mutex must be locked
383199d16943SIlya Dryomov  */
383299d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3833fca27065SIlya Dryomov {
3834922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3835922dab61SIlya Dryomov 	int ret;
3836b30a01f2SIlya Dryomov 
383799d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
383899d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3839b30a01f2SIlya Dryomov 
3840922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3841922dab61SIlya Dryomov 	if (ret)
3842922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3843b30a01f2SIlya Dryomov 
3844922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3845c525f036SIlya Dryomov }
3846c525f036SIlya Dryomov 
384799d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3848c525f036SIlya Dryomov {
384999d16943SIlya Dryomov 	int ret;
3850811c6688SIlya Dryomov 
385199d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
385299d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
385399d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
385499d16943SIlya Dryomov 	if (ret)
385599d16943SIlya Dryomov 		goto out;
385699d16943SIlya Dryomov 
385799d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
385899d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
385999d16943SIlya Dryomov 
386099d16943SIlya Dryomov out:
386199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
386299d16943SIlya Dryomov 	return ret;
386399d16943SIlya Dryomov }
386499d16943SIlya Dryomov 
386599d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
386699d16943SIlya Dryomov {
386799d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
386899d16943SIlya Dryomov 
386999d16943SIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3870ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3871ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3872ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3873ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
387499d16943SIlya Dryomov }
387599d16943SIlya Dryomov 
387699d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
387799d16943SIlya Dryomov {
3878ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
387999d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
388099d16943SIlya Dryomov 
388199d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
388299d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
388399d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
388499d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
388599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
388699d16943SIlya Dryomov 
3887811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3888fca27065SIlya Dryomov }
3889fca27065SIlya Dryomov 
389099d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
389199d16943SIlya Dryomov {
389299d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
389399d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
3894ed95b21aSIlya Dryomov 	bool was_lock_owner = false;
389587c0fdedSIlya Dryomov 	bool need_to_wake = false;
389699d16943SIlya Dryomov 	int ret;
389799d16943SIlya Dryomov 
389899d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
389999d16943SIlya Dryomov 
3900ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3901ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3902ed95b21aSIlya Dryomov 		was_lock_owner = rbd_release_lock(rbd_dev);
3903ed95b21aSIlya Dryomov 
390499d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
390587c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
390687c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
390787c0fdedSIlya Dryomov 		goto out;
390887c0fdedSIlya Dryomov 	}
390999d16943SIlya Dryomov 
391099d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
391199d16943SIlya Dryomov 	if (ret) {
391299d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
39134d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
391487c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
391587c0fdedSIlya Dryomov 			need_to_wake = true;
391687c0fdedSIlya Dryomov 		} else {
391799d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
391899d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
391999d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
392087c0fdedSIlya Dryomov 		}
392187c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
392287c0fdedSIlya Dryomov 		goto out;
392399d16943SIlya Dryomov 	}
392499d16943SIlya Dryomov 
392587c0fdedSIlya Dryomov 	need_to_wake = true;
392699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
392799d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
392899d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
392999d16943SIlya Dryomov 
393099d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
393199d16943SIlya Dryomov 	if (ret)
393299d16943SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
393399d16943SIlya Dryomov 
3934ed95b21aSIlya Dryomov 	if (was_lock_owner) {
3935ed95b21aSIlya Dryomov 		ret = rbd_try_lock(rbd_dev);
3936ed95b21aSIlya Dryomov 		if (ret)
3937ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "reregisteration lock failed: %d",
3938ed95b21aSIlya Dryomov 				 ret);
3939ed95b21aSIlya Dryomov 	}
3940ed95b21aSIlya Dryomov 
394187c0fdedSIlya Dryomov out:
3942ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
394387c0fdedSIlya Dryomov 	if (need_to_wake)
3944ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, true);
394599d16943SIlya Dryomov }
394699d16943SIlya Dryomov 
394736be9a76SAlex Elder /*
3948f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3949f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
395036be9a76SAlex Elder  */
395136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3952ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3953ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
395436be9a76SAlex Elder 			     const char *method_name,
39554157976bSAlex Elder 			     const void *outbound,
395636be9a76SAlex Elder 			     size_t outbound_size,
39574157976bSAlex Elder 			     void *inbound,
3958e2a58ee5SAlex Elder 			     size_t inbound_size)
395936be9a76SAlex Elder {
3960ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3961ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3962ecd4a68aSIlya Dryomov 	struct page *reply_page;
396336be9a76SAlex Elder 	int ret;
396436be9a76SAlex Elder 
396536be9a76SAlex Elder 	/*
39666010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
39676010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
39686010a451SAlex Elder 	 * also supply outbound data--parameters for the object
39696010a451SAlex Elder 	 * method.  Currently if this is present it will be a
39706010a451SAlex Elder 	 * snapshot id.
397136be9a76SAlex Elder 	 */
3972ecd4a68aSIlya Dryomov 	if (outbound) {
3973ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3974ecd4a68aSIlya Dryomov 			return -E2BIG;
397536be9a76SAlex Elder 
3976ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3977ecd4a68aSIlya Dryomov 		if (!req_page)
3978ecd4a68aSIlya Dryomov 			return -ENOMEM;
397936be9a76SAlex Elder 
3980ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
398104017e29SAlex Elder 	}
3982430c28c3SAlex Elder 
3983ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3984ecd4a68aSIlya Dryomov 	if (!reply_page) {
3985ecd4a68aSIlya Dryomov 		if (req_page)
3986ecd4a68aSIlya Dryomov 			__free_page(req_page);
3987ecd4a68aSIlya Dryomov 		return -ENOMEM;
3988ecd4a68aSIlya Dryomov 	}
398936be9a76SAlex Elder 
3990ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3991ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3992ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3993ecd4a68aSIlya Dryomov 	if (!ret) {
3994ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3995ecd4a68aSIlya Dryomov 		ret = inbound_size;
3996ecd4a68aSIlya Dryomov 	}
399757385b51SAlex Elder 
3998ecd4a68aSIlya Dryomov 	if (req_page)
3999ecd4a68aSIlya Dryomov 		__free_page(req_page);
4000ecd4a68aSIlya Dryomov 	__free_page(reply_page);
400136be9a76SAlex Elder 	return ret;
400236be9a76SAlex Elder }
400336be9a76SAlex Elder 
4004ed95b21aSIlya Dryomov /*
4005ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
4006ed95b21aSIlya Dryomov  */
4007ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
4008ed95b21aSIlya Dryomov {
4009ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
4010ed95b21aSIlya Dryomov 
4011ed95b21aSIlya Dryomov 	do {
4012ed95b21aSIlya Dryomov 		/*
4013ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
4014ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
4015ed95b21aSIlya Dryomov 		 */
4016ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
4017ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4018ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
4019ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
4020ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4021ed95b21aSIlya Dryomov 		schedule();
4022ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
402387c0fdedSIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
402487c0fdedSIlya Dryomov 		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
402587c0fdedSIlya Dryomov 
4026ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
4027ed95b21aSIlya Dryomov }
4028ed95b21aSIlya Dryomov 
40297ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4030bc1ecc65SIlya Dryomov {
40317ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
40327ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
4033bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
40344e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
4035bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4036bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
40376d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
40384e752f0aSJosh Durgin 	u64 mapping_size;
403980de1912SIlya Dryomov 	bool must_be_locked;
4040bc1ecc65SIlya Dryomov 	int result;
4041bc1ecc65SIlya Dryomov 
40427ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
40437ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
40447ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
40457ad18afaSChristoph Hellwig 		result = -EIO;
40467ad18afaSChristoph Hellwig 		goto err;
40477ad18afaSChristoph Hellwig 	}
40487ad18afaSChristoph Hellwig 
4049c2df40dfSMike Christie 	if (req_op(rq) == REQ_OP_DISCARD)
405090e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
4051c2df40dfSMike Christie 	else if (req_op(rq) == REQ_OP_WRITE)
40526d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
40536d2940c8SGuangliang Zhao 	else
40546d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
40556d2940c8SGuangliang Zhao 
4056bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4057bc1ecc65SIlya Dryomov 
4058bc1ecc65SIlya Dryomov 	if (!length) {
4059bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4060bc1ecc65SIlya Dryomov 		result = 0;
4061bc1ecc65SIlya Dryomov 		goto err_rq;
4062bc1ecc65SIlya Dryomov 	}
4063bc1ecc65SIlya Dryomov 
40646d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
4065bc1ecc65SIlya Dryomov 
40666d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
4067bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
4068bc1ecc65SIlya Dryomov 			result = -EROFS;
4069bc1ecc65SIlya Dryomov 			goto err_rq;
4070bc1ecc65SIlya Dryomov 		}
4071bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4072bc1ecc65SIlya Dryomov 	}
4073bc1ecc65SIlya Dryomov 
4074bc1ecc65SIlya Dryomov 	/*
4075bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4076bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4077bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4078bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4079bc1ecc65SIlya Dryomov 	 */
4080bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4081bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4082bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4083bc1ecc65SIlya Dryomov 		result = -ENXIO;
4084bc1ecc65SIlya Dryomov 		goto err_rq;
4085bc1ecc65SIlya Dryomov 	}
4086bc1ecc65SIlya Dryomov 
4087bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4088bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4089bc1ecc65SIlya Dryomov 			 length);
4090bc1ecc65SIlya Dryomov 		result = -EINVAL;
4091bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4092bc1ecc65SIlya Dryomov 	}
4093bc1ecc65SIlya Dryomov 
40947ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
40957ad18afaSChristoph Hellwig 
40964e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
40974e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
40986d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
40994e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
41004e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
4101ed95b21aSIlya Dryomov 		must_be_locked = rbd_is_lock_supported(rbd_dev);
410280de1912SIlya Dryomov 	} else {
410380de1912SIlya Dryomov 		must_be_locked = rbd_dev->opts->lock_on_read &&
410480de1912SIlya Dryomov 					rbd_is_lock_supported(rbd_dev);
41054e752f0aSJosh Durgin 	}
41064e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
41074e752f0aSJosh Durgin 
41084e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4109bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
41104e752f0aSJosh Durgin 			 length, mapping_size);
4111bc1ecc65SIlya Dryomov 		result = -EIO;
4112bc1ecc65SIlya Dryomov 		goto err_rq;
4113bc1ecc65SIlya Dryomov 	}
4114bc1ecc65SIlya Dryomov 
4115ed95b21aSIlya Dryomov 	if (must_be_locked) {
4116ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
411787c0fdedSIlya Dryomov 		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
411887c0fdedSIlya Dryomov 		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
4119ed95b21aSIlya Dryomov 			rbd_wait_state_locked(rbd_dev);
412087c0fdedSIlya Dryomov 
412187c0fdedSIlya Dryomov 		WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^
412287c0fdedSIlya Dryomov 			!test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
412387c0fdedSIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
412487c0fdedSIlya Dryomov 			result = -EBLACKLISTED;
412587c0fdedSIlya Dryomov 			goto err_unlock;
412687c0fdedSIlya Dryomov 		}
4127ed95b21aSIlya Dryomov 	}
4128ed95b21aSIlya Dryomov 
41296d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
41304e752f0aSJosh Durgin 					     snapc);
4131bc1ecc65SIlya Dryomov 	if (!img_request) {
4132bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4133ed95b21aSIlya Dryomov 		goto err_unlock;
4134bc1ecc65SIlya Dryomov 	}
4135bc1ecc65SIlya Dryomov 	img_request->rq = rq;
413670b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4137bc1ecc65SIlya Dryomov 
413890e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
413990e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
414090e98c52SGuangliang Zhao 					      NULL);
414190e98c52SGuangliang Zhao 	else
414290e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
414390e98c52SGuangliang Zhao 					      rq->bio);
4144bc1ecc65SIlya Dryomov 	if (result)
4145bc1ecc65SIlya Dryomov 		goto err_img_request;
4146bc1ecc65SIlya Dryomov 
4147bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
4148bc1ecc65SIlya Dryomov 	if (result)
4149bc1ecc65SIlya Dryomov 		goto err_img_request;
4150bc1ecc65SIlya Dryomov 
4151ed95b21aSIlya Dryomov 	if (must_be_locked)
4152ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4153bc1ecc65SIlya Dryomov 	return;
4154bc1ecc65SIlya Dryomov 
4155bc1ecc65SIlya Dryomov err_img_request:
4156bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4157ed95b21aSIlya Dryomov err_unlock:
4158ed95b21aSIlya Dryomov 	if (must_be_locked)
4159ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4160bc1ecc65SIlya Dryomov err_rq:
4161bc1ecc65SIlya Dryomov 	if (result)
4162bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
41636d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
41644e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
41657ad18afaSChristoph Hellwig err:
41667ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
4167bc1ecc65SIlya Dryomov }
4168bc1ecc65SIlya Dryomov 
41697ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
41707ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4171bc1ecc65SIlya Dryomov {
41727ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
41737ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4174bc1ecc65SIlya Dryomov 
41757ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
41767ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
4177bf0d5f50SAlex Elder }
4178bf0d5f50SAlex Elder 
4179602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4180602adf40SYehuda Sadeh {
4181602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
4182602adf40SYehuda Sadeh 
4183602adf40SYehuda Sadeh 	if (!disk)
4184602adf40SYehuda Sadeh 		return;
4185602adf40SYehuda Sadeh 
4186a0cab924SAlex Elder 	rbd_dev->disk = NULL;
4187a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
4188602adf40SYehuda Sadeh 		del_gendisk(disk);
4189602adf40SYehuda Sadeh 		if (disk->queue)
4190602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
41917ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
4192a0cab924SAlex Elder 	}
4193602adf40SYehuda Sadeh 	put_disk(disk);
4194602adf40SYehuda Sadeh }
4195602adf40SYehuda Sadeh 
4196788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4197fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4198fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4199fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4200788e2df3SAlex Elder 
4201788e2df3SAlex Elder {
4202fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4203fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4204fe5478e0SIlya Dryomov 	struct page **pages;
4205fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4206788e2df3SAlex Elder 	int ret;
4207788e2df3SAlex Elder 
4208fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4209fe5478e0SIlya Dryomov 	if (!req)
4210fe5478e0SIlya Dryomov 		return -ENOMEM;
4211788e2df3SAlex Elder 
4212fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4213fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4214fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4215788e2df3SAlex Elder 
4216fe5478e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4217788e2df3SAlex Elder 	if (ret)
4218fe5478e0SIlya Dryomov 		goto out_req;
4219788e2df3SAlex Elder 
4220fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4221fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4222fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4223fe5478e0SIlya Dryomov 		goto out_req;
4224fe5478e0SIlya Dryomov 	}
42251ceae7efSAlex Elder 
4226fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4227fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4228fe5478e0SIlya Dryomov 					 true);
4229788e2df3SAlex Elder 
4230fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4231fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4232fe5478e0SIlya Dryomov 	if (ret >= 0)
4233fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4234fe5478e0SIlya Dryomov 
4235fe5478e0SIlya Dryomov out_req:
4236fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4237788e2df3SAlex Elder 	return ret;
4238788e2df3SAlex Elder }
4239788e2df3SAlex Elder 
4240602adf40SYehuda Sadeh /*
4241662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4242662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4243662518b1SAlex Elder  * information about the image.
42444156d998SAlex Elder  */
424599a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
42464156d998SAlex Elder {
42474156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
42484156d998SAlex Elder 	u32 snap_count = 0;
42494156d998SAlex Elder 	u64 names_size = 0;
42504156d998SAlex Elder 	u32 want_count;
42514156d998SAlex Elder 	int ret;
42524156d998SAlex Elder 
42534156d998SAlex Elder 	/*
42544156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
42554156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
42564156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
42574156d998SAlex Elder 	 * the number of snapshots could change by the time we read
42584156d998SAlex Elder 	 * it in, in which case we re-read it.
42594156d998SAlex Elder 	 */
42604156d998SAlex Elder 	do {
42614156d998SAlex Elder 		size_t size;
42624156d998SAlex Elder 
42634156d998SAlex Elder 		kfree(ondisk);
42644156d998SAlex Elder 
42654156d998SAlex Elder 		size = sizeof (*ondisk);
42664156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
42674156d998SAlex Elder 		size += names_size;
42684156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
42694156d998SAlex Elder 		if (!ondisk)
4270662518b1SAlex Elder 			return -ENOMEM;
42714156d998SAlex Elder 
4272fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4273fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
42744156d998SAlex Elder 		if (ret < 0)
4275662518b1SAlex Elder 			goto out;
4276c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
42774156d998SAlex Elder 			ret = -ENXIO;
427806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
427906ecc6cbSAlex Elder 				size, ret);
4280662518b1SAlex Elder 			goto out;
42814156d998SAlex Elder 		}
42824156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
42834156d998SAlex Elder 			ret = -ENXIO;
428406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4285662518b1SAlex Elder 			goto out;
42864156d998SAlex Elder 		}
42874156d998SAlex Elder 
42884156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
42894156d998SAlex Elder 		want_count = snap_count;
42904156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
42914156d998SAlex Elder 	} while (snap_count != want_count);
42924156d998SAlex Elder 
4293662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4294662518b1SAlex Elder out:
42954156d998SAlex Elder 	kfree(ondisk);
42964156d998SAlex Elder 
4297dfc5606dSYehuda Sadeh 	return ret;
4298602adf40SYehuda Sadeh }
4299602adf40SYehuda Sadeh 
430015228edeSAlex Elder /*
430115228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
430215228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
430315228edeSAlex Elder  */
430415228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
430515228edeSAlex Elder {
430615228edeSAlex Elder 	u64 snap_id;
430715228edeSAlex Elder 
430815228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
430915228edeSAlex Elder 		return;
431015228edeSAlex Elder 
431115228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
431215228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
431315228edeSAlex Elder 		return;
431415228edeSAlex Elder 
431515228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
431615228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
431715228edeSAlex Elder }
431815228edeSAlex Elder 
43199875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
43209875201eSJosh Durgin {
43219875201eSJosh Durgin 	sector_t size;
43229875201eSJosh Durgin 
43239875201eSJosh Durgin 	/*
4324811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4325811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4326811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
43279875201eSJosh Durgin 	 */
4328811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4329811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
43309875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
43319875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
43329875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
43339875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
43349875201eSJosh Durgin 	}
43359875201eSJosh Durgin }
43369875201eSJosh Durgin 
4337cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
43381fe5e993SAlex Elder {
4339e627db08SAlex Elder 	u64 mapping_size;
43401fe5e993SAlex Elder 	int ret;
43411fe5e993SAlex Elder 
4342cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
43433b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4344a720ae09SIlya Dryomov 
4345a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
434652bb1f9bSIlya Dryomov 	if (ret)
434773e39e4dSIlya Dryomov 		goto out;
434815228edeSAlex Elder 
4349e8f59b59SIlya Dryomov 	/*
4350e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4351e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4352e8f59b59SIlya Dryomov 	 */
4353e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4354e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4355e8f59b59SIlya Dryomov 		if (ret)
435673e39e4dSIlya Dryomov 			goto out;
4357e8f59b59SIlya Dryomov 	}
4358e8f59b59SIlya Dryomov 
43595ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
43605ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
43615ff1108cSIlya Dryomov 	} else {
43625ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
436315228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
43645ff1108cSIlya Dryomov 	}
43655ff1108cSIlya Dryomov 
436673e39e4dSIlya Dryomov out:
4367cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
436873e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
43699875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
43701fe5e993SAlex Elder 
437173e39e4dSIlya Dryomov 	return ret;
43721fe5e993SAlex Elder }
43731fe5e993SAlex Elder 
43747ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
43757ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
43767ad18afaSChristoph Hellwig 		unsigned int numa_node)
43777ad18afaSChristoph Hellwig {
43787ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
43797ad18afaSChristoph Hellwig 
43807ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
43817ad18afaSChristoph Hellwig 	return 0;
43827ad18afaSChristoph Hellwig }
43837ad18afaSChristoph Hellwig 
43847ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
43857ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
43867ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
43877ad18afaSChristoph Hellwig };
43887ad18afaSChristoph Hellwig 
4389602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4390602adf40SYehuda Sadeh {
4391602adf40SYehuda Sadeh 	struct gendisk *disk;
4392602adf40SYehuda Sadeh 	struct request_queue *q;
4393593a9e7bSAlex Elder 	u64 segment_size;
43947ad18afaSChristoph Hellwig 	int err;
4395602adf40SYehuda Sadeh 
4396602adf40SYehuda Sadeh 	/* create gendisk info */
43977e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
43987e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
43997e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4400602adf40SYehuda Sadeh 	if (!disk)
44011fcdb8aaSAlex Elder 		return -ENOMEM;
4402602adf40SYehuda Sadeh 
4403f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4404de71a297SAlex Elder 		 rbd_dev->dev_id);
4405602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4406dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
44077e513d43SIlya Dryomov 	if (single_major)
44087e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4409602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4410602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4411602adf40SYehuda Sadeh 
44127ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
44137ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4414b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
44157ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4416b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
44177ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
44187ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
44197ad18afaSChristoph Hellwig 
44207ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
44217ad18afaSChristoph Hellwig 	if (err)
4422602adf40SYehuda Sadeh 		goto out_disk;
4423029bcbd8SJosh Durgin 
44247ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
44257ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
44267ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
44277ad18afaSChristoph Hellwig 		goto out_tag_set;
44287ad18afaSChristoph Hellwig 	}
44297ad18afaSChristoph Hellwig 
4430d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4431d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4432593a9e7bSAlex Elder 
4433029bcbd8SJosh Durgin 	/* set io sizes to object size */
4434593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
4435593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
44360d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
4437d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
4438593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
4439593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
4440593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
4441029bcbd8SJosh Durgin 
444290e98c52SGuangliang Zhao 	/* enable the discard support */
444390e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
444490e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
444590e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
44462bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
4447b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
444890e98c52SGuangliang Zhao 
4449bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4450bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
4451bae818eeSRonny Hegewald 
4452602adf40SYehuda Sadeh 	disk->queue = q;
4453602adf40SYehuda Sadeh 
4454602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4455602adf40SYehuda Sadeh 
4456602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4457602adf40SYehuda Sadeh 
4458602adf40SYehuda Sadeh 	return 0;
44597ad18afaSChristoph Hellwig out_tag_set:
44607ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4461602adf40SYehuda Sadeh out_disk:
4462602adf40SYehuda Sadeh 	put_disk(disk);
44637ad18afaSChristoph Hellwig 	return err;
4464602adf40SYehuda Sadeh }
4465602adf40SYehuda Sadeh 
4466dfc5606dSYehuda Sadeh /*
4467dfc5606dSYehuda Sadeh   sysfs
4468dfc5606dSYehuda Sadeh */
4469602adf40SYehuda Sadeh 
4470593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4471593a9e7bSAlex Elder {
4472593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4473593a9e7bSAlex Elder }
4474593a9e7bSAlex Elder 
4475dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4476dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4477602adf40SYehuda Sadeh {
4478593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4479dfc5606dSYehuda Sadeh 
4480fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4481fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4482602adf40SYehuda Sadeh }
4483602adf40SYehuda Sadeh 
448434b13184SAlex Elder /*
448534b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
448634b13184SAlex Elder  * necessarily the base image.
448734b13184SAlex Elder  */
448834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
448934b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
449034b13184SAlex Elder {
449134b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
449234b13184SAlex Elder 
449334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
449434b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
449534b13184SAlex Elder }
449634b13184SAlex Elder 
4497dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4498dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4499602adf40SYehuda Sadeh {
4500593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4501dfc5606dSYehuda Sadeh 
4502fc71d833SAlex Elder 	if (rbd_dev->major)
4503dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4504fc71d833SAlex Elder 
4505fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4506dd82fff1SIlya Dryomov }
4507fc71d833SAlex Elder 
4508dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4509dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4510dd82fff1SIlya Dryomov {
4511dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4512dd82fff1SIlya Dryomov 
4513dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4514dfc5606dSYehuda Sadeh }
4515dfc5606dSYehuda Sadeh 
4516005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4517005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4518005a07bfSIlya Dryomov {
4519005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4520005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4521005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4522005a07bfSIlya Dryomov 
4523005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4524005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4525005a07bfSIlya Dryomov }
4526005a07bfSIlya Dryomov 
4527dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4528dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4529dfc5606dSYehuda Sadeh {
4530593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4531dfc5606dSYehuda Sadeh 
45321dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4533033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4534dfc5606dSYehuda Sadeh }
4535dfc5606dSYehuda Sadeh 
4536267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4537267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4538267fb90bSMike Christie {
4539267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4540267fb90bSMike Christie 
4541267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4542267fb90bSMike Christie }
4543267fb90bSMike Christie 
45440d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
45450d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
45460d6d1e9cSMike Christie {
45470d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45480d6d1e9cSMike Christie 
45490d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4550dfc5606dSYehuda Sadeh }
4551dfc5606dSYehuda Sadeh 
4552dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4553dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4554dfc5606dSYehuda Sadeh {
4555593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4556dfc5606dSYehuda Sadeh 
45570d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4558dfc5606dSYehuda Sadeh }
4559dfc5606dSYehuda Sadeh 
45609bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
45619bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
45629bb2f334SAlex Elder {
45639bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45649bb2f334SAlex Elder 
45650d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
45660d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
45679bb2f334SAlex Elder }
45689bb2f334SAlex Elder 
4569dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4570dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4571dfc5606dSYehuda Sadeh {
4572593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4573dfc5606dSYehuda Sadeh 
4574a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
45750d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4576a92ffdf8SAlex Elder 
4577a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4578dfc5606dSYehuda Sadeh }
4579dfc5606dSYehuda Sadeh 
4580589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4581589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4582589d30e0SAlex Elder {
4583589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4584589d30e0SAlex Elder 
45850d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4586589d30e0SAlex Elder }
4587589d30e0SAlex Elder 
458834b13184SAlex Elder /*
458934b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
459034b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
459134b13184SAlex Elder  */
4592dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4593dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4594dfc5606dSYehuda Sadeh 			     char *buf)
4595dfc5606dSYehuda Sadeh {
4596593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4597dfc5606dSYehuda Sadeh 
45980d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4599dfc5606dSYehuda Sadeh }
4600dfc5606dSYehuda Sadeh 
460192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
460292a58671SMike Christie 				struct device_attribute *attr, char *buf)
460392a58671SMike Christie {
460492a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
460592a58671SMike Christie 
460692a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
460792a58671SMike Christie }
460892a58671SMike Christie 
460986b00e0dSAlex Elder /*
4610ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4611ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4612ff96128fSIlya Dryomov  * image)".
461386b00e0dSAlex Elder  */
461486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
461586b00e0dSAlex Elder 			       struct device_attribute *attr,
461686b00e0dSAlex Elder 			       char *buf)
461786b00e0dSAlex Elder {
461886b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4619ff96128fSIlya Dryomov 	ssize_t count = 0;
462086b00e0dSAlex Elder 
4621ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
462286b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
462386b00e0dSAlex Elder 
4624ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4625ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
462686b00e0dSAlex Elder 
4627ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4628ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4629ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4630ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4631ff96128fSIlya Dryomov 			    "overlap %llu\n",
4632ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4633ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4634ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4635ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4636ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4637ff96128fSIlya Dryomov 	}
463886b00e0dSAlex Elder 
463986b00e0dSAlex Elder 	return count;
464086b00e0dSAlex Elder }
464186b00e0dSAlex Elder 
4642dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4643dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4644dfc5606dSYehuda Sadeh 				 const char *buf,
4645dfc5606dSYehuda Sadeh 				 size_t size)
4646dfc5606dSYehuda Sadeh {
4647593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4648b813623aSAlex Elder 	int ret;
4649602adf40SYehuda Sadeh 
4650cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4651e627db08SAlex Elder 	if (ret)
465252bb1f9bSIlya Dryomov 		return ret;
4653b813623aSAlex Elder 
465452bb1f9bSIlya Dryomov 	return size;
4655dfc5606dSYehuda Sadeh }
4656602adf40SYehuda Sadeh 
4657dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
465834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
4659dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
4660dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
4661005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
4662dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
4663267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
46640d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
4665dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
46669bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
4667dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
4668589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
4669dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4670dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
467192a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
467286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
4673dfc5606dSYehuda Sadeh 
4674dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4675dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
467634b13184SAlex Elder 	&dev_attr_features.attr,
4677dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4678dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4679005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4680dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4681267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
46820d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4683dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
46849bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4685dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4686589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4687dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
468892a58671SMike Christie 	&dev_attr_snap_id.attr,
468986b00e0dSAlex Elder 	&dev_attr_parent.attr,
4690dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4691dfc5606dSYehuda Sadeh 	NULL
4692dfc5606dSYehuda Sadeh };
4693dfc5606dSYehuda Sadeh 
4694dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4695dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4696dfc5606dSYehuda Sadeh };
4697dfc5606dSYehuda Sadeh 
4698dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4699dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4700dfc5606dSYehuda Sadeh 	NULL
4701dfc5606dSYehuda Sadeh };
4702dfc5606dSYehuda Sadeh 
47036cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4704dfc5606dSYehuda Sadeh 
4705dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
4706dfc5606dSYehuda Sadeh 	.name		= "rbd",
4707dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
47086cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4709dfc5606dSYehuda Sadeh };
4710dfc5606dSYehuda Sadeh 
47118b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
47128b8fb99cSAlex Elder {
47138b8fb99cSAlex Elder 	kref_get(&spec->kref);
47148b8fb99cSAlex Elder 
47158b8fb99cSAlex Elder 	return spec;
47168b8fb99cSAlex Elder }
47178b8fb99cSAlex Elder 
47188b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
47198b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
47208b8fb99cSAlex Elder {
47218b8fb99cSAlex Elder 	if (spec)
47228b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
47238b8fb99cSAlex Elder }
47248b8fb99cSAlex Elder 
47258b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
47268b8fb99cSAlex Elder {
47278b8fb99cSAlex Elder 	struct rbd_spec *spec;
47288b8fb99cSAlex Elder 
47298b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
47308b8fb99cSAlex Elder 	if (!spec)
47318b8fb99cSAlex Elder 		return NULL;
473204077599SIlya Dryomov 
473304077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
473404077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
47358b8fb99cSAlex Elder 	kref_init(&spec->kref);
47368b8fb99cSAlex Elder 
47378b8fb99cSAlex Elder 	return spec;
47388b8fb99cSAlex Elder }
47398b8fb99cSAlex Elder 
47408b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
47418b8fb99cSAlex Elder {
47428b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
47438b8fb99cSAlex Elder 
47448b8fb99cSAlex Elder 	kfree(spec->pool_name);
47458b8fb99cSAlex Elder 	kfree(spec->image_id);
47468b8fb99cSAlex Elder 	kfree(spec->image_name);
47478b8fb99cSAlex Elder 	kfree(spec->snap_name);
47488b8fb99cSAlex Elder 	kfree(spec);
47498b8fb99cSAlex Elder }
47508b8fb99cSAlex Elder 
47511643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4752dd5ac32dSIlya Dryomov {
475399d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4754ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4755dd5ac32dSIlya Dryomov 
4756c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
47576b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
47580d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4759c41d13a3SIlya Dryomov 
4760dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4761dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4762dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4763dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
47641643dfa4SIlya Dryomov }
47651643dfa4SIlya Dryomov 
47661643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
47671643dfa4SIlya Dryomov {
47681643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
47691643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
47701643dfa4SIlya Dryomov 
47711643dfa4SIlya Dryomov 	if (need_put) {
47721643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
47731643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
47741643dfa4SIlya Dryomov 	}
47751643dfa4SIlya Dryomov 
47761643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4777dd5ac32dSIlya Dryomov 
4778dd5ac32dSIlya Dryomov 	/*
4779dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4780dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4781dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4782dd5ac32dSIlya Dryomov 	 */
4783dd5ac32dSIlya Dryomov 	if (need_put)
4784dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4785dd5ac32dSIlya Dryomov }
4786dd5ac32dSIlya Dryomov 
47871643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
47881643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4789c53d5893SAlex Elder {
4790c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4791c53d5893SAlex Elder 
4792c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4793c53d5893SAlex Elder 	if (!rbd_dev)
4794c53d5893SAlex Elder 		return NULL;
4795c53d5893SAlex Elder 
4796c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4797c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4798c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4799c53d5893SAlex Elder 
48007e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4801c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4802431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4803c41d13a3SIlya Dryomov 
480499d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
480599d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
480699d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
480799d16943SIlya Dryomov 
4808ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4809ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4810ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4811ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4812ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4813ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4814ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4815ed95b21aSIlya Dryomov 
4816dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4817dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4818dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4819dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4820dd5ac32dSIlya Dryomov 
4821c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4822d147543dSIlya Dryomov 	rbd_dev->spec = spec;
48230903e875SAlex Elder 
48241643dfa4SIlya Dryomov 	return rbd_dev;
48251643dfa4SIlya Dryomov }
48261643dfa4SIlya Dryomov 
4827dd5ac32dSIlya Dryomov /*
48281643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4829dd5ac32dSIlya Dryomov  */
48301643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
48311643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
48321643dfa4SIlya Dryomov 					 struct rbd_options *opts)
48331643dfa4SIlya Dryomov {
48341643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
48351643dfa4SIlya Dryomov 
48361643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
48371643dfa4SIlya Dryomov 	if (!rbd_dev)
48381643dfa4SIlya Dryomov 		return NULL;
48391643dfa4SIlya Dryomov 
48401643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
48411643dfa4SIlya Dryomov 
48421643dfa4SIlya Dryomov 	/* get an id and fill in device name */
48431643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
48441643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
48451643dfa4SIlya Dryomov 					 GFP_KERNEL);
48461643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
48471643dfa4SIlya Dryomov 		goto fail_rbd_dev;
48481643dfa4SIlya Dryomov 
48491643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
48501643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
48511643dfa4SIlya Dryomov 						   rbd_dev->name);
48521643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
48531643dfa4SIlya Dryomov 		goto fail_dev_id;
48541643dfa4SIlya Dryomov 
48551643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4856dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4857dd5ac32dSIlya Dryomov 
48581643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4859c53d5893SAlex Elder 	return rbd_dev;
48601643dfa4SIlya Dryomov 
48611643dfa4SIlya Dryomov fail_dev_id:
48621643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
48631643dfa4SIlya Dryomov fail_rbd_dev:
48641643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
48651643dfa4SIlya Dryomov 	return NULL;
4866c53d5893SAlex Elder }
4867c53d5893SAlex Elder 
4868c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4869c53d5893SAlex Elder {
4870dd5ac32dSIlya Dryomov 	if (rbd_dev)
4871dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4872c53d5893SAlex Elder }
4873c53d5893SAlex Elder 
4874dfc5606dSYehuda Sadeh /*
48759d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
48769d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
48779d475de5SAlex Elder  * image.
48789d475de5SAlex Elder  */
48799d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
48809d475de5SAlex Elder 				u8 *order, u64 *snap_size)
48819d475de5SAlex Elder {
48829d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
48839d475de5SAlex Elder 	int ret;
48849d475de5SAlex Elder 	struct {
48859d475de5SAlex Elder 		u8 order;
48869d475de5SAlex Elder 		__le64 size;
48879d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
48889d475de5SAlex Elder 
4889ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4890ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
48914157976bSAlex Elder 				  &snapid, sizeof(snapid),
4892e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
489336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
48949d475de5SAlex Elder 	if (ret < 0)
48959d475de5SAlex Elder 		return ret;
489657385b51SAlex Elder 	if (ret < sizeof (size_buf))
489757385b51SAlex Elder 		return -ERANGE;
48989d475de5SAlex Elder 
4899c3545579SJosh Durgin 	if (order) {
49009d475de5SAlex Elder 		*order = size_buf.order;
4901c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4902c3545579SJosh Durgin 	}
49039d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
49049d475de5SAlex Elder 
4905c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4906c3545579SJosh Durgin 		(unsigned long long)snap_id,
49079d475de5SAlex Elder 		(unsigned long long)*snap_size);
49089d475de5SAlex Elder 
49099d475de5SAlex Elder 	return 0;
49109d475de5SAlex Elder }
49119d475de5SAlex Elder 
49129d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
49139d475de5SAlex Elder {
49149d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
49159d475de5SAlex Elder 					&rbd_dev->header.obj_order,
49169d475de5SAlex Elder 					&rbd_dev->header.image_size);
49179d475de5SAlex Elder }
49189d475de5SAlex Elder 
49191e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
49201e130199SAlex Elder {
49211e130199SAlex Elder 	void *reply_buf;
49221e130199SAlex Elder 	int ret;
49231e130199SAlex Elder 	void *p;
49241e130199SAlex Elder 
49251e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
49261e130199SAlex Elder 	if (!reply_buf)
49271e130199SAlex Elder 		return -ENOMEM;
49281e130199SAlex Elder 
4929ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4930ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4931ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
493236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
49331e130199SAlex Elder 	if (ret < 0)
49341e130199SAlex Elder 		goto out;
49351e130199SAlex Elder 
49361e130199SAlex Elder 	p = reply_buf;
49371e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
493857385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
493957385b51SAlex Elder 	ret = 0;
49401e130199SAlex Elder 
49411e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
49421e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
49431e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
49441e130199SAlex Elder 	} else {
49451e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
49461e130199SAlex Elder 	}
49471e130199SAlex Elder out:
49481e130199SAlex Elder 	kfree(reply_buf);
49491e130199SAlex Elder 
49501e130199SAlex Elder 	return ret;
49511e130199SAlex Elder }
49521e130199SAlex Elder 
4953b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4954b1b5402aSAlex Elder 		u64 *snap_features)
4955b1b5402aSAlex Elder {
4956b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4957b1b5402aSAlex Elder 	struct {
4958b1b5402aSAlex Elder 		__le64 features;
4959b1b5402aSAlex Elder 		__le64 incompat;
49604157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4961d3767f0fSIlya Dryomov 	u64 unsup;
4962b1b5402aSAlex Elder 	int ret;
4963b1b5402aSAlex Elder 
4964ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4965ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
49664157976bSAlex Elder 				  &snapid, sizeof(snapid),
4967e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
496836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4969b1b5402aSAlex Elder 	if (ret < 0)
4970b1b5402aSAlex Elder 		return ret;
497157385b51SAlex Elder 	if (ret < sizeof (features_buf))
497257385b51SAlex Elder 		return -ERANGE;
4973d889140cSAlex Elder 
4974d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4975d3767f0fSIlya Dryomov 	if (unsup) {
4976d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4977d3767f0fSIlya Dryomov 			 unsup);
4978b8f5c6edSAlex Elder 		return -ENXIO;
4979d3767f0fSIlya Dryomov 	}
4980d889140cSAlex Elder 
4981b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4982b1b5402aSAlex Elder 
4983b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4984b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4985b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4986b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4987b1b5402aSAlex Elder 
4988b1b5402aSAlex Elder 	return 0;
4989b1b5402aSAlex Elder }
4990b1b5402aSAlex Elder 
4991b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4992b1b5402aSAlex Elder {
4993b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4994b1b5402aSAlex Elder 						&rbd_dev->header.features);
4995b1b5402aSAlex Elder }
4996b1b5402aSAlex Elder 
499786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
499886b00e0dSAlex Elder {
499986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
500086b00e0dSAlex Elder 	size_t size;
500186b00e0dSAlex Elder 	void *reply_buf = NULL;
500286b00e0dSAlex Elder 	__le64 snapid;
500386b00e0dSAlex Elder 	void *p;
500486b00e0dSAlex Elder 	void *end;
5005642a2537SAlex Elder 	u64 pool_id;
500686b00e0dSAlex Elder 	char *image_id;
50073b5cf2a2SAlex Elder 	u64 snap_id;
500886b00e0dSAlex Elder 	u64 overlap;
500986b00e0dSAlex Elder 	int ret;
501086b00e0dSAlex Elder 
501186b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
501286b00e0dSAlex Elder 	if (!parent_spec)
501386b00e0dSAlex Elder 		return -ENOMEM;
501486b00e0dSAlex Elder 
501586b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
501686b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
501786b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
501886b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
501986b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
502086b00e0dSAlex Elder 	if (!reply_buf) {
502186b00e0dSAlex Elder 		ret = -ENOMEM;
502286b00e0dSAlex Elder 		goto out_err;
502386b00e0dSAlex Elder 	}
502486b00e0dSAlex Elder 
50254d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5026ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5027ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_parent",
5028ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
502936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
503086b00e0dSAlex Elder 	if (ret < 0)
503186b00e0dSAlex Elder 		goto out_err;
503286b00e0dSAlex Elder 
503386b00e0dSAlex Elder 	p = reply_buf;
503457385b51SAlex Elder 	end = reply_buf + ret;
503557385b51SAlex Elder 	ret = -ERANGE;
5036642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
5037392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
5038392a9dadSAlex Elder 		/*
5039392a9dadSAlex Elder 		 * Either the parent never existed, or we have
5040392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
5041392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
5042392a9dadSAlex Elder 		 * layered image disappears we immediately set the
5043392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
5044392a9dadSAlex Elder 		 * requests will be treated as if the image had no
5045392a9dadSAlex Elder 		 * parent.
5046392a9dadSAlex Elder 		 */
5047392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
5048392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
5049392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
5050392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
5051392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
5052392a9dadSAlex Elder 		}
5053392a9dadSAlex Elder 
505486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
5055392a9dadSAlex Elder 	}
505686b00e0dSAlex Elder 
50570903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
50580903e875SAlex Elder 
50590903e875SAlex Elder 	ret = -EIO;
5060642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
50619584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5062642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
506357385b51SAlex Elder 		goto out_err;
5064c0cd10dbSAlex Elder 	}
50650903e875SAlex Elder 
5066979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
506786b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
506886b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
506986b00e0dSAlex Elder 		goto out_err;
507086b00e0dSAlex Elder 	}
50713b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
507286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
507386b00e0dSAlex Elder 
50743b5cf2a2SAlex Elder 	/*
50753b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
50763b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
50773b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
50783b5cf2a2SAlex Elder 	 */
50793b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
50803b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
50813b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
50823b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
508386b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
508486b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
5085fbba11b3SIlya Dryomov 	} else {
5086fbba11b3SIlya Dryomov 		kfree(image_id);
50873b5cf2a2SAlex Elder 	}
50883b5cf2a2SAlex Elder 
50893b5cf2a2SAlex Elder 	/*
5090cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5091cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
50923b5cf2a2SAlex Elder 	 */
50933b5cf2a2SAlex Elder 	if (!overlap) {
50943b5cf2a2SAlex Elder 		if (parent_spec) {
5095cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5096cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5097cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5098cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
509970cf49cfSAlex Elder 		} else {
5100cf32bd9cSIlya Dryomov 			/* initial probe */
5101cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
51023b5cf2a2SAlex Elder 		}
510370cf49cfSAlex Elder 	}
5104cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
5105cf32bd9cSIlya Dryomov 
510686b00e0dSAlex Elder out:
510786b00e0dSAlex Elder 	ret = 0;
510886b00e0dSAlex Elder out_err:
510986b00e0dSAlex Elder 	kfree(reply_buf);
511086b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
511186b00e0dSAlex Elder 
511286b00e0dSAlex Elder 	return ret;
511386b00e0dSAlex Elder }
511486b00e0dSAlex Elder 
5115cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5116cc070d59SAlex Elder {
5117cc070d59SAlex Elder 	struct {
5118cc070d59SAlex Elder 		__le64 stripe_unit;
5119cc070d59SAlex Elder 		__le64 stripe_count;
5120cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5121cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5122cc070d59SAlex Elder 	void *p;
5123cc070d59SAlex Elder 	u64 obj_size;
5124cc070d59SAlex Elder 	u64 stripe_unit;
5125cc070d59SAlex Elder 	u64 stripe_count;
5126cc070d59SAlex Elder 	int ret;
5127cc070d59SAlex Elder 
5128ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5129ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5130ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5131cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5132cc070d59SAlex Elder 	if (ret < 0)
5133cc070d59SAlex Elder 		return ret;
5134cc070d59SAlex Elder 	if (ret < size)
5135cc070d59SAlex Elder 		return -ERANGE;
5136cc070d59SAlex Elder 
5137cc070d59SAlex Elder 	/*
5138cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
5139cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
5140cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
5141cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
5142cc070d59SAlex Elder 	 */
5143cc070d59SAlex Elder 	ret = -EINVAL;
51445bc3fb17SIlya Dryomov 	obj_size = rbd_obj_bytes(&rbd_dev->header);
5145cc070d59SAlex Elder 	p = &striping_info_buf;
5146cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
5147cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
5148cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
5149cc070d59SAlex Elder 				"(got %llu want %llu)",
5150cc070d59SAlex Elder 				stripe_unit, obj_size);
5151cc070d59SAlex Elder 		return -EINVAL;
5152cc070d59SAlex Elder 	}
5153cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
5154cc070d59SAlex Elder 	if (stripe_count != 1) {
5155cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
5156cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
5157cc070d59SAlex Elder 		return -EINVAL;
5158cc070d59SAlex Elder 	}
5159500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
5160500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
5161cc070d59SAlex Elder 
5162cc070d59SAlex Elder 	return 0;
5163cc070d59SAlex Elder }
5164cc070d59SAlex Elder 
51657e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
51667e97332eSIlya Dryomov {
51677e97332eSIlya Dryomov 	__le64 data_pool_id;
51687e97332eSIlya Dryomov 	int ret;
51697e97332eSIlya Dryomov 
51707e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
51717e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
51727e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
51737e97332eSIlya Dryomov 	if (ret < 0)
51747e97332eSIlya Dryomov 		return ret;
51757e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
51767e97332eSIlya Dryomov 		return -EBADMSG;
51777e97332eSIlya Dryomov 
51787e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
51797e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
51807e97332eSIlya Dryomov 	return 0;
51817e97332eSIlya Dryomov }
51827e97332eSIlya Dryomov 
51839e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
51849e15b77dSAlex Elder {
5185ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
51869e15b77dSAlex Elder 	size_t image_id_size;
51879e15b77dSAlex Elder 	char *image_id;
51889e15b77dSAlex Elder 	void *p;
51899e15b77dSAlex Elder 	void *end;
51909e15b77dSAlex Elder 	size_t size;
51919e15b77dSAlex Elder 	void *reply_buf = NULL;
51929e15b77dSAlex Elder 	size_t len = 0;
51939e15b77dSAlex Elder 	char *image_name = NULL;
51949e15b77dSAlex Elder 	int ret;
51959e15b77dSAlex Elder 
51969e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
51979e15b77dSAlex Elder 
519869e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
519969e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
52009e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
52019e15b77dSAlex Elder 	if (!image_id)
52029e15b77dSAlex Elder 		return NULL;
52039e15b77dSAlex Elder 
52049e15b77dSAlex Elder 	p = image_id;
52054157976bSAlex Elder 	end = image_id + image_id_size;
520669e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
52079e15b77dSAlex Elder 
52089e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
52099e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
52109e15b77dSAlex Elder 	if (!reply_buf)
52119e15b77dSAlex Elder 		goto out;
52129e15b77dSAlex Elder 
5213ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5214ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5215ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5216e2a58ee5SAlex Elder 				  reply_buf, size);
52179e15b77dSAlex Elder 	if (ret < 0)
52189e15b77dSAlex Elder 		goto out;
52199e15b77dSAlex Elder 	p = reply_buf;
5220f40eb349SAlex Elder 	end = reply_buf + ret;
5221f40eb349SAlex Elder 
52229e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
52239e15b77dSAlex Elder 	if (IS_ERR(image_name))
52249e15b77dSAlex Elder 		image_name = NULL;
52259e15b77dSAlex Elder 	else
52269e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
52279e15b77dSAlex Elder out:
52289e15b77dSAlex Elder 	kfree(reply_buf);
52299e15b77dSAlex Elder 	kfree(image_id);
52309e15b77dSAlex Elder 
52319e15b77dSAlex Elder 	return image_name;
52329e15b77dSAlex Elder }
52339e15b77dSAlex Elder 
52342ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52352ad3d716SAlex Elder {
52362ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
52372ad3d716SAlex Elder 	const char *snap_name;
52382ad3d716SAlex Elder 	u32 which = 0;
52392ad3d716SAlex Elder 
52402ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
52412ad3d716SAlex Elder 
52422ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
52432ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
52442ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
52452ad3d716SAlex Elder 			return snapc->snaps[which];
52462ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
52472ad3d716SAlex Elder 		which++;
52482ad3d716SAlex Elder 	}
52492ad3d716SAlex Elder 	return CEPH_NOSNAP;
52502ad3d716SAlex Elder }
52512ad3d716SAlex Elder 
52522ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52532ad3d716SAlex Elder {
52542ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
52552ad3d716SAlex Elder 	u32 which;
52562ad3d716SAlex Elder 	bool found = false;
52572ad3d716SAlex Elder 	u64 snap_id;
52582ad3d716SAlex Elder 
52592ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
52602ad3d716SAlex Elder 		const char *snap_name;
52612ad3d716SAlex Elder 
52622ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
52632ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5264efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5265efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5266efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5267efadc98aSJosh Durgin 				continue;
5268efadc98aSJosh Durgin 			else
52692ad3d716SAlex Elder 				break;
5270efadc98aSJosh Durgin 		}
52712ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
52722ad3d716SAlex Elder 		kfree(snap_name);
52732ad3d716SAlex Elder 	}
52742ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
52752ad3d716SAlex Elder }
52762ad3d716SAlex Elder 
52772ad3d716SAlex Elder /*
52782ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
52792ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
52802ad3d716SAlex Elder  */
52812ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52822ad3d716SAlex Elder {
52832ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
52842ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
52852ad3d716SAlex Elder 
52862ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
52872ad3d716SAlex Elder }
52882ad3d716SAlex Elder 
52899e15b77dSAlex Elder /*
529004077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
52919e15b77dSAlex Elder  */
529204077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
529304077599SIlya Dryomov {
529404077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
529504077599SIlya Dryomov 
529604077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
529704077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
529804077599SIlya Dryomov 	rbd_assert(spec->snap_name);
529904077599SIlya Dryomov 
530004077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
530104077599SIlya Dryomov 		u64 snap_id;
530204077599SIlya Dryomov 
530304077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
530404077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
530504077599SIlya Dryomov 			return -ENOENT;
530604077599SIlya Dryomov 
530704077599SIlya Dryomov 		spec->snap_id = snap_id;
530804077599SIlya Dryomov 	} else {
530904077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
531004077599SIlya Dryomov 	}
531104077599SIlya Dryomov 
531204077599SIlya Dryomov 	return 0;
531304077599SIlya Dryomov }
531404077599SIlya Dryomov 
531504077599SIlya Dryomov /*
531604077599SIlya Dryomov  * A parent image will have all ids but none of the names.
531704077599SIlya Dryomov  *
531804077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
531904077599SIlya Dryomov  * can't figure out the name for an image id.
532004077599SIlya Dryomov  */
532104077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
53229e15b77dSAlex Elder {
53232e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
53242e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
53252e9f7f1cSAlex Elder 	const char *pool_name;
53262e9f7f1cSAlex Elder 	const char *image_name;
53272e9f7f1cSAlex Elder 	const char *snap_name;
53289e15b77dSAlex Elder 	int ret;
53299e15b77dSAlex Elder 
533004077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
533104077599SIlya Dryomov 	rbd_assert(spec->image_id);
533204077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
53339e15b77dSAlex Elder 
53342e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
53359e15b77dSAlex Elder 
53362e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
53372e9f7f1cSAlex Elder 	if (!pool_name) {
53382e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5339935dc89fSAlex Elder 		return -EIO;
5340935dc89fSAlex Elder 	}
53412e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
53422e9f7f1cSAlex Elder 	if (!pool_name)
53439e15b77dSAlex Elder 		return -ENOMEM;
53449e15b77dSAlex Elder 
53459e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
53469e15b77dSAlex Elder 
53472e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
53482e9f7f1cSAlex Elder 	if (!image_name)
534906ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
53509e15b77dSAlex Elder 
535104077599SIlya Dryomov 	/* Fetch the snapshot name */
53529e15b77dSAlex Elder 
53532e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5354da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5355da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
53569e15b77dSAlex Elder 		goto out_err;
53572e9f7f1cSAlex Elder 	}
53582e9f7f1cSAlex Elder 
53592e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
53602e9f7f1cSAlex Elder 	spec->image_name = image_name;
53612e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
53629e15b77dSAlex Elder 
53639e15b77dSAlex Elder 	return 0;
536404077599SIlya Dryomov 
53659e15b77dSAlex Elder out_err:
53662e9f7f1cSAlex Elder 	kfree(image_name);
53672e9f7f1cSAlex Elder 	kfree(pool_name);
53689e15b77dSAlex Elder 	return ret;
53699e15b77dSAlex Elder }
53709e15b77dSAlex Elder 
5371cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
537235d489f9SAlex Elder {
537335d489f9SAlex Elder 	size_t size;
537435d489f9SAlex Elder 	int ret;
537535d489f9SAlex Elder 	void *reply_buf;
537635d489f9SAlex Elder 	void *p;
537735d489f9SAlex Elder 	void *end;
537835d489f9SAlex Elder 	u64 seq;
537935d489f9SAlex Elder 	u32 snap_count;
538035d489f9SAlex Elder 	struct ceph_snap_context *snapc;
538135d489f9SAlex Elder 	u32 i;
538235d489f9SAlex Elder 
538335d489f9SAlex Elder 	/*
538435d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
538535d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
538635d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
538735d489f9SAlex Elder 	 * prepared to receive.
538835d489f9SAlex Elder 	 */
538935d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
539035d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
539135d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
539235d489f9SAlex Elder 	if (!reply_buf)
539335d489f9SAlex Elder 		return -ENOMEM;
539435d489f9SAlex Elder 
5395ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5396ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
5397ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
539836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
539935d489f9SAlex Elder 	if (ret < 0)
540035d489f9SAlex Elder 		goto out;
540135d489f9SAlex Elder 
540235d489f9SAlex Elder 	p = reply_buf;
540357385b51SAlex Elder 	end = reply_buf + ret;
540457385b51SAlex Elder 	ret = -ERANGE;
540535d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
540635d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
540735d489f9SAlex Elder 
540835d489f9SAlex Elder 	/*
540935d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
541035d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
541135d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
541235d489f9SAlex Elder 	 * allocate is representable in a size_t.
541335d489f9SAlex Elder 	 */
541435d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
541535d489f9SAlex Elder 				 / sizeof (u64)) {
541635d489f9SAlex Elder 		ret = -EINVAL;
541735d489f9SAlex Elder 		goto out;
541835d489f9SAlex Elder 	}
541935d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
542035d489f9SAlex Elder 		goto out;
5421468521c1SAlex Elder 	ret = 0;
542235d489f9SAlex Elder 
5423812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
542435d489f9SAlex Elder 	if (!snapc) {
542535d489f9SAlex Elder 		ret = -ENOMEM;
542635d489f9SAlex Elder 		goto out;
542735d489f9SAlex Elder 	}
542835d489f9SAlex Elder 	snapc->seq = seq;
542935d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
543035d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
543135d489f9SAlex Elder 
543249ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
543335d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
543435d489f9SAlex Elder 
543535d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
543635d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
543735d489f9SAlex Elder out:
543835d489f9SAlex Elder 	kfree(reply_buf);
543935d489f9SAlex Elder 
544057385b51SAlex Elder 	return ret;
544135d489f9SAlex Elder }
544235d489f9SAlex Elder 
544354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
544454cac61fSAlex Elder 					u64 snap_id)
5445b8b1e2dbSAlex Elder {
5446b8b1e2dbSAlex Elder 	size_t size;
5447b8b1e2dbSAlex Elder 	void *reply_buf;
544854cac61fSAlex Elder 	__le64 snapid;
5449b8b1e2dbSAlex Elder 	int ret;
5450b8b1e2dbSAlex Elder 	void *p;
5451b8b1e2dbSAlex Elder 	void *end;
5452b8b1e2dbSAlex Elder 	char *snap_name;
5453b8b1e2dbSAlex Elder 
5454b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5455b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5456b8b1e2dbSAlex Elder 	if (!reply_buf)
5457b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5458b8b1e2dbSAlex Elder 
545954cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5460ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5461ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5462ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
546336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5464f40eb349SAlex Elder 	if (ret < 0) {
5465f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5466b8b1e2dbSAlex Elder 		goto out;
5467f40eb349SAlex Elder 	}
5468b8b1e2dbSAlex Elder 
5469b8b1e2dbSAlex Elder 	p = reply_buf;
5470f40eb349SAlex Elder 	end = reply_buf + ret;
5471e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5472f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5473b8b1e2dbSAlex Elder 		goto out;
5474f40eb349SAlex Elder 
5475b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
547654cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5477b8b1e2dbSAlex Elder out:
5478b8b1e2dbSAlex Elder 	kfree(reply_buf);
5479b8b1e2dbSAlex Elder 
5480f40eb349SAlex Elder 	return snap_name;
5481b8b1e2dbSAlex Elder }
5482b8b1e2dbSAlex Elder 
54832df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5484117973fbSAlex Elder {
54852df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5486117973fbSAlex Elder 	int ret;
5487117973fbSAlex Elder 
54881617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
54891617e40cSJosh Durgin 	if (ret)
5490cfbf6377SAlex Elder 		return ret;
54911617e40cSJosh Durgin 
54922df3fac7SAlex Elder 	if (first_time) {
54932df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
54942df3fac7SAlex Elder 		if (ret)
5495cfbf6377SAlex Elder 			return ret;
54962df3fac7SAlex Elder 	}
54972df3fac7SAlex Elder 
5498cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5499d194cd1dSIlya Dryomov 	if (ret && first_time) {
5500d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5501d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5502d194cd1dSIlya Dryomov 	}
5503117973fbSAlex Elder 
5504117973fbSAlex Elder 	return ret;
5505117973fbSAlex Elder }
5506117973fbSAlex Elder 
5507a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5508a720ae09SIlya Dryomov {
5509a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5510a720ae09SIlya Dryomov 
5511a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5512a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5513a720ae09SIlya Dryomov 
5514a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5515a720ae09SIlya Dryomov }
5516a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.  *buf must be terminated
 * with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;				/* start of token */

	return strcspn(start, whitespace);	/* token length */
}
5535e28fff26SAlex Elder 
5536e28fff26SAlex Elder /*
5537ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5538ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5539ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5540ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5541ea3352f4SAlex Elder  *
5542ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5543ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5544ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5545ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5546ea3352f4SAlex Elder  *
5547ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5548ea3352f4SAlex Elder  * the end of the found token.
5549ea3352f4SAlex Elder  *
5550ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5551ea3352f4SAlex Elder  */
5552ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5553ea3352f4SAlex Elder {
5554ea3352f4SAlex Elder 	char *dup;
5555ea3352f4SAlex Elder 	size_t len;
5556ea3352f4SAlex Elder 
5557ea3352f4SAlex Elder 	len = next_token(buf);
55584caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5559ea3352f4SAlex Elder 	if (!dup)
5560ea3352f4SAlex Elder 		return NULL;
5561ea3352f4SAlex Elder 	*(dup + len) = '\0';
5562ea3352f4SAlex Elder 	*buf += len;
5563ea3352f4SAlex Elder 
5564ea3352f4SAlex Elder 	if (lenp)
5565ea3352f4SAlex Elder 		*lenp = len;
5566ea3352f4SAlex Elder 
5567ea3352f4SAlex Elder 	return dup;
5568ea3352f4SAlex Elder }
5569ea3352f4SAlex Elder 
5570ea3352f4SAlex Elder /*
5571859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5572859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5573859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5574859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5575d22f76e7SAlex Elder  *
5576859c31dfSAlex Elder  * The information extracted from these options is recorded in
5577859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5578859c31dfSAlex Elder  * structures:
5579859c31dfSAlex Elder  *  ceph_opts
5580859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5581859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5582859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5583859c31dfSAlex Elder  *  rbd_opts
5584859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5585859c31dfSAlex Elder  *	this function; caller must release with kfree().
5586859c31dfSAlex Elder  *  spec
5587859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5588859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5589859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5590859c31dfSAlex Elder  *
5591859c31dfSAlex Elder  * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
5610a725f65eSAlex Elder  */
5611859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5612dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5613859c31dfSAlex Elder 				struct rbd_options **opts,
5614859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5615a725f65eSAlex Elder {
5616e28fff26SAlex Elder 	size_t len;
5617859c31dfSAlex Elder 	char *options;
56180ddebc0cSAlex Elder 	const char *mon_addrs;
5619ecb4dc22SAlex Elder 	char *snap_name;
56200ddebc0cSAlex Elder 	size_t mon_addrs_size;
5621859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
56224e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5623859c31dfSAlex Elder 	struct ceph_options *copts;
5624dc79b113SAlex Elder 	int ret;
5625e28fff26SAlex Elder 
5626e28fff26SAlex Elder 	/* The first four tokens are required */
5627e28fff26SAlex Elder 
56287ef3214aSAlex Elder 	len = next_token(&buf);
56294fb5d671SAlex Elder 	if (!len) {
56304fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
56314fb5d671SAlex Elder 		return -EINVAL;
56324fb5d671SAlex Elder 	}
56330ddebc0cSAlex Elder 	mon_addrs = buf;
5634f28e565aSAlex Elder 	mon_addrs_size = len + 1;
56357ef3214aSAlex Elder 	buf += len;
5636a725f65eSAlex Elder 
5637dc79b113SAlex Elder 	ret = -EINVAL;
5638f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5639f28e565aSAlex Elder 	if (!options)
5640dc79b113SAlex Elder 		return -ENOMEM;
56414fb5d671SAlex Elder 	if (!*options) {
56424fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
56434fb5d671SAlex Elder 		goto out_err;
56444fb5d671SAlex Elder 	}
5645a725f65eSAlex Elder 
5646859c31dfSAlex Elder 	spec = rbd_spec_alloc();
5647859c31dfSAlex Elder 	if (!spec)
5648f28e565aSAlex Elder 		goto out_mem;
5649859c31dfSAlex Elder 
5650859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
5651859c31dfSAlex Elder 	if (!spec->pool_name)
5652859c31dfSAlex Elder 		goto out_mem;
56534fb5d671SAlex Elder 	if (!*spec->pool_name) {
56544fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
56554fb5d671SAlex Elder 		goto out_err;
56564fb5d671SAlex Elder 	}
5657e28fff26SAlex Elder 
565869e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
5659859c31dfSAlex Elder 	if (!spec->image_name)
5660f28e565aSAlex Elder 		goto out_mem;
56614fb5d671SAlex Elder 	if (!*spec->image_name) {
56624fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
56634fb5d671SAlex Elder 		goto out_err;
56644fb5d671SAlex Elder 	}
5665e28fff26SAlex Elder 
5666f28e565aSAlex Elder 	/*
5667f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5668f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5669f28e565aSAlex Elder 	 */
56703feeb894SAlex Elder 	len = next_token(&buf);
5671820a5f3eSAlex Elder 	if (!len) {
56723feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
56733feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5674f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5675dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5676f28e565aSAlex Elder 		goto out_err;
5677849b4260SAlex Elder 	}
5678ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5679ecb4dc22SAlex Elder 	if (!snap_name)
5680f28e565aSAlex Elder 		goto out_mem;
5681ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5682ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
5683e5c35534SAlex Elder 
56840ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5685e28fff26SAlex Elder 
56864e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
56874e9afebaSAlex Elder 	if (!rbd_opts)
56884e9afebaSAlex Elder 		goto out_mem;
56894e9afebaSAlex Elder 
56904e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5691b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
569280de1912SIlya Dryomov 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5693d22f76e7SAlex Elder 
5694859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
56950ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
56964e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
5697859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5698859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5699dc79b113SAlex Elder 		goto out_err;
5700dc79b113SAlex Elder 	}
5701859c31dfSAlex Elder 	kfree(options);
5702859c31dfSAlex Elder 
5703859c31dfSAlex Elder 	*ceph_opts = copts;
57044e9afebaSAlex Elder 	*opts = rbd_opts;
5705859c31dfSAlex Elder 	*rbd_spec = spec;
57060ddebc0cSAlex Elder 
5707dc79b113SAlex Elder 	return 0;
5708f28e565aSAlex Elder out_mem:
5709dc79b113SAlex Elder 	ret = -ENOMEM;
5710d22f76e7SAlex Elder out_err:
5711859c31dfSAlex Elder 	kfree(rbd_opts);
5712859c31dfSAlex Elder 	rbd_spec_put(spec);
5713f28e565aSAlex Elder 	kfree(options);
5714d22f76e7SAlex Elder 
5715dc79b113SAlex Elder 	return ret;
5716a725f65eSAlex Elder }
5717a725f65eSAlex Elder 
5718589d30e0SAlex Elder /*
571930ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
572030ba1f02SIlya Dryomov  */
572130ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
572230ba1f02SIlya Dryomov {
5723a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
572430ba1f02SIlya Dryomov 	u64 newest_epoch;
572530ba1f02SIlya Dryomov 	int tries = 0;
572630ba1f02SIlya Dryomov 	int ret;
572730ba1f02SIlya Dryomov 
572830ba1f02SIlya Dryomov again:
572930ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
573030ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
5731d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
573230ba1f02SIlya Dryomov 					    &newest_epoch);
573330ba1f02SIlya Dryomov 		if (ret < 0)
573430ba1f02SIlya Dryomov 			return ret;
573530ba1f02SIlya Dryomov 
573630ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
57377cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
573830ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5739a319bf56SIlya Dryomov 						     newest_epoch,
5740a319bf56SIlya Dryomov 						     opts->mount_timeout);
574130ba1f02SIlya Dryomov 			goto again;
574230ba1f02SIlya Dryomov 		} else {
574330ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
574430ba1f02SIlya Dryomov 			return -ENOENT;
574530ba1f02SIlya Dryomov 		}
574630ba1f02SIlya Dryomov 	}
574730ba1f02SIlya Dryomov 
574830ba1f02SIlya Dryomov 	return ret;
574930ba1f02SIlya Dryomov }
575030ba1f02SIlya Dryomov 
575130ba1f02SIlya Dryomov /*
5752589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5753589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5754589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5755589d30e0SAlex Elder  *
5756589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5757589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5758589d30e0SAlex Elder  * with the supplied name.
5759589d30e0SAlex Elder  *
5760589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5761589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5762589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5763589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5764589d30e0SAlex Elder  */
5765589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5766589d30e0SAlex Elder {
5767589d30e0SAlex Elder 	int ret;
5768589d30e0SAlex Elder 	size_t size;
5769ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5770589d30e0SAlex Elder 	void *response;
5771c0fba368SAlex Elder 	char *image_id;
57722f82ee54SAlex Elder 
5773589d30e0SAlex Elder 	/*
57742c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
57752c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5776c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5777c0fba368SAlex Elder 	 * do still need to set the image format though.
57782c0d0a10SAlex Elder 	 */
5779c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5780c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5781c0fba368SAlex Elder 
57822c0d0a10SAlex Elder 		return 0;
5783c0fba368SAlex Elder 	}
57842c0d0a10SAlex Elder 
57852c0d0a10SAlex Elder 	/*
5786589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5787589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5788589d30e0SAlex Elder 	 */
5789ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5790ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5791ecd4a68aSIlya Dryomov 	if (ret)
5792ecd4a68aSIlya Dryomov 		return ret;
5793ecd4a68aSIlya Dryomov 
5794ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5795589d30e0SAlex Elder 
5796589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5797589d30e0SAlex Elder 
5798589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5799589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5800589d30e0SAlex Elder 	if (!response) {
5801589d30e0SAlex Elder 		ret = -ENOMEM;
5802589d30e0SAlex Elder 		goto out;
5803589d30e0SAlex Elder 	}
5804589d30e0SAlex Elder 
5805c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5806c0fba368SAlex Elder 
5807ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5808ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5809e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
581036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5811c0fba368SAlex Elder 	if (ret == -ENOENT) {
5812c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5813c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5814c0fba368SAlex Elder 		if (!ret)
5815c0fba368SAlex Elder 			rbd_dev->image_format = 1;
58167dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5817c0fba368SAlex Elder 		void *p = response;
5818589d30e0SAlex Elder 
5819c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5820979ed480SAlex Elder 						NULL, GFP_NOIO);
5821461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5822c0fba368SAlex Elder 		if (!ret)
5823c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5824c0fba368SAlex Elder 	}
5825c0fba368SAlex Elder 
5826c0fba368SAlex Elder 	if (!ret) {
5827c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5828c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5829589d30e0SAlex Elder 	}
5830589d30e0SAlex Elder out:
5831589d30e0SAlex Elder 	kfree(response);
5832ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5833589d30e0SAlex Elder 	return ret;
5834589d30e0SAlex Elder }
5835589d30e0SAlex Elder 
58363abef3b3SAlex Elder /*
58373abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
58383abef3b3SAlex Elder  * call.
58393abef3b3SAlex Elder  */
58406fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
58416fd48b3bSAlex Elder {
58426fd48b3bSAlex Elder 	struct rbd_image_header	*header;
58436fd48b3bSAlex Elder 
5844a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
58456fd48b3bSAlex Elder 
58466fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
58476fd48b3bSAlex Elder 
58486fd48b3bSAlex Elder 	header = &rbd_dev->header;
5849812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
58506fd48b3bSAlex Elder 	kfree(header->snap_sizes);
58516fd48b3bSAlex Elder 	kfree(header->snap_names);
58526fd48b3bSAlex Elder 	kfree(header->object_prefix);
58536fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
58546fd48b3bSAlex Elder }
58556fd48b3bSAlex Elder 
58562df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5857a30b71b9SAlex Elder {
5858a30b71b9SAlex Elder 	int ret;
5859a30b71b9SAlex Elder 
58601e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
586157385b51SAlex Elder 	if (ret)
58621e130199SAlex Elder 		goto out_err;
5863b1b5402aSAlex Elder 
58642df3fac7SAlex Elder 	/*
58652df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
58662df3fac7SAlex Elder 	 * features are assumed to never change.
58672df3fac7SAlex Elder 	 */
5868b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
586957385b51SAlex Elder 	if (ret)
5870b1b5402aSAlex Elder 		goto out_err;
587135d489f9SAlex Elder 
5872cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5873cc070d59SAlex Elder 
5874cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5875cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5876cc070d59SAlex Elder 		if (ret < 0)
5877cc070d59SAlex Elder 			goto out_err;
5878cc070d59SAlex Elder 	}
5879a30b71b9SAlex Elder 
58807e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
58817e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
58827e97332eSIlya Dryomov 		if (ret)
58837e97332eSIlya Dryomov 			goto out_err;
58847e97332eSIlya Dryomov 	}
58857e97332eSIlya Dryomov 
5886263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
588735152979SAlex Elder 	return 0;
5888263423f8SIlya Dryomov 
58899d475de5SAlex Elder out_err:
5890642a2537SAlex Elder 	rbd_dev->header.features = 0;
58911e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
58921e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
58939d475de5SAlex Elder 	return ret;
5894a30b71b9SAlex Elder }
5895a30b71b9SAlex Elder 
58966d69bb53SIlya Dryomov /*
58976d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
58986d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
58996d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
59006d69bb53SIlya Dryomov  */
59016d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
590283a06263SAlex Elder {
59032f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5904124afba2SAlex Elder 	int ret;
5905124afba2SAlex Elder 
5906124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5907124afba2SAlex Elder 		return 0;
5908124afba2SAlex Elder 
59096d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
59106d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
59116d69bb53SIlya Dryomov 		ret = -EINVAL;
59126d69bb53SIlya Dryomov 		goto out_err;
59136d69bb53SIlya Dryomov 	}
59146d69bb53SIlya Dryomov 
59151643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
59161f2c6651SIlya Dryomov 	if (!parent) {
5917124afba2SAlex Elder 		ret = -ENOMEM;
5918124afba2SAlex Elder 		goto out_err;
59191f2c6651SIlya Dryomov 	}
59201f2c6651SIlya Dryomov 
59211f2c6651SIlya Dryomov 	/*
59221f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
59231f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
59241f2c6651SIlya Dryomov 	 */
59251f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
59261f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5927124afba2SAlex Elder 
59286d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5929124afba2SAlex Elder 	if (ret < 0)
5930124afba2SAlex Elder 		goto out_err;
59311f2c6651SIlya Dryomov 
5932124afba2SAlex Elder 	rbd_dev->parent = parent;
5933a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5934124afba2SAlex Elder 	return 0;
5935124afba2SAlex Elder 
59361f2c6651SIlya Dryomov out_err:
59371f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
59381f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5939124afba2SAlex Elder 	return ret;
5940124afba2SAlex Elder }
5941124afba2SAlex Elder 
5942811c6688SIlya Dryomov /*
5943811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5944811c6688SIlya Dryomov  * upon return.
5945811c6688SIlya Dryomov  */
5946200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5947124afba2SAlex Elder {
594883a06263SAlex Elder 	int ret;
594983a06263SAlex Elder 
59509b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
595183a06263SAlex Elder 
59529b60e70bSIlya Dryomov 	if (!single_major) {
595383a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
595483a06263SAlex Elder 		if (ret < 0)
59551643dfa4SIlya Dryomov 			goto err_out_unlock;
59569b60e70bSIlya Dryomov 
595783a06263SAlex Elder 		rbd_dev->major = ret;
5958dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
59599b60e70bSIlya Dryomov 	} else {
59609b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
59619b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
59629b60e70bSIlya Dryomov 	}
596383a06263SAlex Elder 
596483a06263SAlex Elder 	/* Set up the blkdev mapping. */
596583a06263SAlex Elder 
596683a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
596783a06263SAlex Elder 	if (ret)
596883a06263SAlex Elder 		goto err_out_blkdev;
596983a06263SAlex Elder 
5970f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
597183a06263SAlex Elder 	if (ret)
597283a06263SAlex Elder 		goto err_out_disk;
5973bc1ecc65SIlya Dryomov 
5974f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
597522001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5976f35a4deeSAlex Elder 
5977dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5978dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
5979f35a4deeSAlex Elder 	if (ret)
5980f5ee37bdSIlya Dryomov 		goto err_out_mapping;
598183a06263SAlex Elder 
598283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
598383a06263SAlex Elder 
5984129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5985811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
598683a06263SAlex Elder 
59871643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
59881643dfa4SIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
59891643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
59901643dfa4SIlya Dryomov 
5991811c6688SIlya Dryomov 	add_disk(rbd_dev->disk);
5992ca7909e8SIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5993ca7909e8SIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5994ca7909e8SIlya Dryomov 		rbd_dev->header.features);
599583a06263SAlex Elder 
599683a06263SAlex Elder 	return ret;
59972f82ee54SAlex Elder 
5998f35a4deeSAlex Elder err_out_mapping:
5999f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
600083a06263SAlex Elder err_out_disk:
600183a06263SAlex Elder 	rbd_free_disk(rbd_dev);
600283a06263SAlex Elder err_out_blkdev:
60039b60e70bSIlya Dryomov 	if (!single_major)
600483a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6005811c6688SIlya Dryomov err_out_unlock:
6006811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
600783a06263SAlex Elder 	return ret;
600883a06263SAlex Elder }
600983a06263SAlex Elder 
6010332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6011332bb12dSAlex Elder {
6012332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6013c41d13a3SIlya Dryomov 	int ret;
6014332bb12dSAlex Elder 
6015332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6016332bb12dSAlex Elder 
6017332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6018332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6019c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6020332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6021332bb12dSAlex Elder 	else
6022c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6023332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6024c41d13a3SIlya Dryomov 
6025c41d13a3SIlya Dryomov 	return ret;
6026332bb12dSAlex Elder }
6027332bb12dSAlex Elder 
6028200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6029200a6a8bSAlex Elder {
60306fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
60316fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
60326fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
60336fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
60346fd48b3bSAlex Elder 
6035200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
6036200a6a8bSAlex Elder }
6037200a6a8bSAlex Elder 
6038a30b71b9SAlex Elder /*
6039a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
60401f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
60411f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
60421f3ef788SAlex Elder  * object to get detailed information about the rbd image.
6043a30b71b9SAlex Elder  */
60446d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6045a30b71b9SAlex Elder {
6046a30b71b9SAlex Elder 	int ret;
6047a30b71b9SAlex Elder 
6048a30b71b9SAlex Elder 	/*
60493abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
60503abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
60513abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
60523abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6053a30b71b9SAlex Elder 	 */
6054a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6055a30b71b9SAlex Elder 	if (ret)
6056c0fba368SAlex Elder 		return ret;
6057c0fba368SAlex Elder 
6058332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6059332bb12dSAlex Elder 	if (ret)
6060332bb12dSAlex Elder 		goto err_out_format;
6061332bb12dSAlex Elder 
60626d69bb53SIlya Dryomov 	if (!depth) {
606399d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
60641fe48023SIlya Dryomov 		if (ret) {
60651fe48023SIlya Dryomov 			if (ret == -ENOENT)
60661fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
60671fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
60681fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6069c41d13a3SIlya Dryomov 			goto err_out_format;
60701f3ef788SAlex Elder 		}
60711fe48023SIlya Dryomov 	}
6072b644de2bSAlex Elder 
6073a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
60745655c4d9SAlex Elder 	if (ret)
6075b644de2bSAlex Elder 		goto err_out_watch;
6076a30b71b9SAlex Elder 
607704077599SIlya Dryomov 	/*
607804077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
607904077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
608004077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
608104077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
608204077599SIlya Dryomov 	 */
60836d69bb53SIlya Dryomov 	if (!depth)
608404077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
608504077599SIlya Dryomov 	else
608604077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
60871fe48023SIlya Dryomov 	if (ret) {
60881fe48023SIlya Dryomov 		if (ret == -ENOENT)
60891fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
60901fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
60911fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
60921fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
609333dca39fSAlex Elder 		goto err_out_probe;
60941fe48023SIlya Dryomov 	}
60959bb81c9bSAlex Elder 
6096e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6097e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6098e8f59b59SIlya Dryomov 		if (ret)
6099e8f59b59SIlya Dryomov 			goto err_out_probe;
6100e8f59b59SIlya Dryomov 
6101e8f59b59SIlya Dryomov 		/*
6102e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
6103e8f59b59SIlya Dryomov 		 * mapped and has a parent.
6104e8f59b59SIlya Dryomov 		 */
61056d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
6106e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
6107e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
6108e8f59b59SIlya Dryomov 	}
6109e8f59b59SIlya Dryomov 
61106d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
611130d60ba2SAlex Elder 	if (ret)
611230d60ba2SAlex Elder 		goto err_out_probe;
611383a06263SAlex Elder 
611430d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6115c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
611630d60ba2SAlex Elder 	return 0;
6117e8f59b59SIlya Dryomov 
61186fd48b3bSAlex Elder err_out_probe:
61196fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6120b644de2bSAlex Elder err_out_watch:
61216d69bb53SIlya Dryomov 	if (!depth)
612299d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6123332bb12dSAlex Elder err_out_format:
6124332bb12dSAlex Elder 	rbd_dev->image_format = 0;
61255655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
61265655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
61275655c4d9SAlex Elder 	return ret;
612883a06263SAlex Elder }
612983a06263SAlex Elder 
61309b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
613159c2be1eSYehuda Sadeh 			  const char *buf,
613259c2be1eSYehuda Sadeh 			  size_t count)
6133602adf40SYehuda Sadeh {
6134cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6135dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
61364e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6137859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
61389d3997fdSAlex Elder 	struct rbd_client *rbdc;
613951344a38SAlex Elder 	bool read_only;
6140b51c83c2SIlya Dryomov 	int rc;
6141602adf40SYehuda Sadeh 
6142602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6143602adf40SYehuda Sadeh 		return -ENODEV;
6144602adf40SYehuda Sadeh 
6145a725f65eSAlex Elder 	/* parse add command */
6146859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6147dc79b113SAlex Elder 	if (rc < 0)
6148dd5ac32dSIlya Dryomov 		goto out;
6149a725f65eSAlex Elder 
61509d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
61519d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
61529d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
61530ddebc0cSAlex Elder 		goto err_out_args;
61549d3997fdSAlex Elder 	}
6155602adf40SYehuda Sadeh 
6156602adf40SYehuda Sadeh 	/* pick the pool */
615730ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
61581fe48023SIlya Dryomov 	if (rc < 0) {
61591fe48023SIlya Dryomov 		if (rc == -ENOENT)
61601fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6161602adf40SYehuda Sadeh 		goto err_out_client;
61621fe48023SIlya Dryomov 	}
6163859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6164859c31dfSAlex Elder 
6165d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6166b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6167b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6168bd4ba655SAlex Elder 		goto err_out_client;
6169b51c83c2SIlya Dryomov 	}
6170c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6171c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6172d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6173602adf40SYehuda Sadeh 
61740d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
61750d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
61760d6d1e9cSMike Christie 		rc = -ENOMEM;
61770d6d1e9cSMike Christie 		goto err_out_rbd_dev;
61780d6d1e9cSMike Christie 	}
61790d6d1e9cSMike Christie 
6180811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
61816d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
61820d6d1e9cSMike Christie 	if (rc < 0) {
61830d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6184c53d5893SAlex Elder 		goto err_out_rbd_dev;
61850d6d1e9cSMike Christie 	}
618605fd6f6fSAlex Elder 
61877ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
61887ce4eef7SAlex Elder 
6189d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
61907ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
61917ce4eef7SAlex Elder 		read_only = true;
61927ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
61937ce4eef7SAlex Elder 
6194b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
61953abef3b3SAlex Elder 	if (rc) {
6196e37180c0SIlya Dryomov 		/*
619799d16943SIlya Dryomov 		 * rbd_unregister_watch() can't be moved into
6198e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
6199e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
6200e37180c0SIlya Dryomov 		 */
620199d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
62023abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
6203dd5ac32dSIlya Dryomov 		goto out;
62043abef3b3SAlex Elder 	}
62053abef3b3SAlex Elder 
6206dd5ac32dSIlya Dryomov 	rc = count;
6207dd5ac32dSIlya Dryomov out:
6208dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6209dd5ac32dSIlya Dryomov 	return rc;
6210b536f69aSAlex Elder 
6211c53d5893SAlex Elder err_out_rbd_dev:
6212c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6213bd4ba655SAlex Elder err_out_client:
62149d3997fdSAlex Elder 	rbd_put_client(rbdc);
62150ddebc0cSAlex Elder err_out_args:
6216859c31dfSAlex Elder 	rbd_spec_put(spec);
6217d147543dSIlya Dryomov 	kfree(rbd_opts);
6218dd5ac32dSIlya Dryomov 	goto out;
6219602adf40SYehuda Sadeh }
6220602adf40SYehuda Sadeh 
62219b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
62229b60e70bSIlya Dryomov 		       const char *buf,
62239b60e70bSIlya Dryomov 		       size_t count)
62249b60e70bSIlya Dryomov {
62259b60e70bSIlya Dryomov 	if (single_major)
62269b60e70bSIlya Dryomov 		return -EINVAL;
62279b60e70bSIlya Dryomov 
62289b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
62299b60e70bSIlya Dryomov }
62309b60e70bSIlya Dryomov 
62319b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
62329b60e70bSIlya Dryomov 				    const char *buf,
62339b60e70bSIlya Dryomov 				    size_t count)
62349b60e70bSIlya Dryomov {
62359b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
62369b60e70bSIlya Dryomov }
62379b60e70bSIlya Dryomov 
6238dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6239602adf40SYehuda Sadeh {
6240602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
62411643dfa4SIlya Dryomov 
62421643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62431643dfa4SIlya Dryomov 	list_del_init(&rbd_dev->node);
62441643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62451643dfa4SIlya Dryomov 
6246200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6247dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
62486d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
62499b60e70bSIlya Dryomov 	if (!single_major)
6250602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6251602adf40SYehuda Sadeh }
6252602adf40SYehuda Sadeh 
625305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
625405a46afdSAlex Elder {
6255ad945fc1SAlex Elder 	while (rbd_dev->parent) {
625605a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
625705a46afdSAlex Elder 		struct rbd_device *second = first->parent;
625805a46afdSAlex Elder 		struct rbd_device *third;
625905a46afdSAlex Elder 
626005a46afdSAlex Elder 		/*
626105a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
626205a46afdSAlex Elder 		 * remove it.
626305a46afdSAlex Elder 		 */
626405a46afdSAlex Elder 		while (second && (third = second->parent)) {
626505a46afdSAlex Elder 			first = second;
626605a46afdSAlex Elder 			second = third;
626705a46afdSAlex Elder 		}
6268ad945fc1SAlex Elder 		rbd_assert(second);
62698ad42cd0SAlex Elder 		rbd_dev_image_release(second);
6270ad945fc1SAlex Elder 		first->parent = NULL;
6271ad945fc1SAlex Elder 		first->parent_overlap = 0;
6272ad945fc1SAlex Elder 
6273ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
627405a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
627505a46afdSAlex Elder 		first->parent_spec = NULL;
627605a46afdSAlex Elder 	}
627705a46afdSAlex Elder }
627805a46afdSAlex Elder 
62799b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6280602adf40SYehuda Sadeh 			     const char *buf,
6281602adf40SYehuda Sadeh 			     size_t count)
6282602adf40SYehuda Sadeh {
6283602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6284751cc0e3SAlex Elder 	struct list_head *tmp;
6285751cc0e3SAlex Elder 	int dev_id;
62860276dca6SMike Christie 	char opt_buf[6];
628782a442d2SAlex Elder 	bool already = false;
62880276dca6SMike Christie 	bool force = false;
62890d8189e1SAlex Elder 	int ret;
6290602adf40SYehuda Sadeh 
62910276dca6SMike Christie 	dev_id = -1;
62920276dca6SMike Christie 	opt_buf[0] = '\0';
62930276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
62940276dca6SMike Christie 	if (dev_id < 0) {
62950276dca6SMike Christie 		pr_err("dev_id out of range\n");
6296602adf40SYehuda Sadeh 		return -EINVAL;
62970276dca6SMike Christie 	}
62980276dca6SMike Christie 	if (opt_buf[0] != '\0') {
62990276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
63000276dca6SMike Christie 			force = true;
63010276dca6SMike Christie 		} else {
63020276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
63030276dca6SMike Christie 			return -EINVAL;
63040276dca6SMike Christie 		}
63050276dca6SMike Christie 	}
6306602adf40SYehuda Sadeh 
6307602adf40SYehuda Sadeh 	ret = -ENOENT;
6308751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6309751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6310751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6311751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6312751cc0e3SAlex Elder 			ret = 0;
6313751cc0e3SAlex Elder 			break;
6314602adf40SYehuda Sadeh 		}
6315751cc0e3SAlex Elder 	}
6316751cc0e3SAlex Elder 	if (!ret) {
6317a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
63180276dca6SMike Christie 		if (rbd_dev->open_count && !force)
631942382b70SAlex Elder 			ret = -EBUSY;
6320b82d167bSAlex Elder 		else
632182a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
632282a442d2SAlex Elder 							&rbd_dev->flags);
6323a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6324751cc0e3SAlex Elder 	}
6325751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
632682a442d2SAlex Elder 	if (ret < 0 || already)
63271ba0f1e7SAlex Elder 		return ret;
6328751cc0e3SAlex Elder 
63290276dca6SMike Christie 	if (force) {
63300276dca6SMike Christie 		/*
63310276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
63320276dca6SMike Christie 		 * IO to complete/fail.
63330276dca6SMike Christie 		 */
63340276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
63350276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
63360276dca6SMike Christie 	}
63370276dca6SMike Christie 
6338ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6339ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6340ed95b21aSIlya Dryomov 		rbd_unlock(rbd_dev);
6341ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
634299d16943SIlya Dryomov 	rbd_unregister_watch(rbd_dev);
6343fca27065SIlya Dryomov 
63449875201eSJosh Durgin 	/*
63459875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
63469875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
63479875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
63489875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
63499875201eSJosh Durgin 	 */
6350dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
63518ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
6352aafb230eSAlex Elder 
63531ba0f1e7SAlex Elder 	return count;
6354602adf40SYehuda Sadeh }
6355602adf40SYehuda Sadeh 
63569b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
63579b60e70bSIlya Dryomov 			  const char *buf,
63589b60e70bSIlya Dryomov 			  size_t count)
63599b60e70bSIlya Dryomov {
63609b60e70bSIlya Dryomov 	if (single_major)
63619b60e70bSIlya Dryomov 		return -EINVAL;
63629b60e70bSIlya Dryomov 
63639b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
63649b60e70bSIlya Dryomov }
63659b60e70bSIlya Dryomov 
/*
 * Store-style handler that removes a device without checking
 * single_major; it simply forwards @bus, @buf and @count to
 * do_rbd_remove() and returns its result.
 */
static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf, size_t count)
{
	return do_rbd_remove(bus, buf, count);
}
63729b60e70bSIlya Dryomov 
6373602adf40SYehuda Sadeh /*
6374602adf40SYehuda Sadeh  * create control files in sysfs
6375dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6376602adf40SYehuda Sadeh  */
6377602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
6378602adf40SYehuda Sadeh {
6379dfc5606dSYehuda Sadeh 	int ret;
6380602adf40SYehuda Sadeh 
6381fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6382dfc5606dSYehuda Sadeh 	if (ret < 0)
6383dfc5606dSYehuda Sadeh 		return ret;
6384602adf40SYehuda Sadeh 
6385fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6386fed4c143SAlex Elder 	if (ret < 0)
6387fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6388602adf40SYehuda Sadeh 
6389602adf40SYehuda Sadeh 	return ret;
6390602adf40SYehuda Sadeh }
6391602adf40SYehuda Sadeh 
6392602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
6393602adf40SYehuda Sadeh {
6394dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6395fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6396602adf40SYehuda Sadeh }
6397602adf40SYehuda Sadeh 
63981c2a9dfeSAlex Elder static int rbd_slab_init(void)
63991c2a9dfeSAlex Elder {
64001c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
640103d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6402868311b1SAlex Elder 	if (!rbd_img_request_cache)
6403868311b1SAlex Elder 		return -ENOMEM;
6404868311b1SAlex Elder 
6405868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
640603d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
640778c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
640878c2a44aSAlex Elder 		goto out_err;
640978c2a44aSAlex Elder 
641078c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
641178c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
64122d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
641378c2a44aSAlex Elder 	if (rbd_segment_name_cache)
64141c2a9dfeSAlex Elder 		return 0;
641578c2a44aSAlex Elder out_err:
641678c2a44aSAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
641778c2a44aSAlex Elder 	rbd_obj_request_cache = NULL;
64181c2a9dfeSAlex Elder 
6419868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6420868311b1SAlex Elder 	rbd_img_request_cache = NULL;
6421868311b1SAlex Elder 
64221c2a9dfeSAlex Elder 	return -ENOMEM;
64231c2a9dfeSAlex Elder }
64241c2a9dfeSAlex Elder 
64251c2a9dfeSAlex Elder static void rbd_slab_exit(void)
64261c2a9dfeSAlex Elder {
642778c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
642878c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
642978c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
643078c2a44aSAlex Elder 
6431868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6432868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6433868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6434868311b1SAlex Elder 
64351c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
64361c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
64371c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
64381c2a9dfeSAlex Elder }
64391c2a9dfeSAlex Elder 
6440cc344fa1SAlex Elder static int __init rbd_init(void)
6441602adf40SYehuda Sadeh {
6442602adf40SYehuda Sadeh 	int rc;
6443602adf40SYehuda Sadeh 
64441e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
64451e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
64461e32d34cSAlex Elder 		return -EINVAL;
64471e32d34cSAlex Elder 	}
6448e1b4d96dSIlya Dryomov 
64491c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6450602adf40SYehuda Sadeh 	if (rc)
6451602adf40SYehuda Sadeh 		return rc;
6452e1b4d96dSIlya Dryomov 
6453f5ee37bdSIlya Dryomov 	/*
6454f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6455f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6456f5ee37bdSIlya Dryomov 	 */
6457f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6458f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6459f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6460f5ee37bdSIlya Dryomov 		goto err_out_slab;
6461f5ee37bdSIlya Dryomov 	}
6462f5ee37bdSIlya Dryomov 
64639b60e70bSIlya Dryomov 	if (single_major) {
64649b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
64659b60e70bSIlya Dryomov 		if (rbd_major < 0) {
64669b60e70bSIlya Dryomov 			rc = rbd_major;
6467f5ee37bdSIlya Dryomov 			goto err_out_wq;
64689b60e70bSIlya Dryomov 		}
64699b60e70bSIlya Dryomov 	}
64709b60e70bSIlya Dryomov 
64711c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
64721c2a9dfeSAlex Elder 	if (rc)
64739b60e70bSIlya Dryomov 		goto err_out_blkdev;
64741c2a9dfeSAlex Elder 
64759b60e70bSIlya Dryomov 	if (single_major)
64769b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
64779b60e70bSIlya Dryomov 	else
6478e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
64799b60e70bSIlya Dryomov 
6480e1b4d96dSIlya Dryomov 	return 0;
6481e1b4d96dSIlya Dryomov 
64829b60e70bSIlya Dryomov err_out_blkdev:
64839b60e70bSIlya Dryomov 	if (single_major)
64849b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6485f5ee37bdSIlya Dryomov err_out_wq:
6486f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6487e1b4d96dSIlya Dryomov err_out_slab:
6488e1b4d96dSIlya Dryomov 	rbd_slab_exit();
64891c2a9dfeSAlex Elder 	return rc;
6490602adf40SYehuda Sadeh }
6491602adf40SYehuda Sadeh 
6492cc344fa1SAlex Elder static void __exit rbd_exit(void)
6493602adf40SYehuda Sadeh {
6494ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6495602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
64969b60e70bSIlya Dryomov 	if (single_major)
64979b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6498f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
64991c2a9dfeSAlex Elder 	rbd_slab_exit();
6500602adf40SYehuda Sadeh }
6501602adf40SYehuda Sadeh 
6502602adf40SYehuda Sadeh module_init(rbd_init);
6503602adf40SYehuda Sadeh module_exit(rbd_exit);
6504602adf40SYehuda Sadeh 
6505d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6506602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6507602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6508602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6509602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6510602adf40SYehuda Sadeh 
651190da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6512602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6513