xref: /openbmc/linux/drivers/block/rbd.c (revision 8b679ec5)
1e2a58ee5SAlex Elder 
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
35602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3659c2be1eSYehuda Sadeh #include <linux/parser.h>
3730d1cff8SAlex Elder #include <linux/bsearch.h>
38602adf40SYehuda Sadeh 
39602adf40SYehuda Sadeh #include <linux/kernel.h>
40602adf40SYehuda Sadeh #include <linux/device.h>
41602adf40SYehuda Sadeh #include <linux/module.h>
427ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
43602adf40SYehuda Sadeh #include <linux/fs.h>
44602adf40SYehuda Sadeh #include <linux/blkdev.h>
451c2a9dfeSAlex Elder #include <linux/slab.h>
46f8a22fc2SIlya Dryomov #include <linux/idr.h>
47bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
48602adf40SYehuda Sadeh 
49602adf40SYehuda Sadeh #include "rbd_types.h"
50602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() calls in this file. */
#define RBD_DEBUG
52aafb230eSAlex Elder 
/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but the default size is
 * universally 512 bytes.  These names are only slightly more
 * meaningful than the bare numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
61593a9e7bSAlex Elder 
62a2acd00eSAlex Elder /*
63a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
64a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
65a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
66a2acd00eSAlex Elder  * -EINVAL without updating it.
67a2acd00eSAlex Elder  */
68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
69a2acd00eSAlex Elder {
70a2acd00eSAlex Elder 	unsigned int counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
73a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
74a2acd00eSAlex Elder 		return (int)counter;
75a2acd00eSAlex Elder 
76a2acd00eSAlex Elder 	atomic_dec(v);
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	return -EINVAL;
79a2acd00eSAlex Elder }
80a2acd00eSAlex Elder 
81a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
83a2acd00eSAlex Elder {
84a2acd00eSAlex Elder 	int counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
87a2acd00eSAlex Elder 	if (counter >= 0)
88a2acd00eSAlex Elder 		return counter;
89a2acd00eSAlex Elder 
90a2acd00eSAlex Elder 	atomic_inc(v);
91a2acd00eSAlex Elder 
92a2acd00eSAlex Elder 	return -EINVAL;
93a2acd00eSAlex Elder }
94a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* NAME_MAX less the prefix length (the prefix's NUL is not counted) */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* Chosen so that the largest snapshot context fits in 4KB */
#define RBD_MAX_SNAP_COUNT	510

#define RBD_SNAP_HEAD_NAME	"-"

/* Marks an invalid index into the snapshot array */
#define BAD_SNAP_INDEX	U32_MAX

/* Lets a single page hold an image name sent by the OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Notify timeout, in seconds */
#define RBD_NOTIFY_TIMEOUT	5
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

12099d16943SIlya Dryomov 
/* Image feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL)

/* The features this (client software) implementation supports */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136d889140cSAlex Elder 
/*
 * RBD device names take the form "rbd#": "rbd" is RBD_DRV_NAME
 * (defined above) and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh  */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
148849b4260SAlex Elder 	char *object_prefix;
149602adf40SYehuda Sadeh 	__u8 obj_order;
150f35a4deeSAlex Elder 	u64 stripe_unit;
151f35a4deeSAlex Elder 	u64 stripe_count;
1527e97332eSIlya Dryomov 	s64 data_pool_id;
153f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
154602adf40SYehuda Sadeh 
155f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder 	u64 image_size;
157f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
159f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh 
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder  * An rbd image specification.
1640d7dbfceSAlex Elder  *
1650d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
170c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
172c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder  * is shared between the parent and child).
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder  *
184c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder  * could be a null pointer).
1860d7dbfceSAlex Elder  */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder 	u64		pool_id;
189ecb4dc22SAlex Elder 	const char	*pool_name;
1900d7dbfceSAlex Elder 
191ecb4dc22SAlex Elder 	const char	*image_id;
192ecb4dc22SAlex Elder 	const char	*image_name;
1930d7dbfceSAlex Elder 
1940d7dbfceSAlex Elder 	u64		snap_id;
195ecb4dc22SAlex Elder 	const char	*snap_name;
1960d7dbfceSAlex Elder 
1970d7dbfceSAlex Elder 	struct kref	kref;
1980d7dbfceSAlex Elder };
1990d7dbfceSAlex Elder 
200602adf40SYehuda Sadeh /*
201f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
202602adf40SYehuda Sadeh  */
203602adf40SYehuda Sadeh struct rbd_client {
204602adf40SYehuda Sadeh 	struct ceph_client	*client;
205602adf40SYehuda Sadeh 	struct kref		kref;
206602adf40SYehuda Sadeh 	struct list_head	node;
207602adf40SYehuda Sadeh };
208602adf40SYehuda Sadeh 
/* Callback invoked with the image request it was registered on */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback invoked with the object request it was registered on */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
216bf0d5f50SAlex Elder 
/* Values for the type field of struct rbd_obj_request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
220bf0d5f50SAlex Elder 
/* Kinds of object operation: write, read or discard */
enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2266d2940c8SGuangliang Zhao 
/* Flag values for object requests; each is documented as 0 / 1 */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* object usage: 0 = standalone, 1 = image */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* target exists: 0 = no, 1 = yes */
};
233926f9b3fSAlex Elder 
234bf0d5f50SAlex Elder struct rbd_obj_request {
235a90bb0c1SIlya Dryomov 	u64			object_no;
236bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
237bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
238926f9b3fSAlex Elder 	unsigned long		flags;
239bf0d5f50SAlex Elder 
240c5b5ef6cSAlex Elder 	/*
241c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
242c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
243c5b5ef6cSAlex Elder 	 *
244c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
245c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
246c5b5ef6cSAlex Elder 	 *
247c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
248c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
249c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
250c5b5ef6cSAlex Elder 	 *
251c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
252c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
253c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
254c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
255c5b5ef6cSAlex Elder 	 */
256c5b5ef6cSAlex Elder 	union {
257c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
258c5b5ef6cSAlex Elder 		struct {
259bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
260c5b5ef6cSAlex Elder 			u64			img_offset;
261c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
262c5b5ef6cSAlex Elder 			struct list_head	links;
263c5b5ef6cSAlex Elder 		};
264c5b5ef6cSAlex Elder 	};
265bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	enum obj_request_type	type;
268788e2df3SAlex Elder 	union {
269bf0d5f50SAlex Elder 		struct bio	*bio_list;
270788e2df3SAlex Elder 		struct {
271788e2df3SAlex Elder 			struct page	**pages;
272788e2df3SAlex Elder 			u32		page_count;
273788e2df3SAlex Elder 		};
274788e2df3SAlex Elder 	};
2750eefd470SAlex Elder 	struct page		**copyup_pages;
276ebda6408SAlex Elder 	u32			copyup_page_count;
277bf0d5f50SAlex Elder 
278bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
279bf0d5f50SAlex Elder 
280bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2811b83bef2SSage Weil 	int			result;
282bf0d5f50SAlex Elder 
283bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
284788e2df3SAlex Elder 	struct completion	completion;
285bf0d5f50SAlex Elder 
286bf0d5f50SAlex Elder 	struct kref		kref;
287bf0d5f50SAlex Elder };
288bf0d5f50SAlex Elder 
/* Flag values for image requests; each is documented as 0 / 1 */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
	IMG_REQ_DISCARD,	/* discard: 0 = normal, 1 = discard request */
};
2950c425248SAlex Elder 
296bf0d5f50SAlex Elder struct rbd_img_request {
297bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
298bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
299bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
3000c425248SAlex Elder 	unsigned long		flags;
301bf0d5f50SAlex Elder 	union {
302bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3039849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3049849e986SAlex Elder 	};
3059849e986SAlex Elder 	union {
3069849e986SAlex Elder 		struct request		*rq;		/* block request */
3079849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
308bf0d5f50SAlex Elder 	};
3093d7efd18SAlex Elder 	struct page		**copyup_pages;
310ebda6408SAlex Elder 	u32			copyup_page_count;
311bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
312bf0d5f50SAlex Elder 	u32			next_completion;
313bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
31455f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
315a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
316bf0d5f50SAlex Elder 
317bf0d5f50SAlex Elder 	u32			obj_request_count;
318bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
319bf0d5f50SAlex Elder 
320bf0d5f50SAlex Elder 	struct kref		kref;
321bf0d5f50SAlex Elder };
322bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the _safe variant walks the list in
 * reverse (it is built on list_for_each_entry_safe_reverse()).
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
329bf0d5f50SAlex Elder 
/* Values held in rbd_device.watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
33599d16943SIlya Dryomov 
/* Values held in rbd_device.lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
341ed95b21aSIlya Dryomov 
342ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
343ed95b21aSIlya Dryomov struct rbd_client_id {
344ed95b21aSIlya Dryomov 	u64 gid;
345ed95b21aSIlya Dryomov 	u64 handle;
346ed95b21aSIlya Dryomov };
347ed95b21aSIlya Dryomov 
348f84344f3SAlex Elder struct rbd_mapping {
34999c1f08fSAlex Elder 	u64                     size;
35034b13184SAlex Elder 	u64                     features;
351f84344f3SAlex Elder 	bool			read_only;
352f84344f3SAlex Elder };
353f84344f3SAlex Elder 
354602adf40SYehuda Sadeh /*
355602adf40SYehuda Sadeh  * a single device
356602adf40SYehuda Sadeh  */
357602adf40SYehuda Sadeh struct rbd_device {
358de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
359602adf40SYehuda Sadeh 
360602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
361dd82fff1SIlya Dryomov 	int			minor;
362602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
363602adf40SYehuda Sadeh 
364a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
365602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
366602adf40SYehuda Sadeh 
367602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
368602adf40SYehuda Sadeh 
369b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
370602adf40SYehuda Sadeh 
371602adf40SYehuda Sadeh 	struct rbd_image_header	header;
372b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3730d7dbfceSAlex Elder 	struct rbd_spec		*spec;
374d147543dSIlya Dryomov 	struct rbd_options	*opts;
3750d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
376602adf40SYehuda Sadeh 
377c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
378922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
379971f839aSAlex Elder 
3801643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3810903e875SAlex Elder 
38299d16943SIlya Dryomov 	struct mutex		watch_mutex;
38399d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
384922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
38599d16943SIlya Dryomov 	u64			watch_cookie;
38699d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
38759c2be1eSYehuda Sadeh 
388ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
389ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
390ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
391ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
392ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
393ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
394ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
395ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
396ed95b21aSIlya Dryomov 
3971643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
398602adf40SYehuda Sadeh 
39986b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
40086b00e0dSAlex Elder 	u64			parent_overlap;
401a2acd00eSAlex Elder 	atomic_t		parent_ref;
4022f82ee54SAlex Elder 	struct rbd_device	*parent;
40386b00e0dSAlex Elder 
4047ad18afaSChristoph Hellwig 	/* Block layer tags. */
4057ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4067ad18afaSChristoph Hellwig 
407c666601aSJosh Durgin 	/* protects updating the header */
408c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
409f84344f3SAlex Elder 
410f84344f3SAlex Elder 	struct rbd_mapping	mapping;
411602adf40SYehuda Sadeh 
412602adf40SYehuda Sadeh 	struct list_head	node;
413dfc5606dSYehuda Sadeh 
414dfc5606dSYehuda Sadeh 	/* sysfs related */
415dfc5606dSYehuda Sadeh 	struct device		dev;
416b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
417dfc5606dSYehuda Sadeh };
418dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags:
 * - REMOVING is coupled with rbd_dev->open_count and, like it, is
 *   protected by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
4306d292906SAlex Elder 
/* Serializes client creation */
static DEFINE_MUTEX(client_mutex);

/* List of devices, with its lock */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* List of clients, with its lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for the structures that are allocated frequently */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;
448f5ee37bdSIlya Dryomov 
4499b60e70bSIlya Dryomov /*
4509b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4519b60e70bSIlya Dryomov  * userspace rbd utility.
4529b60e70bSIlya Dryomov  */
4539b60e70bSIlya Dryomov static bool single_major = false;
4549b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4559b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4569b60e70bSIlya Dryomov 
4573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4583d7efd18SAlex Elder 
459f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
460f0f8cef5SAlex Elder 		       size_t count);
461f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
462f0f8cef5SAlex Elder 			  size_t count);
4639b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4649b60e70bSIlya Dryomov 				    size_t count);
4659b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4669b60e70bSIlya Dryomov 				       size_t count);
4676d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
468a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
469f0f8cef5SAlex Elder 
4709b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4719b60e70bSIlya Dryomov {
4727e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4739b60e70bSIlya Dryomov }
4749b60e70bSIlya Dryomov 
4759b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4769b60e70bSIlya Dryomov {
4777e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4789b60e70bSIlya Dryomov }
4799b60e70bSIlya Dryomov 
480ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
481ed95b21aSIlya Dryomov {
482ed95b21aSIlya Dryomov 	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
483ed95b21aSIlya Dryomov 	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
484ed95b21aSIlya Dryomov 	       !rbd_dev->mapping.read_only;
485ed95b21aSIlya Dryomov }
486ed95b21aSIlya Dryomov 
487ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
488ed95b21aSIlya Dryomov {
489ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
490ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
491ed95b21aSIlya Dryomov }
492ed95b21aSIlya Dryomov 
493ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
494ed95b21aSIlya Dryomov {
495ed95b21aSIlya Dryomov 	bool is_lock_owner;
496ed95b21aSIlya Dryomov 
497ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
498ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
499ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
500ed95b21aSIlya Dryomov 	return is_lock_owner;
501ed95b21aSIlya Dryomov }
502ed95b21aSIlya Dryomov 
5038767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
5048767b293SIlya Dryomov {
5058767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5068767b293SIlya Dryomov }
5078767b293SIlya Dryomov 
508b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
509b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
5109b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
5119b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
5128767b293SIlya Dryomov static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
513b15a21ddSGreg Kroah-Hartman 
514b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
515b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
516b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5179b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5189b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5198767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
520b15a21ddSGreg Kroah-Hartman 	NULL,
521f0f8cef5SAlex Elder };
52292c76dc0SIlya Dryomov 
52392c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
52492c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
52592c76dc0SIlya Dryomov {
5269b60e70bSIlya Dryomov 	if (!single_major &&
5279b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5289b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5299b60e70bSIlya Dryomov 		return 0;
5309b60e70bSIlya Dryomov 
53192c76dc0SIlya Dryomov 	return attr->mode;
53292c76dc0SIlya Dryomov }
53392c76dc0SIlya Dryomov 
53492c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
53592c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
53692c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
53792c76dc0SIlya Dryomov };
53892c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
539f0f8cef5SAlex Elder 
540f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
541f0f8cef5SAlex Elder 	.name		= "rbd",
542b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
543f0f8cef5SAlex Elder };
544f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing. */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to release */
}
548f0f8cef5SAlex Elder 
549f0f8cef5SAlex Elder static struct device rbd_root_dev = {
550f0f8cef5SAlex Elder 	.init_name =    "rbd",
551f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
552f0f8cef5SAlex Elder };
553f0f8cef5SAlex Elder 
55406ecc6cbSAlex Elder static __printf(2, 3)
55506ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
55606ecc6cbSAlex Elder {
55706ecc6cbSAlex Elder 	struct va_format vaf;
55806ecc6cbSAlex Elder 	va_list args;
55906ecc6cbSAlex Elder 
56006ecc6cbSAlex Elder 	va_start(args, fmt);
56106ecc6cbSAlex Elder 	vaf.fmt = fmt;
56206ecc6cbSAlex Elder 	vaf.va = &args;
56306ecc6cbSAlex Elder 
56406ecc6cbSAlex Elder 	if (!rbd_dev)
56506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
56606ecc6cbSAlex Elder 	else if (rbd_dev->disk)
56706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
56806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
56906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
57006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
57106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
57206ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
57306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
57406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
57506ecc6cbSAlex Elder 	else	/* punt */
57606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
57706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
57806ecc6cbSAlex Elder 	va_end(args);
57906ecc6cbSAlex Elder }
58006ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - check an invariant.
 *
 * With RBD_DEBUG defined, a false expr prints the enclosing function,
 * the line number and the text of the expression at KERN_ERR level and
 * then calls BUG().  Without RBD_DEBUG the macro expands to a no-op
 * and expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves as
 * a single statement: "if (x) rbd_assert(y); else ..." parses as
 * written rather than breaking on, or capturing, the else.  Callers
 * must terminate the macro with a semicolon (the non-debug form has
 * always required one).
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
593dfc5606dSYehuda Sadeh 
5942761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
595b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
59605a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
59705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5988b3e1a56SAlex Elder 
599cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
6002df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
601a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
602e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
60354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
60454cac61fSAlex Elder 					u64 snap_id);
6052ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
6062ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
6072ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
6082ad3d716SAlex Elder 		u64 *snap_features);
60959c2be1eSYehuda Sadeh 
610602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
611602adf40SYehuda Sadeh {
612f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
613b82d167bSAlex Elder 	bool removing = false;
614602adf40SYehuda Sadeh 
615f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
616602adf40SYehuda Sadeh 		return -EROFS;
617602adf40SYehuda Sadeh 
618a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
619b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
620b82d167bSAlex Elder 		removing = true;
621b82d167bSAlex Elder 	else
622b82d167bSAlex Elder 		rbd_dev->open_count++;
623a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
624b82d167bSAlex Elder 	if (removing)
625b82d167bSAlex Elder 		return -ENOENT;
626b82d167bSAlex Elder 
627c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
628340c7a2bSAlex Elder 
629602adf40SYehuda Sadeh 	return 0;
630602adf40SYehuda Sadeh }
631602adf40SYehuda Sadeh 
632db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
633dfc5606dSYehuda Sadeh {
634dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
635b82d167bSAlex Elder 	unsigned long open_count_before;
636b82d167bSAlex Elder 
637a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
638b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
639a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
640b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
641dfc5606dSYehuda Sadeh 
642c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
643dfc5606dSYehuda Sadeh }
644dfc5606dSYehuda Sadeh 
645131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
646131fd9f6SGuangliang Zhao {
64777f33c03SJosh Durgin 	int ret = 0;
648131fd9f6SGuangliang Zhao 	int val;
649131fd9f6SGuangliang Zhao 	bool ro;
65077f33c03SJosh Durgin 	bool ro_changed = false;
651131fd9f6SGuangliang Zhao 
65277f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
653131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
654131fd9f6SGuangliang Zhao 		return -EFAULT;
655131fd9f6SGuangliang Zhao 
656131fd9f6SGuangliang Zhao 	ro = val ? true : false;
657131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
658131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
659131fd9f6SGuangliang Zhao 		return -EROFS;
660131fd9f6SGuangliang Zhao 
66177f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
66277f33c03SJosh Durgin 	/* prevent others open this device */
66377f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
66477f33c03SJosh Durgin 		ret = -EBUSY;
66577f33c03SJosh Durgin 		goto out;
666131fd9f6SGuangliang Zhao 	}
667131fd9f6SGuangliang Zhao 
66877f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
66977f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
67077f33c03SJosh Durgin 		ro_changed = true;
67177f33c03SJosh Durgin 	}
67277f33c03SJosh Durgin 
67377f33c03SJosh Durgin out:
67477f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
67577f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
67677f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
67777f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
67877f33c03SJosh Durgin 
67977f33c03SJosh Durgin 	return ret;
680131fd9f6SGuangliang Zhao }
681131fd9f6SGuangliang Zhao 
682131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
683131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
684131fd9f6SGuangliang Zhao {
685131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
686131fd9f6SGuangliang Zhao 	int ret = 0;
687131fd9f6SGuangliang Zhao 
688131fd9f6SGuangliang Zhao 	switch (cmd) {
689131fd9f6SGuangliang Zhao 	case BLKROSET:
690131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
691131fd9f6SGuangliang Zhao 		break;
692131fd9f6SGuangliang Zhao 	default:
693131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
694131fd9f6SGuangliang Zhao 	}
695131fd9f6SGuangliang Zhao 
696131fd9f6SGuangliang Zhao 	return ret;
697131fd9f6SGuangliang Zhao }
698131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards unchanged to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
706131fd9f6SGuangliang Zhao 
707602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
708602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
709602adf40SYehuda Sadeh 	.open			= rbd_open,
710dfc5606dSYehuda Sadeh 	.release		= rbd_release,
711131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
712131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
713131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
714131fd9f6SGuangliang Zhao #endif
715602adf40SYehuda Sadeh };
716602adf40SYehuda Sadeh 
717602adf40SYehuda Sadeh /*
7187262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
719cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
720602adf40SYehuda Sadeh  */
721f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
722602adf40SYehuda Sadeh {
723602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
724602adf40SYehuda Sadeh 	int ret = -ENOMEM;
725602adf40SYehuda Sadeh 
72637206ee5SAlex Elder 	dout("%s:\n", __func__);
727602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
728602adf40SYehuda Sadeh 	if (!rbdc)
729602adf40SYehuda Sadeh 		goto out_opt;
730602adf40SYehuda Sadeh 
731602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
732602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
733602adf40SYehuda Sadeh 
73474da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
735602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
73608f75463SAlex Elder 		goto out_rbdc;
73743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
738602adf40SYehuda Sadeh 
739602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
740602adf40SYehuda Sadeh 	if (ret < 0)
74108f75463SAlex Elder 		goto out_client;
742602adf40SYehuda Sadeh 
743432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
744602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
745432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
746602adf40SYehuda Sadeh 
74737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
748bc534d86SAlex Elder 
749602adf40SYehuda Sadeh 	return rbdc;
75008f75463SAlex Elder out_client:
751602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
75208f75463SAlex Elder out_rbdc:
753602adf40SYehuda Sadeh 	kfree(rbdc);
754602adf40SYehuda Sadeh out_opt:
75543ae4701SAlex Elder 	if (ceph_opts)
75643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
75737206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
75837206ee5SAlex Elder 
75928f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
760602adf40SYehuda Sadeh }
761602adf40SYehuda Sadeh 
7622f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7632f82ee54SAlex Elder {
7642f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7652f82ee54SAlex Elder 
7662f82ee54SAlex Elder 	return rbdc;
7672f82ee54SAlex Elder }
7682f82ee54SAlex Elder 
769602adf40SYehuda Sadeh /*
7701f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7711f7ba331SAlex Elder  * found, bump its reference count.
772602adf40SYehuda Sadeh  */
7731f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
774602adf40SYehuda Sadeh {
775602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7761f7ba331SAlex Elder 	bool found = false;
777602adf40SYehuda Sadeh 
77843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
779602adf40SYehuda Sadeh 		return NULL;
780602adf40SYehuda Sadeh 
7811f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7821f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7831f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7842f82ee54SAlex Elder 			__rbd_get_client(client_node);
7852f82ee54SAlex Elder 
7861f7ba331SAlex Elder 			found = true;
7871f7ba331SAlex Elder 			break;
7881f7ba331SAlex Elder 		}
7891f7ba331SAlex Elder 	}
7901f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7911f7ba331SAlex Elder 
7921f7ba331SAlex Elder 	return found ? client_node : NULL;
793602adf40SYehuda Sadeh }
794602adf40SYehuda Sadeh 
795602adf40SYehuda Sadeh /*
796210c104cSIlya Dryomov  * (Per device) rbd map options
79759c2be1eSYehuda Sadeh  */
79859c2be1eSYehuda Sadeh enum {
799b5584180SIlya Dryomov 	Opt_queue_depth,
80059c2be1eSYehuda Sadeh 	Opt_last_int,
80159c2be1eSYehuda Sadeh 	/* int args above */
80259c2be1eSYehuda Sadeh 	Opt_last_string,
80359c2be1eSYehuda Sadeh 	/* string args above */
804cc0538b6SAlex Elder 	Opt_read_only,
805cc0538b6SAlex Elder 	Opt_read_write,
80680de1912SIlya Dryomov 	Opt_lock_on_read,
807210c104cSIlya Dryomov 	Opt_err
80859c2be1eSYehuda Sadeh };
80959c2be1eSYehuda Sadeh 
81043ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
811b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
81259c2be1eSYehuda Sadeh 	/* int args above */
81359c2be1eSYehuda Sadeh 	/* string args above */
814be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
815cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
816cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
817cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
81880de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
819210c104cSIlya Dryomov 	{Opt_err, NULL}
82059c2be1eSYehuda Sadeh };
82159c2be1eSYehuda Sadeh 
82298571b5aSAlex Elder struct rbd_options {
823b5584180SIlya Dryomov 	int	queue_depth;
82498571b5aSAlex Elder 	bool	read_only;
82580de1912SIlya Dryomov 	bool	lock_on_read;
82698571b5aSAlex Elder };
82798571b5aSAlex Elder 
828b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
82998571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
83080de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false
83198571b5aSAlex Elder 
83259c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
83359c2be1eSYehuda Sadeh {
83443ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
83559c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
83659c2be1eSYehuda Sadeh 	int token, intval, ret;
83759c2be1eSYehuda Sadeh 
83843ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
83959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
84059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
84159c2be1eSYehuda Sadeh 		if (ret < 0) {
842210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
84359c2be1eSYehuda Sadeh 			return ret;
84459c2be1eSYehuda Sadeh 		}
84559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
84659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
847210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
84859c2be1eSYehuda Sadeh 	} else {
84959c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
85059c2be1eSYehuda Sadeh 	}
85159c2be1eSYehuda Sadeh 
85259c2be1eSYehuda Sadeh 	switch (token) {
853b5584180SIlya Dryomov 	case Opt_queue_depth:
854b5584180SIlya Dryomov 		if (intval < 1) {
855b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
856b5584180SIlya Dryomov 			return -EINVAL;
857b5584180SIlya Dryomov 		}
858b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
859b5584180SIlya Dryomov 		break;
860cc0538b6SAlex Elder 	case Opt_read_only:
861cc0538b6SAlex Elder 		rbd_opts->read_only = true;
862cc0538b6SAlex Elder 		break;
863cc0538b6SAlex Elder 	case Opt_read_write:
864cc0538b6SAlex Elder 		rbd_opts->read_only = false;
865cc0538b6SAlex Elder 		break;
86680de1912SIlya Dryomov 	case Opt_lock_on_read:
86780de1912SIlya Dryomov 		rbd_opts->lock_on_read = true;
86880de1912SIlya Dryomov 		break;
86959c2be1eSYehuda Sadeh 	default:
870210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
871210c104cSIlya Dryomov 		return -EINVAL;
87259c2be1eSYehuda Sadeh 	}
873210c104cSIlya Dryomov 
87459c2be1eSYehuda Sadeh 	return 0;
87559c2be1eSYehuda Sadeh }
87659c2be1eSYehuda Sadeh 
8776d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8786d2940c8SGuangliang Zhao {
8796d2940c8SGuangliang Zhao 	switch (op_type) {
8806d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8816d2940c8SGuangliang Zhao 		return "read";
8826d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8836d2940c8SGuangliang Zhao 		return "write";
88490e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
88590e98c52SGuangliang Zhao 		return "discard";
8866d2940c8SGuangliang Zhao 	default:
8876d2940c8SGuangliang Zhao 		return "???";
8886d2940c8SGuangliang Zhao 	}
8896d2940c8SGuangliang Zhao }
8906d2940c8SGuangliang Zhao 
89159c2be1eSYehuda Sadeh /*
892602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8937262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8947262cfcaSAlex Elder  * function.
895602adf40SYehuda Sadeh  */
8969d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
897602adf40SYehuda Sadeh {
898f8c38929SAlex Elder 	struct rbd_client *rbdc;
89959c2be1eSYehuda Sadeh 
900cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
9011f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
9029d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
90343ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
9049d3997fdSAlex Elder 	else
905f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
906cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
907d720bcb0SAlex Elder 
9089d3997fdSAlex Elder 	return rbdc;
909602adf40SYehuda Sadeh }
910602adf40SYehuda Sadeh 
911602adf40SYehuda Sadeh /*
912602adf40SYehuda Sadeh  * Destroy ceph client
913d23a4b3fSAlex Elder  *
914432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
915602adf40SYehuda Sadeh  */
916602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
917602adf40SYehuda Sadeh {
918602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
919602adf40SYehuda Sadeh 
92037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
921cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
922602adf40SYehuda Sadeh 	list_del(&rbdc->node);
923cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
924602adf40SYehuda Sadeh 
925602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
926602adf40SYehuda Sadeh 	kfree(rbdc);
927602adf40SYehuda Sadeh }
928602adf40SYehuda Sadeh 
929602adf40SYehuda Sadeh /*
930602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
931602adf40SYehuda Sadeh  * it.
932602adf40SYehuda Sadeh  */
9339d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
934602adf40SYehuda Sadeh {
935c53d5893SAlex Elder 	if (rbdc)
9369d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
937602adf40SYehuda Sadeh }
938602adf40SYehuda Sadeh 
939a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
940a30b71b9SAlex Elder {
941a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
942a30b71b9SAlex Elder }
943a30b71b9SAlex Elder 
9448e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9458e94af8eSAlex Elder {
946103a150fSAlex Elder 	size_t size;
947103a150fSAlex Elder 	u32 snap_count;
948103a150fSAlex Elder 
949103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
950103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
951103a150fSAlex Elder 		return false;
952103a150fSAlex Elder 
953db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
954db2388b6SAlex Elder 
955db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
956db2388b6SAlex Elder 		return false;
957db2388b6SAlex Elder 
958db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
959db2388b6SAlex Elder 
960db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
961db2388b6SAlex Elder 		return false;
962db2388b6SAlex Elder 
963103a150fSAlex Elder 	/*
964103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
965103a150fSAlex Elder 	 * that limits the number of snapshots.
966103a150fSAlex Elder 	 */
967103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
968103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
969103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
970103a150fSAlex Elder 		return false;
971103a150fSAlex Elder 
972103a150fSAlex Elder 	/*
973103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
974103a150fSAlex Elder 	 * header must also be representable in a size_t.
975103a150fSAlex Elder 	 */
976103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
977103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
978103a150fSAlex Elder 		return false;
979103a150fSAlex Elder 
980103a150fSAlex Elder 	return true;
9818e94af8eSAlex Elder }
9828e94af8eSAlex Elder 
983602adf40SYehuda Sadeh /*
9845bc3fb17SIlya Dryomov  * returns the size of an object in the image
9855bc3fb17SIlya Dryomov  */
9865bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9875bc3fb17SIlya Dryomov {
9885bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
9895bc3fb17SIlya Dryomov }
9905bc3fb17SIlya Dryomov 
991263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
992263423f8SIlya Dryomov {
993263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
994263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
995263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
996263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
997263423f8SIlya Dryomov 	}
998263423f8SIlya Dryomov 
999263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1000263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1001263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
10027e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
10037e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1004263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1005263423f8SIlya Dryomov }
1006263423f8SIlya Dryomov 
10075bc3fb17SIlya Dryomov /*
1008bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1009bb23e37aSAlex Elder  * on-disk header.
1010602adf40SYehuda Sadeh  */
1011662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10124156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1013602adf40SYehuda Sadeh {
1014662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1015bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1016bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1017bb23e37aSAlex Elder 	char *object_prefix = NULL;
1018bb23e37aSAlex Elder 	char *snap_names = NULL;
1019bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1020ccece235SAlex Elder 	u32 snap_count;
1021bb23e37aSAlex Elder 	int ret = -ENOMEM;
1022621901d6SAlex Elder 	u32 i;
1023602adf40SYehuda Sadeh 
1024bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1025103a150fSAlex Elder 
1026bb23e37aSAlex Elder 	if (first_time) {
1027848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1028848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1029848d796cSIlya Dryomov 					 GFP_KERNEL);
1030bb23e37aSAlex Elder 		if (!object_prefix)
1031602adf40SYehuda Sadeh 			return -ENOMEM;
1032bb23e37aSAlex Elder 	}
103300f1f36fSAlex Elder 
1034bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1035d2bb24e5SAlex Elder 
1036602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1037bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1038bb23e37aSAlex Elder 	if (!snapc)
1039bb23e37aSAlex Elder 		goto out_err;
1040bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1041602adf40SYehuda Sadeh 	if (snap_count) {
1042bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1043f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1044f785cc1dSAlex Elder 
1045bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1046621901d6SAlex Elder 
1047f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1048bb23e37aSAlex Elder 			goto out_2big;
1049bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1050bb23e37aSAlex Elder 		if (!snap_names)
1051602adf40SYehuda Sadeh 			goto out_err;
1052bb23e37aSAlex Elder 
1053bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
105488a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
105588a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
105688a25a5fSMarkus Elfring 					   GFP_KERNEL);
1057bb23e37aSAlex Elder 		if (!snap_sizes)
1058bb23e37aSAlex Elder 			goto out_err;
1059bb23e37aSAlex Elder 
1060f785cc1dSAlex Elder 		/*
1061bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1062bb23e37aSAlex Elder 		 * and size.
1063bb23e37aSAlex Elder 		 *
106499a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1065bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1066f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1067f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1068f785cc1dSAlex Elder 		 */
1069bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1070bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1071bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1072bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1073bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1074bb23e37aSAlex Elder 		}
1075602adf40SYehuda Sadeh 	}
1076849b4260SAlex Elder 
1077bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1078bb23e37aSAlex Elder 
1079bb23e37aSAlex Elder 	if (first_time) {
1080bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1081602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1082263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1083662518b1SAlex Elder 	} else {
1084662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1085662518b1SAlex Elder 		kfree(header->snap_names);
1086662518b1SAlex Elder 		kfree(header->snap_sizes);
1087bb23e37aSAlex Elder 	}
10886a52325fSAlex Elder 
1089bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1090621901d6SAlex Elder 
1091f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1092bb23e37aSAlex Elder 	header->snapc = snapc;
1093bb23e37aSAlex Elder 	header->snap_names = snap_names;
1094bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1095468521c1SAlex Elder 
1096602adf40SYehuda Sadeh 	return 0;
1097bb23e37aSAlex Elder out_2big:
1098bb23e37aSAlex Elder 	ret = -EIO;
10996a52325fSAlex Elder out_err:
1100bb23e37aSAlex Elder 	kfree(snap_sizes);
1101bb23e37aSAlex Elder 	kfree(snap_names);
1102bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1103bb23e37aSAlex Elder 	kfree(object_prefix);
1104ccece235SAlex Elder 
1105bb23e37aSAlex Elder 	return ret;
1106602adf40SYehuda Sadeh }
1107602adf40SYehuda Sadeh 
11089682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11099682fc6dSAlex Elder {
11109682fc6dSAlex Elder 	const char *snap_name;
11119682fc6dSAlex Elder 
11129682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11139682fc6dSAlex Elder 
11149682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11159682fc6dSAlex Elder 
11169682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11179682fc6dSAlex Elder 	while (which--)
11189682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11199682fc6dSAlex Elder 
11209682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11219682fc6dSAlex Elder }
11229682fc6dSAlex Elder 
112330d1cff8SAlex Elder /*
112430d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
112530d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
112630d1cff8SAlex Elder  */
112730d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
112830d1cff8SAlex Elder {
112930d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
113030d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
113130d1cff8SAlex Elder 
113230d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
113330d1cff8SAlex Elder 		return 1;
113430d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
113530d1cff8SAlex Elder }
113630d1cff8SAlex Elder 
113730d1cff8SAlex Elder /*
113830d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
113930d1cff8SAlex Elder  * present.
114030d1cff8SAlex Elder  *
114130d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
114230d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
114330d1cff8SAlex Elder  *
114430d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
114530d1cff8SAlex Elder  * reverse order, highest snapshot id first.
114630d1cff8SAlex Elder  */
11479682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11489682fc6dSAlex Elder {
11499682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
115030d1cff8SAlex Elder 	u64 *found;
11519682fc6dSAlex Elder 
115230d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
115330d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11549682fc6dSAlex Elder 
115530d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11569682fc6dSAlex Elder }
11579682fc6dSAlex Elder 
11582ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11592ad3d716SAlex Elder 					u64 snap_id)
116054cac61fSAlex Elder {
116154cac61fSAlex Elder 	u32 which;
1162da6a6b63SJosh Durgin 	const char *snap_name;
116354cac61fSAlex Elder 
116454cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
116554cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1166da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
116754cac61fSAlex Elder 
1168da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1169da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
117054cac61fSAlex Elder }
117154cac61fSAlex Elder 
11729e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11739e15b77dSAlex Elder {
11749e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11759e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11769e15b77dSAlex Elder 
117754cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
117854cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
117954cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11809e15b77dSAlex Elder 
118154cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11829e15b77dSAlex Elder }
11839e15b77dSAlex Elder 
11842ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11852ad3d716SAlex Elder 				u64 *snap_size)
1186602adf40SYehuda Sadeh {
11872ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11882ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11892ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11902ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11912ad3d716SAlex Elder 		u32 which;
119200f1f36fSAlex Elder 
11932ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11942ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11952ad3d716SAlex Elder 			return -ENOENT;
119600f1f36fSAlex Elder 
11972ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11982ad3d716SAlex Elder 	} else {
11992ad3d716SAlex Elder 		u64 size = 0;
12002ad3d716SAlex Elder 		int ret;
12012ad3d716SAlex Elder 
12022ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
12032ad3d716SAlex Elder 		if (ret)
12042ad3d716SAlex Elder 			return ret;
12052ad3d716SAlex Elder 
12062ad3d716SAlex Elder 		*snap_size = size;
12072ad3d716SAlex Elder 	}
12082ad3d716SAlex Elder 	return 0;
12092ad3d716SAlex Elder }
12102ad3d716SAlex Elder 
12112ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12122ad3d716SAlex Elder 			u64 *snap_features)
12132ad3d716SAlex Elder {
12142ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12152ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12162ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12172ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12182ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12192ad3d716SAlex Elder 	} else {
12202ad3d716SAlex Elder 		u64 features = 0;
12212ad3d716SAlex Elder 		int ret;
12222ad3d716SAlex Elder 
12232ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12242ad3d716SAlex Elder 		if (ret)
12252ad3d716SAlex Elder 			return ret;
12262ad3d716SAlex Elder 
12272ad3d716SAlex Elder 		*snap_features = features;
12282ad3d716SAlex Elder 	}
12292ad3d716SAlex Elder 	return 0;
123000f1f36fSAlex Elder }
1231602adf40SYehuda Sadeh 
1232d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1233602adf40SYehuda Sadeh {
12348f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12352ad3d716SAlex Elder 	u64 size = 0;
12362ad3d716SAlex Elder 	u64 features = 0;
12372ad3d716SAlex Elder 	int ret;
12388b0241f8SAlex Elder 
12392ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12402ad3d716SAlex Elder 	if (ret)
12412ad3d716SAlex Elder 		return ret;
12422ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12432ad3d716SAlex Elder 	if (ret)
12442ad3d716SAlex Elder 		return ret;
12452ad3d716SAlex Elder 
12462ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12472ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12482ad3d716SAlex Elder 
12498b0241f8SAlex Elder 	return 0;
1250602adf40SYehuda Sadeh }
1251602adf40SYehuda Sadeh 
1252d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1253d1cf5788SAlex Elder {
1254d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1255d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1256200a6a8bSAlex Elder }
1257200a6a8bSAlex Elder 
125865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
125965ccfe21SAlex Elder {
12605bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
1261602adf40SYehuda Sadeh 
126265ccfe21SAlex Elder 	return offset & (segment_size - 1);
126365ccfe21SAlex Elder }
126465ccfe21SAlex Elder 
126565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
126665ccfe21SAlex Elder 				u64 offset, u64 length)
126765ccfe21SAlex Elder {
12685bc3fb17SIlya Dryomov 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
126965ccfe21SAlex Elder 
127065ccfe21SAlex Elder 	offset &= segment_size - 1;
127165ccfe21SAlex Elder 
1272aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
127365ccfe21SAlex Elder 	if (offset + length > segment_size)
127465ccfe21SAlex Elder 		length = segment_size - offset;
127565ccfe21SAlex Elder 
127665ccfe21SAlex Elder 	return length;
1277602adf40SYehuda Sadeh }
1278602adf40SYehuda Sadeh 
1279602adf40SYehuda Sadeh /*
1280602adf40SYehuda Sadeh  * bio helpers
1281602adf40SYehuda Sadeh  */
1282602adf40SYehuda Sadeh 
1283602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1284602adf40SYehuda Sadeh {
1285602adf40SYehuda Sadeh 	struct bio *tmp;
1286602adf40SYehuda Sadeh 
1287602adf40SYehuda Sadeh 	while (chain) {
1288602adf40SYehuda Sadeh 		tmp = chain;
1289602adf40SYehuda Sadeh 		chain = chain->bi_next;
1290602adf40SYehuda Sadeh 		bio_put(tmp);
1291602adf40SYehuda Sadeh 	}
1292602adf40SYehuda Sadeh }
1293602adf40SYehuda Sadeh 
1294602adf40SYehuda Sadeh /*
1295602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1296602adf40SYehuda Sadeh  */
1297602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1298602adf40SYehuda Sadeh {
12997988613bSKent Overstreet 	struct bio_vec bv;
13007988613bSKent Overstreet 	struct bvec_iter iter;
1301602adf40SYehuda Sadeh 	unsigned long flags;
1302602adf40SYehuda Sadeh 	void *buf;
1303602adf40SYehuda Sadeh 	int pos = 0;
1304602adf40SYehuda Sadeh 
1305602adf40SYehuda Sadeh 	while (chain) {
13067988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
13077988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1308602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
13097988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1310602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
13117988613bSKent Overstreet 				       bv.bv_len - remainder);
13127988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
131385b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1314602adf40SYehuda Sadeh 			}
13157988613bSKent Overstreet 			pos += bv.bv_len;
1316602adf40SYehuda Sadeh 		}
1317602adf40SYehuda Sadeh 
1318602adf40SYehuda Sadeh 		chain = chain->bi_next;
1319602adf40SYehuda Sadeh 	}
1320602adf40SYehuda Sadeh }
1321602adf40SYehuda Sadeh 
1322602adf40SYehuda Sadeh /*
1323b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1324b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1325b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1326b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1327b9434c5bSAlex Elder  */
1328b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1329b9434c5bSAlex Elder {
1330b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1331b9434c5bSAlex Elder 
1332b9434c5bSAlex Elder 	rbd_assert(end > offset);
1333b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1334b9434c5bSAlex Elder 	while (offset < end) {
1335b9434c5bSAlex Elder 		size_t page_offset;
1336b9434c5bSAlex Elder 		size_t length;
1337b9434c5bSAlex Elder 		unsigned long flags;
1338b9434c5bSAlex Elder 		void *kaddr;
1339b9434c5bSAlex Elder 
1340491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1341491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1342b9434c5bSAlex Elder 		local_irq_save(flags);
1343b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1344b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1345e2156054SAlex Elder 		flush_dcache_page(*page);
1346b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1347b9434c5bSAlex Elder 		local_irq_restore(flags);
1348b9434c5bSAlex Elder 
1349b9434c5bSAlex Elder 		offset += length;
1350b9434c5bSAlex Elder 		page++;
1351b9434c5bSAlex Elder 	}
1352b9434c5bSAlex Elder }
1353b9434c5bSAlex Elder 
1354b9434c5bSAlex Elder /*
1355f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1356f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1357602adf40SYehuda Sadeh  */
1358f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1359f7760dadSAlex Elder 					unsigned int offset,
1360f7760dadSAlex Elder 					unsigned int len,
1361f7760dadSAlex Elder 					gfp_t gfpmask)
1362602adf40SYehuda Sadeh {
1363f7760dadSAlex Elder 	struct bio *bio;
1364602adf40SYehuda Sadeh 
13655341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1366f7760dadSAlex Elder 	if (!bio)
1367f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1368f7760dadSAlex Elder 
13695341a627SKent Overstreet 	bio_advance(bio, offset);
13704f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1371602adf40SYehuda Sadeh 
1372f7760dadSAlex Elder 	return bio;
1373602adf40SYehuda Sadeh }
1374602adf40SYehuda Sadeh 
1375f7760dadSAlex Elder /*
1376f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1377f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1378f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1379f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1380f7760dadSAlex Elder  *
1381f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1382f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1383f7760dadSAlex Elder  * the start of data to be cloned is located.
1384f7760dadSAlex Elder  *
1385f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1386f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1387f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1388f7760dadSAlex Elder  */
1389f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1390f7760dadSAlex Elder 					unsigned int *offset,
1391f7760dadSAlex Elder 					unsigned int len,
1392f7760dadSAlex Elder 					gfp_t gfpmask)
1393f7760dadSAlex Elder {
1394f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1395f7760dadSAlex Elder 	unsigned int off = *offset;
1396f7760dadSAlex Elder 	struct bio *chain = NULL;
1397f7760dadSAlex Elder 	struct bio **end;
1398602adf40SYehuda Sadeh 
1399f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1400602adf40SYehuda Sadeh 
14014f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1402f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1403602adf40SYehuda Sadeh 
1404f7760dadSAlex Elder 	end = &chain;
1405f7760dadSAlex Elder 	while (len) {
1406f7760dadSAlex Elder 		unsigned int bi_size;
1407f7760dadSAlex Elder 		struct bio *bio;
1408f7760dadSAlex Elder 
1409f5400b7aSAlex Elder 		if (!bi) {
1410f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1411f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1412f5400b7aSAlex Elder 		}
14134f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1414f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1415f7760dadSAlex Elder 		if (!bio)
1416f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1417f7760dadSAlex Elder 
1418f7760dadSAlex Elder 		*end = bio;
1419f7760dadSAlex Elder 		end = &bio->bi_next;
1420f7760dadSAlex Elder 
1421f7760dadSAlex Elder 		off += bi_size;
14224f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1423f7760dadSAlex Elder 			bi = bi->bi_next;
1424f7760dadSAlex Elder 			off = 0;
1425f7760dadSAlex Elder 		}
1426f7760dadSAlex Elder 		len -= bi_size;
1427f7760dadSAlex Elder 	}
1428f7760dadSAlex Elder 	*bio_src = bi;
1429f7760dadSAlex Elder 	*offset = off;
1430f7760dadSAlex Elder 
1431f7760dadSAlex Elder 	return chain;
1432f7760dadSAlex Elder out_err:
1433f7760dadSAlex Elder 	bio_chain_put(chain);
1434f7760dadSAlex Elder 
1435602adf40SYehuda Sadeh 	return NULL;
1436602adf40SYehuda Sadeh }
1437602adf40SYehuda Sadeh 
1438926f9b3fSAlex Elder /*
1439926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1440926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1441926f9b3fSAlex Elder  * again.
1442926f9b3fSAlex Elder  */
14436365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
14446365d33aSAlex Elder {
14456365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14466365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14476365d33aSAlex Elder 
144857acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14499584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14506365d33aSAlex Elder 			obj_request);
14516365d33aSAlex Elder 	}
14526365d33aSAlex Elder }
14536365d33aSAlex Elder 
14546365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14556365d33aSAlex Elder {
14566365d33aSAlex Elder 	smp_mb();
14576365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14586365d33aSAlex Elder }
14596365d33aSAlex Elder 
146057acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
146157acbaa7SAlex Elder {
146257acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
146357acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
146457acbaa7SAlex Elder 
146557acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
146657acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14679584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
146857acbaa7SAlex Elder 			obj_request);
146957acbaa7SAlex Elder 	}
147057acbaa7SAlex Elder }
147157acbaa7SAlex Elder 
147257acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
147357acbaa7SAlex Elder {
147457acbaa7SAlex Elder 	smp_mb();
147557acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
147657acbaa7SAlex Elder }
147757acbaa7SAlex Elder 
14785679c59fSAlex Elder /*
14795679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14805679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14815679c59fSAlex Elder  *
14825679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14835679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14845679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14855679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14865679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14875679c59fSAlex Elder  */
14885679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14895679c59fSAlex Elder 				bool exists)
14905679c59fSAlex Elder {
14915679c59fSAlex Elder 	if (exists)
14925679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14935679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14945679c59fSAlex Elder 	smp_mb();
14955679c59fSAlex Elder }
14965679c59fSAlex Elder 
14975679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14985679c59fSAlex Elder {
14995679c59fSAlex Elder 	smp_mb();
15005679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
15015679c59fSAlex Elder }
15025679c59fSAlex Elder 
15035679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
15045679c59fSAlex Elder {
15055679c59fSAlex Elder 	smp_mb();
15065679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
15075679c59fSAlex Elder }
15085679c59fSAlex Elder 
15099638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
15109638556aSIlya Dryomov {
15119638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
15129638556aSIlya Dryomov 
15139638556aSIlya Dryomov 	return obj_request->img_offset <
15149638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
15159638556aSIlya Dryomov }
15169638556aSIlya Dryomov 
1517bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1518bf0d5f50SAlex Elder {
151937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
15202c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1521bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1522bf0d5f50SAlex Elder }
1523bf0d5f50SAlex Elder 
1524bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1525bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1526bf0d5f50SAlex Elder {
1527bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
152837206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
15292c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1530bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1531bf0d5f50SAlex Elder }
1532bf0d5f50SAlex Elder 
15330f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
15340f2d5be7SAlex Elder {
15350f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15362c935bc5SPeter Zijlstra 	     kref_read(&img_request->kref));
15370f2d5be7SAlex Elder 	kref_get(&img_request->kref);
15380f2d5be7SAlex Elder }
15390f2d5be7SAlex Elder 
1540e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1541e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1542bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1543bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1544bf0d5f50SAlex Elder {
1545bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
154637206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15472c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1548e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1549e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1550e93f3152SAlex Elder 	else
1551bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1552bf0d5f50SAlex Elder }
1553bf0d5f50SAlex Elder 
1554bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1555bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1556bf0d5f50SAlex Elder {
155725dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
155825dcf954SAlex Elder 
1559b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1560bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
156125dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15626365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15636365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1564bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
156525dcf954SAlex Elder 	img_request->obj_request_count++;
156625dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
156737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
156837206ee5SAlex Elder 		obj_request->which);
1569bf0d5f50SAlex Elder }
1570bf0d5f50SAlex Elder 
1571bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1572bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1573bf0d5f50SAlex Elder {
1574bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
157525dcf954SAlex Elder 
157637206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
157737206ee5SAlex Elder 		obj_request->which);
1578bf0d5f50SAlex Elder 	list_del(&obj_request->links);
157925dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
158025dcf954SAlex Elder 	img_request->obj_request_count--;
158125dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
158225dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15836365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1584bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1585bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
158625dcf954SAlex Elder 	obj_request->callback = NULL;
1587bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1588bf0d5f50SAlex Elder }
1589bf0d5f50SAlex Elder 
1590bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1591bf0d5f50SAlex Elder {
1592bf0d5f50SAlex Elder 	switch (type) {
15939969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1594bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1595788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1596bf0d5f50SAlex Elder 		return true;
1597bf0d5f50SAlex Elder 	default:
1598bf0d5f50SAlex Elder 		return false;
1599bf0d5f50SAlex Elder 	}
1600bf0d5f50SAlex Elder }
1601bf0d5f50SAlex Elder 
16024a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
16034a17dadcSIlya Dryomov 
1604980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1605bf0d5f50SAlex Elder {
1606980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1607980917fcSIlya Dryomov 
1608a90bb0c1SIlya Dryomov 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1609a90bb0c1SIlya Dryomov 	     obj_request, obj_request->object_no, obj_request->offset,
161067e2b652SIlya Dryomov 	     obj_request->length, osd_req);
16114a17dadcSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
16124a17dadcSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
16134a17dadcSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
16144a17dadcSIlya Dryomov 	}
1615980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1616bf0d5f50SAlex Elder }
1617bf0d5f50SAlex Elder 
1618bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1619bf0d5f50SAlex Elder {
162055f27e09SAlex Elder 
162137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
162255f27e09SAlex Elder 
162355f27e09SAlex Elder 	/*
162455f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
162555f27e09SAlex Elder 	 * count for the image request.  We could instead use
162655f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
162755f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
162855f27e09SAlex Elder 	 */
162955f27e09SAlex Elder 	if (!img_request->result) {
163055f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
163155f27e09SAlex Elder 		u64 xferred = 0;
163255f27e09SAlex Elder 
163355f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
163455f27e09SAlex Elder 			xferred += obj_request->xferred;
163555f27e09SAlex Elder 		img_request->xferred = xferred;
163655f27e09SAlex Elder 	}
163755f27e09SAlex Elder 
1638bf0d5f50SAlex Elder 	if (img_request->callback)
1639bf0d5f50SAlex Elder 		img_request->callback(img_request);
1640bf0d5f50SAlex Elder 	else
1641bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1642bf0d5f50SAlex Elder }
1643bf0d5f50SAlex Elder 
16440c425248SAlex Elder /*
16450c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16460c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16470c425248SAlex Elder  * and currently never change thereafter.
16480c425248SAlex Elder  */
16490c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16500c425248SAlex Elder {
16510c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16520c425248SAlex Elder 	smp_mb();
16530c425248SAlex Elder }
16540c425248SAlex Elder 
16550c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16560c425248SAlex Elder {
16570c425248SAlex Elder 	smp_mb();
16580c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16590c425248SAlex Elder }
16600c425248SAlex Elder 
166190e98c52SGuangliang Zhao /*
166290e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
166390e98c52SGuangliang Zhao  */
166490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
166590e98c52SGuangliang Zhao {
166690e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
166790e98c52SGuangliang Zhao 	smp_mb();
166890e98c52SGuangliang Zhao }
166990e98c52SGuangliang Zhao 
167090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
167190e98c52SGuangliang Zhao {
167290e98c52SGuangliang Zhao 	smp_mb();
167390e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
167490e98c52SGuangliang Zhao }
167590e98c52SGuangliang Zhao 
16769849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16779849e986SAlex Elder {
16789849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16799849e986SAlex Elder 	smp_mb();
16809849e986SAlex Elder }
16819849e986SAlex Elder 
1682e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1683e93f3152SAlex Elder {
1684e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1685e93f3152SAlex Elder 	smp_mb();
1686e93f3152SAlex Elder }
1687e93f3152SAlex Elder 
16889849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16899849e986SAlex Elder {
16909849e986SAlex Elder 	smp_mb();
16919849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16929849e986SAlex Elder }
16939849e986SAlex Elder 
1694d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1695d0b2e944SAlex Elder {
1696d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1697d0b2e944SAlex Elder 	smp_mb();
1698d0b2e944SAlex Elder }
1699d0b2e944SAlex Elder 
1700a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1701a2acd00eSAlex Elder {
1702a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1703a2acd00eSAlex Elder 	smp_mb();
1704a2acd00eSAlex Elder }
1705a2acd00eSAlex Elder 
1706d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1707d0b2e944SAlex Elder {
1708d0b2e944SAlex Elder 	smp_mb();
1709d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1710d0b2e944SAlex Elder }
1711d0b2e944SAlex Elder 
17123b434a2aSJosh Durgin static enum obj_operation_type
17133b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17143b434a2aSJosh Durgin {
17153b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17163b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17173b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17183b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17193b434a2aSJosh Durgin 	else
17203b434a2aSJosh Durgin 		return OBJ_OP_READ;
17213b434a2aSJosh Durgin }
17223b434a2aSJosh Durgin 
17236e2a4505SAlex Elder static void
17246e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17256e2a4505SAlex Elder {
1726b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1727b9434c5bSAlex Elder 	u64 length = obj_request->length;
1728b9434c5bSAlex Elder 
17296e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17306e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1731b9434c5bSAlex Elder 		xferred, length);
17326e2a4505SAlex Elder 	/*
173317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
173417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
173517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
173617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
173717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
173817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17396e2a4505SAlex Elder 	 */
1740b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17416e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1742b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17436e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1744b9434c5bSAlex Elder 		else
1745b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17466e2a4505SAlex Elder 		obj_request->result = 0;
1747b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1748b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1749b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1750b9434c5bSAlex Elder 		else
1751b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17526e2a4505SAlex Elder 	}
175317c1cc1dSJosh Durgin 	obj_request->xferred = length;
17546e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17556e2a4505SAlex Elder }
17566e2a4505SAlex Elder 
1757bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1758bf0d5f50SAlex Elder {
175937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
176037206ee5SAlex Elder 		obj_request->callback);
1761bf0d5f50SAlex Elder 	if (obj_request->callback)
1762bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1763788e2df3SAlex Elder 	else
1764788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1765bf0d5f50SAlex Elder }
1766bf0d5f50SAlex Elder 
17670dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
17680dcc685eSIlya Dryomov {
17690dcc685eSIlya Dryomov 	obj_request->result = err;
17700dcc685eSIlya Dryomov 	obj_request->xferred = 0;
17710dcc685eSIlya Dryomov 	/*
17720dcc685eSIlya Dryomov 	 * kludge - mirror rbd_obj_request_submit() to match a put in
17730dcc685eSIlya Dryomov 	 * rbd_img_obj_callback()
17740dcc685eSIlya Dryomov 	 */
17750dcc685eSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
17760dcc685eSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
17770dcc685eSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
17780dcc685eSIlya Dryomov 	}
17790dcc685eSIlya Dryomov 	obj_request_done_set(obj_request);
17800dcc685eSIlya Dryomov 	rbd_obj_request_complete(obj_request);
17810dcc685eSIlya Dryomov }
17820dcc685eSIlya Dryomov 
1783c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1784bf0d5f50SAlex Elder {
178557acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1786a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
178757acbaa7SAlex Elder 	bool layered = false;
178857acbaa7SAlex Elder 
178957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
179057acbaa7SAlex Elder 		img_request = obj_request->img_request;
179157acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1792a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
179357acbaa7SAlex Elder 	}
17948b3e1a56SAlex Elder 
17958b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17968b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17978b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1798a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1799a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
18008b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
18018b3e1a56SAlex Elder 	else if (img_request)
18026e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
18036e2a4505SAlex Elder 	else
180407741308SAlex Elder 		obj_request_done_set(obj_request);
1805bf0d5f50SAlex Elder }
1806bf0d5f50SAlex Elder 
1807c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1808bf0d5f50SAlex Elder {
18091b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
18101b83bef2SSage Weil 		obj_request->result, obj_request->length);
18111b83bef2SSage Weil 	/*
18128b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
18138b3e1a56SAlex Elder 	 * it to our originally-requested length.
18141b83bef2SSage Weil 	 */
18151b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
181607741308SAlex Elder 	obj_request_done_set(obj_request);
1817bf0d5f50SAlex Elder }
1818bf0d5f50SAlex Elder 
181990e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
182090e98c52SGuangliang Zhao {
182190e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
182290e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
182390e98c52SGuangliang Zhao 	/*
182490e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
182590e98c52SGuangliang Zhao 	 * it to our originally-requested length.
182690e98c52SGuangliang Zhao 	 */
182790e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1828d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1829d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1830d0265de7SJosh Durgin 		obj_request->result = 0;
183190e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
183290e98c52SGuangliang Zhao }
183390e98c52SGuangliang Zhao 
/*
 * Handle completion of an OSD stat.  A plain stat needs no further
 * processing, so the request is only marked done.  More happens when
 * the stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1843fbfab539SAlex Elder 
/*
 * Handle completion of an OSD class method call.  Image-data requests
 * are passed to rbd_osd_copyup_callback(); any other request is
 * simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18532761713dSIlya Dryomov 
185485e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1855bf0d5f50SAlex Elder {
1856bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1857bf0d5f50SAlex Elder 	u16 opcode;
1858bf0d5f50SAlex Elder 
185985e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1860bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
186157acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
186257acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
186357acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
186457acbaa7SAlex Elder 	} else {
186557acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
186657acbaa7SAlex Elder 	}
1867bf0d5f50SAlex Elder 
18681b83bef2SSage Weil 	if (osd_req->r_result < 0)
18691b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1870bf0d5f50SAlex Elder 
1871c47f9371SAlex Elder 	/*
1872c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18737ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18747ad18afaSChristoph Hellwig 	 * length field.
1875c47f9371SAlex Elder 	 */
18767665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1877c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18780ccd5926SIlya Dryomov 
187979528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1880bf0d5f50SAlex Elder 	switch (opcode) {
1881bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1882c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1883bf0d5f50SAlex Elder 		break;
18840ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1885e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1886e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
18870ccd5926SIlya Dryomov 		/* fall through */
1888bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1889e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1890c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1891bf0d5f50SAlex Elder 		break;
1892fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1893c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1894fbfab539SAlex Elder 		break;
189590e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
189690e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
189790e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
189890e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
189990e98c52SGuangliang Zhao 		break;
190036be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
19012761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
19022761713dSIlya Dryomov 		break;
1903bf0d5f50SAlex Elder 	default:
1904a90bb0c1SIlya Dryomov 		rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
1905a90bb0c1SIlya Dryomov 			 obj_request->object_no, opcode);
1906bf0d5f50SAlex Elder 		break;
1907bf0d5f50SAlex Elder 	}
1908bf0d5f50SAlex Elder 
190907741308SAlex Elder 	if (obj_request_done_test(obj_request))
1910bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1911bf0d5f50SAlex Elder }
1912bf0d5f50SAlex Elder 
19139d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1914430c28c3SAlex Elder {
19158c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1916430c28c3SAlex Elder 
19177c84883aSIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
19187c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
19199d4df01fSAlex Elder }
19209d4df01fSAlex Elder 
19219d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19229d4df01fSAlex Elder {
19239d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19249d4df01fSAlex Elder 
1925bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1926bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1927430c28c3SAlex Elder }
1928430c28c3SAlex Elder 
1929bc81207eSIlya Dryomov static struct ceph_osd_request *
1930bc81207eSIlya Dryomov __rbd_osd_req_create(struct rbd_device *rbd_dev,
1931bc81207eSIlya Dryomov 		     struct ceph_snap_context *snapc,
1932bc81207eSIlya Dryomov 		     int num_ops, unsigned int flags,
1933bc81207eSIlya Dryomov 		     struct rbd_obj_request *obj_request)
1934bc81207eSIlya Dryomov {
1935bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1936bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1937a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1938a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1939bc81207eSIlya Dryomov 
1940bc81207eSIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1941bc81207eSIlya Dryomov 	if (!req)
1942bc81207eSIlya Dryomov 		return NULL;
1943bc81207eSIlya Dryomov 
1944bc81207eSIlya Dryomov 	req->r_flags = flags;
1945bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1946bc81207eSIlya Dryomov 	req->r_priv = obj_request;
1947bc81207eSIlya Dryomov 
1948bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1949a90bb0c1SIlya Dryomov 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1950a90bb0c1SIlya Dryomov 			rbd_dev->header.object_prefix, obj_request->object_no))
1951bc81207eSIlya Dryomov 		goto err_req;
1952bc81207eSIlya Dryomov 
1953bc81207eSIlya Dryomov 	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1954bc81207eSIlya Dryomov 		goto err_req;
1955bc81207eSIlya Dryomov 
1956bc81207eSIlya Dryomov 	return req;
1957bc81207eSIlya Dryomov 
1958bc81207eSIlya Dryomov err_req:
1959bc81207eSIlya Dryomov 	ceph_osdc_put_request(req);
1960bc81207eSIlya Dryomov 	return NULL;
1961bc81207eSIlya Dryomov }
1962bc81207eSIlya Dryomov 
19630ccd5926SIlya Dryomov /*
19640ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19650ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19660ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19670ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19680ccd5926SIlya Dryomov  */
1969bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1970bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19716d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1972deb236b3SIlya Dryomov 					unsigned int num_ops,
1973430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1974bf0d5f50SAlex Elder {
1975bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1976bf0d5f50SAlex Elder 
197790e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
197890e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19796365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
198090e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19816d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
198290e98c52SGuangliang Zhao 		} else {
198390e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
198490e98c52SGuangliang Zhao 		}
1985bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1986bf0d5f50SAlex Elder 	}
1987bf0d5f50SAlex Elder 
19886d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1989deb236b3SIlya Dryomov 
1990bc81207eSIlya Dryomov 	return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
1991bc81207eSIlya Dryomov 	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
199254ea0046SIlya Dryomov 	    CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
1993bf0d5f50SAlex Elder }
1994bf0d5f50SAlex Elder 
19950eefd470SAlex Elder /*
1996d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1997d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1998d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1999d3246fb0SJosh Durgin  * or zero op.
20000eefd470SAlex Elder  */
20010eefd470SAlex Elder static struct ceph_osd_request *
20020eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
20030eefd470SAlex Elder {
20040eefd470SAlex Elder 	struct rbd_img_request *img_request;
2005d3246fb0SJosh Durgin 	int num_osd_ops = 3;
20060eefd470SAlex Elder 
20070eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20080eefd470SAlex Elder 	img_request = obj_request->img_request;
20090eefd470SAlex Elder 	rbd_assert(img_request);
2010d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
2011d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
20120eefd470SAlex Elder 
2013d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2014d3246fb0SJosh Durgin 		num_osd_ops = 2;
2015d3246fb0SJosh Durgin 
2016bc81207eSIlya Dryomov 	return __rbd_osd_req_create(img_request->rbd_dev,
2017bc81207eSIlya Dryomov 				    img_request->snapc, num_osd_ops,
201854ea0046SIlya Dryomov 				    CEPH_OSD_FLAG_WRITE, obj_request);
20190eefd470SAlex Elder }
20200eefd470SAlex Elder 
/* Drop a reference on an osd request via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2025bf0d5f50SAlex Elder 
20266c696d85SIlya Dryomov static struct rbd_obj_request *
20276c696d85SIlya Dryomov rbd_obj_request_create(enum obj_request_type type)
2028bf0d5f50SAlex Elder {
2029bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2030bf0d5f50SAlex Elder 
2031bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2032bf0d5f50SAlex Elder 
20335a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
20346c696d85SIlya Dryomov 	if (!obj_request)
2035f907ad55SAlex Elder 		return NULL;
2036f907ad55SAlex Elder 
2037bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2038bf0d5f50SAlex Elder 	obj_request->type = type;
2039bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2040788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2041bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2042bf0d5f50SAlex Elder 
204367e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
2044bf0d5f50SAlex Elder 	return obj_request;
2045bf0d5f50SAlex Elder }
2046bf0d5f50SAlex Elder 
2047bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2048bf0d5f50SAlex Elder {
2049bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2050bf0d5f50SAlex Elder 
2051bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2052bf0d5f50SAlex Elder 
205337206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
205437206ee5SAlex Elder 
2055bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2056bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2057bf0d5f50SAlex Elder 
2058bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2059bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2060bf0d5f50SAlex Elder 
2061bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2062bf0d5f50SAlex Elder 	switch (obj_request->type) {
20639969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20649969ebc5SAlex Elder 		break;		/* Nothing to do */
2065bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2066bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2067bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2068bf0d5f50SAlex Elder 		break;
2069788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
207004dc923cSIlya Dryomov 		/* img_data requests don't own their page array */
207104dc923cSIlya Dryomov 		if (obj_request->pages &&
207204dc923cSIlya Dryomov 		    !obj_request_img_data_test(obj_request))
2073788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2074788e2df3SAlex Elder 						obj_request->page_count);
2075788e2df3SAlex Elder 		break;
2076bf0d5f50SAlex Elder 	}
2077bf0d5f50SAlex Elder 
2078868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2079bf0d5f50SAlex Elder }
2080bf0d5f50SAlex Elder 
2081fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2082fb65d228SAlex Elder 
2083fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2084fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2085fb65d228SAlex Elder {
2086fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2087fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2088fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2089fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2090fb65d228SAlex Elder }
2091fb65d228SAlex Elder 
2092bf0d5f50SAlex Elder /*
2093a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2094a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2095a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2096a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2097a2acd00eSAlex Elder  */
2098a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2099a2acd00eSAlex Elder {
2100a2acd00eSAlex Elder 	int counter;
2101a2acd00eSAlex Elder 
2102a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2103a2acd00eSAlex Elder 		return;
2104a2acd00eSAlex Elder 
2105a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2106a2acd00eSAlex Elder 	if (counter > 0)
2107a2acd00eSAlex Elder 		return;
2108a2acd00eSAlex Elder 
2109a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2110a2acd00eSAlex Elder 
2111a2acd00eSAlex Elder 	if (!counter)
2112a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2113a2acd00eSAlex Elder 	else
21149584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2115a2acd00eSAlex Elder }
2116a2acd00eSAlex Elder 
2117a2acd00eSAlex Elder /*
2118a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2119a2acd00eSAlex Elder  * parent.
2120a2acd00eSAlex Elder  *
2121a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2122a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2123a2acd00eSAlex Elder  * false otherwise.
2124a2acd00eSAlex Elder  */
2125a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2126a2acd00eSAlex Elder {
2127ae43e9d0SIlya Dryomov 	int counter = 0;
2128a2acd00eSAlex Elder 
2129a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2130a2acd00eSAlex Elder 		return false;
2131a2acd00eSAlex Elder 
2132ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2133ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2134a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2135ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2136a2acd00eSAlex Elder 
2137a2acd00eSAlex Elder 	if (counter < 0)
21389584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2139a2acd00eSAlex Elder 
2140ae43e9d0SIlya Dryomov 	return counter > 0;
2141a2acd00eSAlex Elder }
2142a2acd00eSAlex Elder 
2143bf0d5f50SAlex Elder /*
2144bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2145bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2146bf0d5f50SAlex Elder  * (if there is one).
2147bf0d5f50SAlex Elder  */
2148cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2149cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2150bf0d5f50SAlex Elder 					u64 offset, u64 length,
21516d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21524e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2153bf0d5f50SAlex Elder {
2154bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2155bf0d5f50SAlex Elder 
21567a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2157bf0d5f50SAlex Elder 	if (!img_request)
2158bf0d5f50SAlex Elder 		return NULL;
2159bf0d5f50SAlex Elder 
2160bf0d5f50SAlex Elder 	img_request->rq = NULL;
2161bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2162bf0d5f50SAlex Elder 	img_request->offset = offset;
2163bf0d5f50SAlex Elder 	img_request->length = length;
21640c425248SAlex Elder 	img_request->flags = 0;
216590e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
216690e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
216790e98c52SGuangliang Zhao 		img_request->snapc = snapc;
216890e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21690c425248SAlex Elder 		img_request_write_set(img_request);
21704e752f0aSJosh Durgin 		img_request->snapc = snapc;
21710c425248SAlex Elder 	} else {
2172bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21730c425248SAlex Elder 	}
2174a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2175d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2176bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2177bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2178bf0d5f50SAlex Elder 	img_request->callback = NULL;
2179a5a337d4SAlex Elder 	img_request->result = 0;
2180bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2181bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2182bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2183bf0d5f50SAlex Elder 
218437206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
21856d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
218637206ee5SAlex Elder 
2187bf0d5f50SAlex Elder 	return img_request;
2188bf0d5f50SAlex Elder }
2189bf0d5f50SAlex Elder 
2190bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2191bf0d5f50SAlex Elder {
2192bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2193bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2194bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2195bf0d5f50SAlex Elder 
2196bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2197bf0d5f50SAlex Elder 
219837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
219937206ee5SAlex Elder 
2200bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2201bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
220225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2203bf0d5f50SAlex Elder 
2204a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2205a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2206a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2207a2acd00eSAlex Elder 	}
2208a2acd00eSAlex Elder 
2209bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2210bef95455SJosh Durgin 		img_request_discard_test(img_request))
2211812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2212bf0d5f50SAlex Elder 
22131c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2214bf0d5f50SAlex Elder }
2215bf0d5f50SAlex Elder 
2216e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2217e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2218e93f3152SAlex Elder 					u64 img_offset, u64 length)
2219e93f3152SAlex Elder {
2220e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2221e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2222e93f3152SAlex Elder 
2223e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2224e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2225e93f3152SAlex Elder 
22264e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22276d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2228e93f3152SAlex Elder 	if (!parent_request)
2229e93f3152SAlex Elder 		return NULL;
2230e93f3152SAlex Elder 
2231e93f3152SAlex Elder 	img_request_child_set(parent_request);
2232e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2233e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2234e93f3152SAlex Elder 
2235e93f3152SAlex Elder 	return parent_request;
2236e93f3152SAlex Elder }
2237e93f3152SAlex Elder 
2238e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2239e93f3152SAlex Elder {
2240e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2241e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2242e93f3152SAlex Elder 
2243e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2244e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2245e93f3152SAlex Elder 
2246e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2247e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2248e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2249e93f3152SAlex Elder 
2250e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2251e93f3152SAlex Elder }
2252e93f3152SAlex Elder 
22531217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22541217857fSAlex Elder {
22556365d33aSAlex Elder 	struct rbd_img_request *img_request;
22561217857fSAlex Elder 	unsigned int xferred;
22571217857fSAlex Elder 	int result;
22588b3e1a56SAlex Elder 	bool more;
22591217857fSAlex Elder 
22606365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22616365d33aSAlex Elder 	img_request = obj_request->img_request;
22626365d33aSAlex Elder 
22631217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22641217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22651217857fSAlex Elder 	result = obj_request->result;
22661217857fSAlex Elder 	if (result) {
22671217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22686d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22696d2940c8SGuangliang Zhao 
227090e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
227190e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
227290e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
227390e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
227490e98c52SGuangliang Zhao 		else
227590e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22761217857fSAlex Elder 
22779584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22786d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
22796d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
22809584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
22811217857fSAlex Elder 			result, xferred);
22821217857fSAlex Elder 		if (!img_request->result)
22831217857fSAlex Elder 			img_request->result = result;
2284082a75daSIlya Dryomov 		/*
2285082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2286082a75daSIlya Dryomov 		 * bytes in case of error.
2287082a75daSIlya Dryomov 		 */
2288082a75daSIlya Dryomov 		xferred = obj_request->length;
22891217857fSAlex Elder 	}
22901217857fSAlex Elder 
22918b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
22928b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
22938b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
22948b3e1a56SAlex Elder 	} else {
22958b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
22967ad18afaSChristoph Hellwig 
22977ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
22987ad18afaSChristoph Hellwig 		if (!more)
22997ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23008b3e1a56SAlex Elder 	}
23018b3e1a56SAlex Elder 
23028b3e1a56SAlex Elder 	return more;
23031217857fSAlex Elder }
23041217857fSAlex Elder 
23052169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23062169238dSAlex Elder {
23072169238dSAlex Elder 	struct rbd_img_request *img_request;
23082169238dSAlex Elder 	u32 which = obj_request->which;
23092169238dSAlex Elder 	bool more = true;
23102169238dSAlex Elder 
23116365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23122169238dSAlex Elder 	img_request = obj_request->img_request;
23132169238dSAlex Elder 
23142169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23152169238dSAlex Elder 	rbd_assert(img_request != NULL);
23162169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23172169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23182169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23192169238dSAlex Elder 
23202169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23212169238dSAlex Elder 	if (which != img_request->next_completion)
23222169238dSAlex Elder 		goto out;
23232169238dSAlex Elder 
23242169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23252169238dSAlex Elder 		rbd_assert(more);
23262169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23272169238dSAlex Elder 
23282169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23292169238dSAlex Elder 			break;
23301217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23312169238dSAlex Elder 		which++;
23322169238dSAlex Elder 	}
23332169238dSAlex Elder 
23342169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23352169238dSAlex Elder 	img_request->next_completion = which;
23362169238dSAlex Elder out:
23372169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23380f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23392169238dSAlex Elder 
23402169238dSAlex Elder 	if (!more)
23412169238dSAlex Elder 		rbd_img_request_complete(img_request);
23422169238dSAlex Elder }
23432169238dSAlex Elder 
2344f1a4739fSAlex Elder /*
23453b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23463b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23473b434a2aSJosh Durgin  * osd operations already to the object request.
23483b434a2aSJosh Durgin  */
23493b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23503b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23513b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23523b434a2aSJosh Durgin 				unsigned int num_ops)
23533b434a2aSJosh Durgin {
23543b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23553b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23563b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23573b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23583b434a2aSJosh Durgin 	u64 length = obj_request->length;
23593b434a2aSJosh Durgin 	u64 img_end;
23603b434a2aSJosh Durgin 	u16 opcode;
23613b434a2aSJosh Durgin 
23623b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2363d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2364d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2365d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23663b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23673b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23683b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23693b434a2aSJosh Durgin 		} else {
23703b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23713b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
23723b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
23733b434a2aSJosh Durgin 
23743b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
23753b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
23763b434a2aSJosh Durgin 			else
23773b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
23783b434a2aSJosh Durgin 		}
23793b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2380e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2381e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2382e30b7577SIlya Dryomov 		else
23833b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
23843b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
23853b434a2aSJosh Durgin 					object_size, object_size);
23863b434a2aSJosh Durgin 		num_ops++;
23873b434a2aSJosh Durgin 	} else {
23883b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
23893b434a2aSJosh Durgin 	}
23903b434a2aSJosh Durgin 
23917e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2392144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
23937e868b6eSIlya Dryomov 	else
23947e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
23957e868b6eSIlya Dryomov 				       offset, length, 0, 0);
23967e868b6eSIlya Dryomov 
23973b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
23983b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
23993b434a2aSJosh Durgin 					obj_request->bio_list, length);
24003b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24013b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24023b434a2aSJosh Durgin 					obj_request->pages, length,
24033b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24043b434a2aSJosh Durgin 
24053b434a2aSJosh Durgin 	/* Discards are also writes */
24063b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24073b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24083b434a2aSJosh Durgin 	else
24093b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24103b434a2aSJosh Durgin }
24113b434a2aSJosh Durgin 
24123b434a2aSJosh Durgin /*
2413f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2414f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2415f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2416f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2417f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2418f1a4739fSAlex Elder  * all data described by the image request.
2419f1a4739fSAlex Elder  */
2420f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2421f1a4739fSAlex Elder 					enum obj_request_type type,
2422f1a4739fSAlex Elder 					void *data_desc)
2423bf0d5f50SAlex Elder {
2424bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2425bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2426bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2427a158073cSJingoo Han 	struct bio *bio_list = NULL;
2428f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2429a158073cSJingoo Han 	struct page **pages = NULL;
24306d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24317da22d29SAlex Elder 	u64 img_offset;
2432bf0d5f50SAlex Elder 	u64 resid;
2433bf0d5f50SAlex Elder 
2434f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2435f1a4739fSAlex Elder 		(int)type, data_desc);
243637206ee5SAlex Elder 
24377da22d29SAlex Elder 	img_offset = img_request->offset;
2438bf0d5f50SAlex Elder 	resid = img_request->length;
24394dda41d3SAlex Elder 	rbd_assert(resid > 0);
24403b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2441f1a4739fSAlex Elder 
2442f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2443f1a4739fSAlex Elder 		bio_list = data_desc;
24444f024f37SKent Overstreet 		rbd_assert(img_offset ==
24454f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
244690e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2447f1a4739fSAlex Elder 		pages = data_desc;
2448f1a4739fSAlex Elder 	}
2449f1a4739fSAlex Elder 
2450bf0d5f50SAlex Elder 	while (resid) {
24512fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2452a90bb0c1SIlya Dryomov 		u64 object_no = img_offset >> rbd_dev->header.obj_order;
245367e2b652SIlya Dryomov 		u64 offset = rbd_segment_offset(rbd_dev, img_offset);
245467e2b652SIlya Dryomov 		u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
2455bf0d5f50SAlex Elder 
24566c696d85SIlya Dryomov 		obj_request = rbd_obj_request_create(type);
2457bf0d5f50SAlex Elder 		if (!obj_request)
2458bf0d5f50SAlex Elder 			goto out_unwind;
245962054da6SIlya Dryomov 
2460a90bb0c1SIlya Dryomov 		obj_request->object_no = object_no;
246167e2b652SIlya Dryomov 		obj_request->offset = offset;
246267e2b652SIlya Dryomov 		obj_request->length = length;
246367e2b652SIlya Dryomov 
246403507db6SJosh Durgin 		/*
246503507db6SJosh Durgin 		 * set obj_request->img_request before creating the
246603507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
246703507db6SJosh Durgin 		 */
246803507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2469bf0d5f50SAlex Elder 
2470f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2471f1a4739fSAlex Elder 			unsigned int clone_size;
2472f1a4739fSAlex Elder 
2473bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2474bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2475f1a4739fSAlex Elder 			obj_request->bio_list =
2476f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2477f1a4739fSAlex Elder 								&bio_offset,
2478f1a4739fSAlex Elder 								clone_size,
24792224d879SDavid Disseldorp 								GFP_NOIO);
2480bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
248162054da6SIlya Dryomov 				goto out_unwind;
248290e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2483f1a4739fSAlex Elder 			unsigned int page_count;
2484f1a4739fSAlex Elder 
2485f1a4739fSAlex Elder 			obj_request->pages = pages;
2486f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2487f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2488f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2489f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2490f1a4739fSAlex Elder 			pages += page_count;
2491f1a4739fSAlex Elder 		}
2492bf0d5f50SAlex Elder 
24936d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
24946d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
24952fa12320SAlex Elder 					obj_request);
24962fa12320SAlex Elder 		if (!osd_req)
249762054da6SIlya Dryomov 			goto out_unwind;
24983b434a2aSJosh Durgin 
24992fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25002169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25017da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2502bf0d5f50SAlex Elder 
25033b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25043b434a2aSJosh Durgin 
25057da22d29SAlex Elder 		img_offset += length;
2506bf0d5f50SAlex Elder 		resid -= length;
2507bf0d5f50SAlex Elder 	}
2508bf0d5f50SAlex Elder 
2509bf0d5f50SAlex Elder 	return 0;
2510bf0d5f50SAlex Elder 
2511bf0d5f50SAlex Elder out_unwind:
2512bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
251342dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2514bf0d5f50SAlex Elder 
2515bf0d5f50SAlex Elder 	return -ENOMEM;
2516bf0d5f50SAlex Elder }
2517bf0d5f50SAlex Elder 
25183d7efd18SAlex Elder static void
25192761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25200eefd470SAlex Elder {
25210eefd470SAlex Elder 	struct rbd_img_request *img_request;
25220eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2523ebda6408SAlex Elder 	struct page **pages;
25240eefd470SAlex Elder 	u32 page_count;
25250eefd470SAlex Elder 
25262761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25272761713dSIlya Dryomov 
2528d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2529d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25300eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25310eefd470SAlex Elder 	img_request = obj_request->img_request;
25320eefd470SAlex Elder 	rbd_assert(img_request);
25330eefd470SAlex Elder 
25340eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25350eefd470SAlex Elder 	rbd_assert(rbd_dev);
25360eefd470SAlex Elder 
2537ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2538ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25390eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2540ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2541ebda6408SAlex Elder 	rbd_assert(page_count);
2542ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2543ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25440eefd470SAlex Elder 
25450eefd470SAlex Elder 	/*
25460eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25470eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25480eefd470SAlex Elder 	 * successful short write, so if the request was successful
25490eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25500eefd470SAlex Elder 	 */
25510eefd470SAlex Elder 	if (!obj_request->result)
25520eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25530eefd470SAlex Elder 
25542761713dSIlya Dryomov 	obj_request_done_set(obj_request);
25550eefd470SAlex Elder }
25560eefd470SAlex Elder 
25570eefd470SAlex Elder static void
25583d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25593d7efd18SAlex Elder {
25603d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25610eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25620eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25633d7efd18SAlex Elder 	struct page **pages;
2564d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2565ebda6408SAlex Elder 	u32 page_count;
2566bbea1c1aSAlex Elder 	int img_result;
2567ebda6408SAlex Elder 	u64 parent_length;
25683d7efd18SAlex Elder 
25693d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25703d7efd18SAlex Elder 
25713d7efd18SAlex Elder 	/* First get what we need from the image request */
25723d7efd18SAlex Elder 
25733d7efd18SAlex Elder 	pages = img_request->copyup_pages;
25743d7efd18SAlex Elder 	rbd_assert(pages != NULL);
25753d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2576ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2577ebda6408SAlex Elder 	rbd_assert(page_count);
2578ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
25793d7efd18SAlex Elder 
25803d7efd18SAlex Elder 	orig_request = img_request->obj_request;
25813d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2582b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2583bbea1c1aSAlex Elder 	img_result = img_request->result;
2584ebda6408SAlex Elder 	parent_length = img_request->length;
2585fa355112SIlya Dryomov 	rbd_assert(img_result || parent_length == img_request->xferred);
25863d7efd18SAlex Elder 	rbd_img_request_put(img_request);
25873d7efd18SAlex Elder 
258891c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
258991c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
25903d7efd18SAlex Elder 	rbd_assert(rbd_dev);
25913d7efd18SAlex Elder 
2592bbea1c1aSAlex Elder 	/*
2593bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2594bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2595bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2596bbea1c1aSAlex Elder 	 */
2597bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2598bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2599980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2600bbea1c1aSAlex Elder 		return;
2601bbea1c1aSAlex Elder 	}
2602bbea1c1aSAlex Elder 
2603bbea1c1aSAlex Elder 	if (img_result)
26040eefd470SAlex Elder 		goto out_err;
26053d7efd18SAlex Elder 
26068785b1d4SAlex Elder 	/*
26078785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26080ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26098785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26108785b1d4SAlex Elder 	 * original request, and release the old one.
26118785b1d4SAlex Elder 	 */
2612bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26130eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26140eefd470SAlex Elder 	if (!osd_req)
26150eefd470SAlex Elder 		goto out_err;
26168785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26170eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26180eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2619ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26203d7efd18SAlex Elder 
26210eefd470SAlex Elder 	/* Initialize the copyup op */
26220eefd470SAlex Elder 
26230eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2624ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26250eefd470SAlex Elder 						false, false);
26260eefd470SAlex Elder 
2627d3246fb0SJosh Durgin 	/* Add the other op(s) */
26280ccd5926SIlya Dryomov 
2629d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2630d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26310eefd470SAlex Elder 
26320eefd470SAlex Elder 	/* All set, send it off. */
26330eefd470SAlex Elder 
2634980917fcSIlya Dryomov 	rbd_obj_request_submit(orig_request);
26350eefd470SAlex Elder 	return;
26360eefd470SAlex Elder 
26370eefd470SAlex Elder out_err:
2638fa355112SIlya Dryomov 	ceph_release_page_vector(pages, page_count);
26390dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, img_result);
26403d7efd18SAlex Elder }
26413d7efd18SAlex Elder 
26423d7efd18SAlex Elder /*
26433d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26443d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26453d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26463d7efd18SAlex Elder  * object request from the image request does not exist.
26473d7efd18SAlex Elder  *
26483d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26493d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26503d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26513d7efd18SAlex Elder  * the original object request for the copyup operation.
26523d7efd18SAlex Elder  *
2653c2e82414SIlya Dryomov  * If an error occurs, it is recorded as the result of the original
2654c2e82414SIlya Dryomov  * object request in rbd_img_obj_exists_callback().
26553d7efd18SAlex Elder  */
26563d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
26573d7efd18SAlex Elder {
2658058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
26593d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
26603d7efd18SAlex Elder 	u64 img_offset;
26613d7efd18SAlex Elder 	u64 length;
26623d7efd18SAlex Elder 	struct page **pages = NULL;
26633d7efd18SAlex Elder 	u32 page_count;
26643d7efd18SAlex Elder 	int result;
26653d7efd18SAlex Elder 
26663d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
26673d7efd18SAlex Elder 
26683d7efd18SAlex Elder 	/*
26693d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
26703d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
26713d7efd18SAlex Elder 	 */
26723d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
26735bc3fb17SIlya Dryomov 	length = rbd_obj_bytes(&rbd_dev->header);
26743d7efd18SAlex Elder 
26753d7efd18SAlex Elder 	/*
2676a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2677a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2678a9e8ba2cSAlex Elder 	 * necessary.
2679a9e8ba2cSAlex Elder 	 */
2680a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2681a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2682a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2683a9e8ba2cSAlex Elder 	}
2684a9e8ba2cSAlex Elder 
2685a9e8ba2cSAlex Elder 	/*
26863d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
26873d7efd18SAlex Elder 	 * from the parent.
26883d7efd18SAlex Elder 	 */
26893d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
26903d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
26913d7efd18SAlex Elder 	if (IS_ERR(pages)) {
26923d7efd18SAlex Elder 		result = PTR_ERR(pages);
26933d7efd18SAlex Elder 		pages = NULL;
26943d7efd18SAlex Elder 		goto out_err;
26953d7efd18SAlex Elder 	}
26963d7efd18SAlex Elder 
26973d7efd18SAlex Elder 	result = -ENOMEM;
2698e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2699e93f3152SAlex Elder 						img_offset, length);
27003d7efd18SAlex Elder 	if (!parent_request)
27013d7efd18SAlex Elder 		goto out_err;
27023d7efd18SAlex Elder 
27033d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27043d7efd18SAlex Elder 	if (result)
27053d7efd18SAlex Elder 		goto out_err;
2706058aa991SIlya Dryomov 
27073d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2708ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27093d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
2710058aa991SIlya Dryomov 
27113d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27123d7efd18SAlex Elder 	if (!result)
27133d7efd18SAlex Elder 		return 0;
27143d7efd18SAlex Elder 
27153d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2716ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27173d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27183d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27193d7efd18SAlex Elder out_err:
27203d7efd18SAlex Elder 	if (pages)
27213d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27223d7efd18SAlex Elder 	if (parent_request)
27233d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27243d7efd18SAlex Elder 	return result;
27253d7efd18SAlex Elder }
27263d7efd18SAlex Elder 
2727c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2728c5b5ef6cSAlex Elder {
2729c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2730638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2731c5b5ef6cSAlex Elder 	int result;
2732c5b5ef6cSAlex Elder 
2733c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2734c5b5ef6cSAlex Elder 
2735c5b5ef6cSAlex Elder 	/*
2736c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2737c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2738c5b5ef6cSAlex Elder 	 * we're done with the request.
2739c5b5ef6cSAlex Elder 	 */
2740c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2741c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2742912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2743c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2744c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2745c5b5ef6cSAlex Elder 
2746c5b5ef6cSAlex Elder 	result = obj_request->result;
2747c5b5ef6cSAlex Elder 	obj_request->result = 0;
2748c5b5ef6cSAlex Elder 
2749c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2750c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2751c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2752c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2753c5b5ef6cSAlex Elder 
2754638f5abeSAlex Elder 	/*
2755638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2756980917fcSIlya Dryomov 	 * image has been flattened) we need to re-submit the
2757980917fcSIlya Dryomov 	 * original request.
2758638f5abeSAlex Elder 	 */
2759638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2760638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2761980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2762638f5abeSAlex Elder 		return;
2763638f5abeSAlex Elder 	}
2764c5b5ef6cSAlex Elder 
2765c5b5ef6cSAlex Elder 	/*
2766c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2767c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2768c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2769c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2770c5b5ef6cSAlex Elder 	 */
2771c5b5ef6cSAlex Elder 	if (!result) {
2772c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2773c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2774c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2775c2e82414SIlya Dryomov 	} else {
2776c2e82414SIlya Dryomov 		goto fail_orig_request;
2777c5b5ef6cSAlex Elder 	}
2778c5b5ef6cSAlex Elder 
2779c5b5ef6cSAlex Elder 	/*
2780c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2781c5b5ef6cSAlex Elder 	 * whether the target object exists.
2782c5b5ef6cSAlex Elder 	 */
2783c2e82414SIlya Dryomov 	result = rbd_img_obj_request_submit(orig_request);
2784c2e82414SIlya Dryomov 	if (result)
2785c2e82414SIlya Dryomov 		goto fail_orig_request;
2786c2e82414SIlya Dryomov 
2787c2e82414SIlya Dryomov 	return;
2788c2e82414SIlya Dryomov 
2789c2e82414SIlya Dryomov fail_orig_request:
27900dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, result);
2791c5b5ef6cSAlex Elder }
2792c5b5ef6cSAlex Elder 
2793c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2794c5b5ef6cSAlex Elder {
2795058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
2796c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2797710214e3SIlya Dryomov 	struct page **pages;
2798c5b5ef6cSAlex Elder 	u32 page_count;
2799c5b5ef6cSAlex Elder 	size_t size;
2800c5b5ef6cSAlex Elder 	int ret;
2801c5b5ef6cSAlex Elder 
28026c696d85SIlya Dryomov 	stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
2803710214e3SIlya Dryomov 	if (!stat_request)
2804710214e3SIlya Dryomov 		return -ENOMEM;
2805710214e3SIlya Dryomov 
2806a90bb0c1SIlya Dryomov 	stat_request->object_no = obj_request->object_no;
2807a90bb0c1SIlya Dryomov 
2808710214e3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2809710214e3SIlya Dryomov 						   stat_request);
2810710214e3SIlya Dryomov 	if (!stat_request->osd_req) {
2811710214e3SIlya Dryomov 		ret = -ENOMEM;
2812710214e3SIlya Dryomov 		goto fail_stat_request;
2813710214e3SIlya Dryomov 	}
2814710214e3SIlya Dryomov 
2815c5b5ef6cSAlex Elder 	/*
2816c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2817c5b5ef6cSAlex Elder 	 *     le64 length;
2818c5b5ef6cSAlex Elder 	 *     struct {
2819c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2820c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2821c5b5ef6cSAlex Elder 	 *     } mtime;
2822c5b5ef6cSAlex Elder 	 */
2823c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2824c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2825c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2826710214e3SIlya Dryomov 	if (IS_ERR(pages)) {
2827710214e3SIlya Dryomov 		ret = PTR_ERR(pages);
2828710214e3SIlya Dryomov 		goto fail_stat_request;
2829710214e3SIlya Dryomov 	}
2830c5b5ef6cSAlex Elder 
2831710214e3SIlya Dryomov 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2832710214e3SIlya Dryomov 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2833710214e3SIlya Dryomov 				     false, false);
2834c5b5ef6cSAlex Elder 
2835c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2836c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2837c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2838c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2839c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2840c5b5ef6cSAlex Elder 
2841980917fcSIlya Dryomov 	rbd_obj_request_submit(stat_request);
2842980917fcSIlya Dryomov 	return 0;
2843c5b5ef6cSAlex Elder 
2844710214e3SIlya Dryomov fail_stat_request:
2845710214e3SIlya Dryomov 	rbd_obj_request_put(stat_request);
2846c5b5ef6cSAlex Elder 	return ret;
2847c5b5ef6cSAlex Elder }
2848c5b5ef6cSAlex Elder 
284970d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2850b454e36dSAlex Elder {
2851058aa991SIlya Dryomov 	struct rbd_img_request *img_request = obj_request->img_request;
2852058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2853b454e36dSAlex Elder 
285470d045f6SIlya Dryomov 	/* Reads */
28551c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
28561c220881SJosh Durgin 	    !img_request_discard_test(img_request))
285770d045f6SIlya Dryomov 		return true;
2858b454e36dSAlex Elder 
285970d045f6SIlya Dryomov 	/* Non-layered writes */
286070d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
286170d045f6SIlya Dryomov 		return true;
286270d045f6SIlya Dryomov 
286370d045f6SIlya Dryomov 	/*
286470d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
286570d045f6SIlya Dryomov 	 * share any data with the parent.
286670d045f6SIlya Dryomov 	 */
286770d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
286870d045f6SIlya Dryomov 		return true;
286970d045f6SIlya Dryomov 
287070d045f6SIlya Dryomov 	/*
2871c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2872c622d226SGuangliang Zhao 	 * parent data there is anyway.
2873c622d226SGuangliang Zhao 	 */
2874c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2875c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2876c622d226SGuangliang Zhao 		return true;
2877c622d226SGuangliang Zhao 
2878c622d226SGuangliang Zhao 	/*
287970d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
288070d045f6SIlya Dryomov 	 * already been copied.
288170d045f6SIlya Dryomov 	 */
288270d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
288370d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
288470d045f6SIlya Dryomov 		return true;
288570d045f6SIlya Dryomov 
288670d045f6SIlya Dryomov 	return false;
288770d045f6SIlya Dryomov }
288870d045f6SIlya Dryomov 
288970d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
289070d045f6SIlya Dryomov {
2891058aa991SIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
2892058aa991SIlya Dryomov 	rbd_assert(obj_request_type_valid(obj_request->type));
2893058aa991SIlya Dryomov 	rbd_assert(obj_request->img_request);
2894058aa991SIlya Dryomov 
289570d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2896980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2897980917fcSIlya Dryomov 		return 0;
2898b454e36dSAlex Elder 	}
2899b454e36dSAlex Elder 
2900b454e36dSAlex Elder 	/*
29013d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29023d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29033d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29043d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2905b454e36dSAlex Elder 	 */
290670d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29073d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29083d7efd18SAlex Elder 
29093d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2910b454e36dSAlex Elder 
2911b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2912b454e36dSAlex Elder }
2913b454e36dSAlex Elder 
2914bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2915bf0d5f50SAlex Elder {
2916bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
291746faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2918663ae2ccSIlya Dryomov 	int ret = 0;
2919bf0d5f50SAlex Elder 
292037206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2921bf0d5f50SAlex Elder 
2922663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2923663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2924b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2925bf0d5f50SAlex Elder 		if (ret)
2926663ae2ccSIlya Dryomov 			goto out_put_ireq;
2927bf0d5f50SAlex Elder 	}
2928bf0d5f50SAlex Elder 
2929663ae2ccSIlya Dryomov out_put_ireq:
2930663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2931663ae2ccSIlya Dryomov 	return ret;
2932bf0d5f50SAlex Elder }
2933bf0d5f50SAlex Elder 
29348b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29358b3e1a56SAlex Elder {
29368b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2937a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2938a9e8ba2cSAlex Elder 	u64 obj_end;
293902c74fbaSAlex Elder 	u64 img_xferred;
294002c74fbaSAlex Elder 	int img_result;
29418b3e1a56SAlex Elder 
29428b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
29438b3e1a56SAlex Elder 
294402c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
294502c74fbaSAlex Elder 
29468b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
294702c74fbaSAlex Elder 	img_xferred = img_request->xferred;
294802c74fbaSAlex Elder 	img_result = img_request->result;
294902c74fbaSAlex Elder 	rbd_img_request_put(img_request);
295002c74fbaSAlex Elder 
295102c74fbaSAlex Elder 	/*
295202c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
295302c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
295402c74fbaSAlex Elder 	 * original request.
295502c74fbaSAlex Elder 	 */
2956a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2957a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
295802c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
295902c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
2960980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
296102c74fbaSAlex Elder 		return;
296202c74fbaSAlex Elder 	}
296302c74fbaSAlex Elder 
296402c74fbaSAlex Elder 	obj_request->result = img_result;
2965a9e8ba2cSAlex Elder 	if (obj_request->result)
2966a9e8ba2cSAlex Elder 		goto out;
2967a9e8ba2cSAlex Elder 
2968a9e8ba2cSAlex Elder 	/*
2969a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2970a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2971a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2972a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2973a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2974a9e8ba2cSAlex Elder 	 */
2975a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2976a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2977a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2978a9e8ba2cSAlex Elder 		u64 xferred = 0;
2979a9e8ba2cSAlex Elder 
2980a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2981a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2982a9e8ba2cSAlex Elder 					obj_request->img_offset;
2983a9e8ba2cSAlex Elder 
298402c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2985a9e8ba2cSAlex Elder 	} else {
298602c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2987a9e8ba2cSAlex Elder 	}
2988a9e8ba2cSAlex Elder out:
29898b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
29908b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
29918b3e1a56SAlex Elder }
29928b3e1a56SAlex Elder 
29938b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
29948b3e1a56SAlex Elder {
29958b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
29968b3e1a56SAlex Elder 	int result;
29978b3e1a56SAlex Elder 
29988b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
29998b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30008b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30015b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30028b3e1a56SAlex Elder 
30038b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3004e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30058b3e1a56SAlex Elder 						obj_request->img_offset,
3006e93f3152SAlex Elder 						obj_request->length);
30078b3e1a56SAlex Elder 	result = -ENOMEM;
30088b3e1a56SAlex Elder 	if (!img_request)
30098b3e1a56SAlex Elder 		goto out_err;
30108b3e1a56SAlex Elder 
30115b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3012f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3013f1a4739fSAlex Elder 						obj_request->bio_list);
30145b2ab72dSAlex Elder 	else
30155b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30165b2ab72dSAlex Elder 						obj_request->pages);
30178b3e1a56SAlex Elder 	if (result)
30188b3e1a56SAlex Elder 		goto out_err;
30198b3e1a56SAlex Elder 
30208b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30218b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30228b3e1a56SAlex Elder 	if (result)
30238b3e1a56SAlex Elder 		goto out_err;
30248b3e1a56SAlex Elder 
30258b3e1a56SAlex Elder 	return;
30268b3e1a56SAlex Elder out_err:
30278b3e1a56SAlex Elder 	if (img_request)
30288b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30298b3e1a56SAlex Elder 	obj_request->result = result;
30308b3e1a56SAlex Elder 	obj_request->xferred = 0;
30318b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30328b3e1a56SAlex Elder }
30338b3e1a56SAlex Elder 
3034ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3035ed95b21aSIlya Dryomov 
3036ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3037ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3038ed95b21aSIlya Dryomov {
3039ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3040ed95b21aSIlya Dryomov }
3041ed95b21aSIlya Dryomov 
3042ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3043ed95b21aSIlya Dryomov {
3044ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3045ed95b21aSIlya Dryomov 
3046ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3047ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3048ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3049ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3050ed95b21aSIlya Dryomov 	return cid;
3051ed95b21aSIlya Dryomov }
3052ed95b21aSIlya Dryomov 
3053ed95b21aSIlya Dryomov /*
3054ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3055ed95b21aSIlya Dryomov  */
3056ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3057ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3058ed95b21aSIlya Dryomov {
3059ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3060ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3061ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3062ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3063ed95b21aSIlya Dryomov }
3064ed95b21aSIlya Dryomov 
3065ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3066ed95b21aSIlya Dryomov {
3067ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3068ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3069ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3070ed95b21aSIlya Dryomov }
3071ed95b21aSIlya Dryomov 
3072ed95b21aSIlya Dryomov /*
3073ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3074ed95b21aSIlya Dryomov  */
3075ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3076ed95b21aSIlya Dryomov {
3077ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3078ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3079ed95b21aSIlya Dryomov 	char cookie[32];
3080ed95b21aSIlya Dryomov 	int ret;
3081ed95b21aSIlya Dryomov 
3082ed95b21aSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev));
3083ed95b21aSIlya Dryomov 
3084ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3085ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3086ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3087ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3088ed95b21aSIlya Dryomov 	if (ret)
3089ed95b21aSIlya Dryomov 		return ret;
3090ed95b21aSIlya Dryomov 
3091ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3092ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &cid);
3093ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3094ed95b21aSIlya Dryomov 	return 0;
3095ed95b21aSIlya Dryomov }
3096ed95b21aSIlya Dryomov 
3097ed95b21aSIlya Dryomov /*
3098ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3099ed95b21aSIlya Dryomov  */
3100ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev)
3101ed95b21aSIlya Dryomov {
3102ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3103ed95b21aSIlya Dryomov 	char cookie[32];
3104ed95b21aSIlya Dryomov 	int ret;
3105ed95b21aSIlya Dryomov 
3106ed95b21aSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3107ed95b21aSIlya Dryomov 
3108ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3109ed95b21aSIlya Dryomov 
3110ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3111ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3112ed95b21aSIlya Dryomov 			      RBD_LOCK_NAME, cookie);
3113ed95b21aSIlya Dryomov 	if (ret && ret != -ENOENT) {
3114ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3115ed95b21aSIlya Dryomov 		return ret;
3116ed95b21aSIlya Dryomov 	}
3117ed95b21aSIlya Dryomov 
3118ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3119ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3120ed95b21aSIlya Dryomov 	return 0;
3121ed95b21aSIlya Dryomov }
3122ed95b21aSIlya Dryomov 
3123ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3124ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3125ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3126ed95b21aSIlya Dryomov 				size_t *preply_len)
3127ed95b21aSIlya Dryomov {
3128ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3129ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3130ed95b21aSIlya Dryomov 	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3131ed95b21aSIlya Dryomov 	char buf[buf_size];
3132ed95b21aSIlya Dryomov 	void *p = buf;
3133ed95b21aSIlya Dryomov 
3134ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3135ed95b21aSIlya Dryomov 
3136ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3137ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3138ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3139ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3140ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3141ed95b21aSIlya Dryomov 
3142ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3143ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3144ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3145ed95b21aSIlya Dryomov }
3146ed95b21aSIlya Dryomov 
3147ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3148ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3149ed95b21aSIlya Dryomov {
3150ed95b21aSIlya Dryomov 	struct page **reply_pages;
3151ed95b21aSIlya Dryomov 	size_t reply_len;
3152ed95b21aSIlya Dryomov 
3153ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3154ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3155ed95b21aSIlya Dryomov }
3156ed95b21aSIlya Dryomov 
3157ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3158ed95b21aSIlya Dryomov {
3159ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3160ed95b21aSIlya Dryomov 						  acquired_lock_work);
3161ed95b21aSIlya Dryomov 
3162ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3163ed95b21aSIlya Dryomov }
3164ed95b21aSIlya Dryomov 
3165ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3166ed95b21aSIlya Dryomov {
3167ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3168ed95b21aSIlya Dryomov 						  released_lock_work);
3169ed95b21aSIlya Dryomov 
3170ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3171ed95b21aSIlya Dryomov }
3172ed95b21aSIlya Dryomov 
3173ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3174ed95b21aSIlya Dryomov {
3175ed95b21aSIlya Dryomov 	struct page **reply_pages;
3176ed95b21aSIlya Dryomov 	size_t reply_len;
3177ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3178ed95b21aSIlya Dryomov 	int ret;
3179ed95b21aSIlya Dryomov 
3180ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3181ed95b21aSIlya Dryomov 
3182ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3183ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3184ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3185ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3186ed95b21aSIlya Dryomov 		goto out;
3187ed95b21aSIlya Dryomov 	}
3188ed95b21aSIlya Dryomov 
3189ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3190ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3191ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3192ed95b21aSIlya Dryomov 		u32 n;
3193ed95b21aSIlya Dryomov 
3194ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3195ed95b21aSIlya Dryomov 		while (n--) {
3196ed95b21aSIlya Dryomov 			u8 struct_v;
3197ed95b21aSIlya Dryomov 			u32 len;
3198ed95b21aSIlya Dryomov 
3199ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3200ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3201ed95b21aSIlya Dryomov 
3202ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3203ed95b21aSIlya Dryomov 			if (!len)
3204ed95b21aSIlya Dryomov 				continue;
3205ed95b21aSIlya Dryomov 
3206ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3207ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3208ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3209ed95b21aSIlya Dryomov 				ret = -EIO;
3210ed95b21aSIlya Dryomov 				goto out;
3211ed95b21aSIlya Dryomov 			}
3212ed95b21aSIlya Dryomov 
3213ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3214ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3215ed95b21aSIlya Dryomov 						  &struct_v, &len);
3216ed95b21aSIlya Dryomov 			if (ret) {
3217ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3218ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3219ed95b21aSIlya Dryomov 					 ret);
3220ed95b21aSIlya Dryomov 				goto e_inval;
3221ed95b21aSIlya Dryomov 			}
3222ed95b21aSIlya Dryomov 
3223ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3224ed95b21aSIlya Dryomov 		}
3225ed95b21aSIlya Dryomov 	}
3226ed95b21aSIlya Dryomov 
3227ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3228ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3229ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3230ed95b21aSIlya Dryomov 	}
3231ed95b21aSIlya Dryomov 
3232ed95b21aSIlya Dryomov out:
3233ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3234ed95b21aSIlya Dryomov 	return ret;
3235ed95b21aSIlya Dryomov 
3236ed95b21aSIlya Dryomov e_inval:
3237ed95b21aSIlya Dryomov 	ret = -EINVAL;
3238ed95b21aSIlya Dryomov 	goto out;
3239ed95b21aSIlya Dryomov }
3240ed95b21aSIlya Dryomov 
3241ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3242ed95b21aSIlya Dryomov {
3243ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3244ed95b21aSIlya Dryomov 
3245ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3246ed95b21aSIlya Dryomov 	if (wake_all)
3247ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3248ed95b21aSIlya Dryomov 	else
3249ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3250ed95b21aSIlya Dryomov }
3251ed95b21aSIlya Dryomov 
3252ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3253ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3254ed95b21aSIlya Dryomov {
3255ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3256ed95b21aSIlya Dryomov 	u8 lock_type;
3257ed95b21aSIlya Dryomov 	char *lock_tag;
3258ed95b21aSIlya Dryomov 	int ret;
3259ed95b21aSIlya Dryomov 
3260ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3261ed95b21aSIlya Dryomov 
3262ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3263ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3264ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3265ed95b21aSIlya Dryomov 	if (ret)
3266ed95b21aSIlya Dryomov 		return ret;
3267ed95b21aSIlya Dryomov 
3268ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3269ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3270ed95b21aSIlya Dryomov 		goto out;
3271ed95b21aSIlya Dryomov 	}
3272ed95b21aSIlya Dryomov 
3273ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3274ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3275ed95b21aSIlya Dryomov 			 lock_tag);
3276ed95b21aSIlya Dryomov 		ret = -EBUSY;
3277ed95b21aSIlya Dryomov 		goto out;
3278ed95b21aSIlya Dryomov 	}
3279ed95b21aSIlya Dryomov 
3280ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3281ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3282ed95b21aSIlya Dryomov 		ret = -EBUSY;
3283ed95b21aSIlya Dryomov 		goto out;
3284ed95b21aSIlya Dryomov 	}
3285ed95b21aSIlya Dryomov 
3286ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3287ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3288ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3289ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3290ed95b21aSIlya Dryomov 		ret = -EBUSY;
3291ed95b21aSIlya Dryomov 		goto out;
3292ed95b21aSIlya Dryomov 	}
3293ed95b21aSIlya Dryomov 
3294ed95b21aSIlya Dryomov out:
3295ed95b21aSIlya Dryomov 	kfree(lock_tag);
3296ed95b21aSIlya Dryomov 	return ret;
3297ed95b21aSIlya Dryomov }
3298ed95b21aSIlya Dryomov 
3299ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3300ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3301ed95b21aSIlya Dryomov {
3302ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3303ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3304ed95b21aSIlya Dryomov 	u32 num_watchers;
3305ed95b21aSIlya Dryomov 	u64 cookie;
3306ed95b21aSIlya Dryomov 	int i;
3307ed95b21aSIlya Dryomov 	int ret;
3308ed95b21aSIlya Dryomov 
3309ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3310ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3311ed95b21aSIlya Dryomov 				      &num_watchers);
3312ed95b21aSIlya Dryomov 	if (ret)
3313ed95b21aSIlya Dryomov 		return ret;
3314ed95b21aSIlya Dryomov 
3315ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3316ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3317ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3318ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3319ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3320ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3321ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3322ed95b21aSIlya Dryomov 				.handle = cookie,
3323ed95b21aSIlya Dryomov 			};
3324ed95b21aSIlya Dryomov 
3325ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3326ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3327ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3328ed95b21aSIlya Dryomov 			ret = 1;
3329ed95b21aSIlya Dryomov 			goto out;
3330ed95b21aSIlya Dryomov 		}
3331ed95b21aSIlya Dryomov 	}
3332ed95b21aSIlya Dryomov 
3333ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3334ed95b21aSIlya Dryomov 	ret = 0;
3335ed95b21aSIlya Dryomov out:
3336ed95b21aSIlya Dryomov 	kfree(watchers);
3337ed95b21aSIlya Dryomov 	return ret;
3338ed95b21aSIlya Dryomov }
3339ed95b21aSIlya Dryomov 
3340ed95b21aSIlya Dryomov /*
3341ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3342ed95b21aSIlya Dryomov  */
3343ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3344ed95b21aSIlya Dryomov {
3345ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3346ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3347ed95b21aSIlya Dryomov 	u32 num_lockers;
3348ed95b21aSIlya Dryomov 	int ret;
3349ed95b21aSIlya Dryomov 
3350ed95b21aSIlya Dryomov 	for (;;) {
3351ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3352ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3353ed95b21aSIlya Dryomov 			return ret;
3354ed95b21aSIlya Dryomov 
3355ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3356ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3357ed95b21aSIlya Dryomov 		if (ret)
3358ed95b21aSIlya Dryomov 			return ret;
3359ed95b21aSIlya Dryomov 
3360ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3361ed95b21aSIlya Dryomov 			goto again;
3362ed95b21aSIlya Dryomov 
3363ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3364ed95b21aSIlya Dryomov 		if (ret) {
3365ed95b21aSIlya Dryomov 			if (ret > 0)
3366ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3367ed95b21aSIlya Dryomov 			goto out;
3368ed95b21aSIlya Dryomov 		}
3369ed95b21aSIlya Dryomov 
3370ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3371ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3372ed95b21aSIlya Dryomov 
3373ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3374ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3375ed95b21aSIlya Dryomov 		if (ret) {
3376ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3377ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3378ed95b21aSIlya Dryomov 			goto out;
3379ed95b21aSIlya Dryomov 		}
3380ed95b21aSIlya Dryomov 
3381ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3382ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3383ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3384ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3385ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3386ed95b21aSIlya Dryomov 			goto out;
3387ed95b21aSIlya Dryomov 
3388ed95b21aSIlya Dryomov again:
3389ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3390ed95b21aSIlya Dryomov 	}
3391ed95b21aSIlya Dryomov 
3392ed95b21aSIlya Dryomov out:
3393ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3394ed95b21aSIlya Dryomov 	return ret;
3395ed95b21aSIlya Dryomov }
3396ed95b21aSIlya Dryomov 
3397ed95b21aSIlya Dryomov /*
3398ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3399ed95b21aSIlya Dryomov  */
3400ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3401ed95b21aSIlya Dryomov 						int *pret)
3402ed95b21aSIlya Dryomov {
3403ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3404ed95b21aSIlya Dryomov 
3405ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3406ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3407ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3408ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3409ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3410ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3411ed95b21aSIlya Dryomov 		return lock_state;
3412ed95b21aSIlya Dryomov 	}
3413ed95b21aSIlya Dryomov 
3414ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3415ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3416ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3417ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3418ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3419ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3420ed95b21aSIlya Dryomov 		if (*pret)
3421ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3422ed95b21aSIlya Dryomov 	}
3423ed95b21aSIlya Dryomov 
3424ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3425ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3426ed95b21aSIlya Dryomov 	return lock_state;
3427ed95b21aSIlya Dryomov }
3428ed95b21aSIlya Dryomov 
3429ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3430ed95b21aSIlya Dryomov {
3431ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3432ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3433ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3434ed95b21aSIlya Dryomov 	int ret;
3435ed95b21aSIlya Dryomov 
3436ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3437ed95b21aSIlya Dryomov again:
3438ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3439ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3440ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3441ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3442ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3443ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3444ed95b21aSIlya Dryomov 		return;
3445ed95b21aSIlya Dryomov 	}
3446ed95b21aSIlya Dryomov 
3447ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3448ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3449ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3450ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3451ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3452ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3453ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3454ed95b21aSIlya Dryomov 	} else {
3455ed95b21aSIlya Dryomov 		/*
3456ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3457ed95b21aSIlya Dryomov 		 * release the lock
3458ed95b21aSIlya Dryomov 		 */
3459ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3460ed95b21aSIlya Dryomov 		     rbd_dev);
3461ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3462ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3463ed95b21aSIlya Dryomov 	}
3464ed95b21aSIlya Dryomov }
3465ed95b21aSIlya Dryomov 
3466ed95b21aSIlya Dryomov /*
3467ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3468ed95b21aSIlya Dryomov  */
3469ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3470ed95b21aSIlya Dryomov {
3471ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3472ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3473ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3474ed95b21aSIlya Dryomov 		return false;
3475ed95b21aSIlya Dryomov 
3476ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3477ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3478ed95b21aSIlya Dryomov 	/*
3479ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3480ed95b21aSIlya Dryomov 	 *
3481ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3482ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3483ed95b21aSIlya Dryomov 	 */
3484ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3485ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3486ed95b21aSIlya Dryomov 
3487ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3488ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3489ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3490ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3491ed95b21aSIlya Dryomov 		return false;
3492ed95b21aSIlya Dryomov 
3493ed95b21aSIlya Dryomov 	if (!rbd_unlock(rbd_dev))
3494ed95b21aSIlya Dryomov 		/*
3495ed95b21aSIlya Dryomov 		 * Give others a chance to grab the lock - we would re-acquire
3496ed95b21aSIlya Dryomov 		 * almost immediately if we got new IO during ceph_osdc_sync()
3497ed95b21aSIlya Dryomov 		 * otherwise.  We need to ack our own notifications, so this
3498ed95b21aSIlya Dryomov 		 * lock_dwork will be requeued from rbd_wait_state_locked()
3499ed95b21aSIlya Dryomov 		 * after wake_requests() in rbd_handle_released_lock().
3500ed95b21aSIlya Dryomov 		 */
3501ed95b21aSIlya Dryomov 		cancel_delayed_work(&rbd_dev->lock_dwork);
3502ed95b21aSIlya Dryomov 
3503ed95b21aSIlya Dryomov 	return true;
3504ed95b21aSIlya Dryomov }
3505ed95b21aSIlya Dryomov 
3506ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3507ed95b21aSIlya Dryomov {
3508ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3509ed95b21aSIlya Dryomov 						  unlock_work);
3510ed95b21aSIlya Dryomov 
3511ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3512ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3513ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3514ed95b21aSIlya Dryomov }
3515ed95b21aSIlya Dryomov 
3516ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3517ed95b21aSIlya Dryomov 				     void **p)
3518ed95b21aSIlya Dryomov {
3519ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3520ed95b21aSIlya Dryomov 
3521ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3522ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3523ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3524ed95b21aSIlya Dryomov 	}
3525ed95b21aSIlya Dryomov 
3526ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3527ed95b21aSIlya Dryomov 	     cid.handle);
3528ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3529ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3530ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3531ed95b21aSIlya Dryomov 			/*
3532ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3533ed95b21aSIlya Dryomov 			 * the owner
3534ed95b21aSIlya Dryomov 			 */
3535ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3536ed95b21aSIlya Dryomov 			return;
3537ed95b21aSIlya Dryomov 		}
3538ed95b21aSIlya Dryomov 
3539ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3540ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3541ed95b21aSIlya Dryomov 	} else {
3542ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3543ed95b21aSIlya Dryomov 	}
3544ed95b21aSIlya Dryomov 
3545ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3546ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3547ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3548ed95b21aSIlya Dryomov }
3549ed95b21aSIlya Dryomov 
3550ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3551ed95b21aSIlya Dryomov 				     void **p)
3552ed95b21aSIlya Dryomov {
3553ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3554ed95b21aSIlya Dryomov 
3555ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3556ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3557ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3558ed95b21aSIlya Dryomov 	}
3559ed95b21aSIlya Dryomov 
3560ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3561ed95b21aSIlya Dryomov 	     cid.handle);
3562ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3563ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3564ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3565ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3566ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3567ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3568ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3569ed95b21aSIlya Dryomov 			return;
3570ed95b21aSIlya Dryomov 		}
3571ed95b21aSIlya Dryomov 
3572ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3573ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3574ed95b21aSIlya Dryomov 	} else {
3575ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3576ed95b21aSIlya Dryomov 	}
3577ed95b21aSIlya Dryomov 
3578ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3579ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3580ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3581ed95b21aSIlya Dryomov }
3582ed95b21aSIlya Dryomov 
3583ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3584ed95b21aSIlya Dryomov 				    void **p)
3585ed95b21aSIlya Dryomov {
3586ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3587ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3588ed95b21aSIlya Dryomov 	bool need_to_send;
3589ed95b21aSIlya Dryomov 
3590ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3591ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3592ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3593ed95b21aSIlya Dryomov 	}
3594ed95b21aSIlya Dryomov 
3595ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3596ed95b21aSIlya Dryomov 	     cid.handle);
3597ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
3598ed95b21aSIlya Dryomov 		return false;
3599ed95b21aSIlya Dryomov 
3600ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3601ed95b21aSIlya Dryomov 	need_to_send = __rbd_is_lock_owner(rbd_dev);
3602ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3603ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3604ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3605ed95b21aSIlya Dryomov 			     rbd_dev);
3606ed95b21aSIlya Dryomov 			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3607ed95b21aSIlya Dryomov 		}
3608ed95b21aSIlya Dryomov 	}
3609ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3610ed95b21aSIlya Dryomov 	return need_to_send;
3611ed95b21aSIlya Dryomov }
3612ed95b21aSIlya Dryomov 
3613ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3614ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3615ed95b21aSIlya Dryomov {
3616ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3617ed95b21aSIlya Dryomov 	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3618ed95b21aSIlya Dryomov 	char buf[buf_size];
3619ed95b21aSIlya Dryomov 	int ret;
3620ed95b21aSIlya Dryomov 
3621ed95b21aSIlya Dryomov 	if (result) {
3622ed95b21aSIlya Dryomov 		void *p = buf;
3623ed95b21aSIlya Dryomov 
3624ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3625ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3626ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3627ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3628ed95b21aSIlya Dryomov 	} else {
3629ed95b21aSIlya Dryomov 		buf_size = 0;
3630ed95b21aSIlya Dryomov 	}
3631ed95b21aSIlya Dryomov 
3632ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3633ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3634ed95b21aSIlya Dryomov 				   buf, buf_size);
3635ed95b21aSIlya Dryomov 	if (ret)
3636ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3637ed95b21aSIlya Dryomov }
3638ed95b21aSIlya Dryomov 
3639ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3640ed95b21aSIlya Dryomov 				   u64 cookie)
3641ed95b21aSIlya Dryomov {
3642ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3643ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3644ed95b21aSIlya Dryomov }
3645ed95b21aSIlya Dryomov 
3646ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3647ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3648ed95b21aSIlya Dryomov {
3649ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3650ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3651ed95b21aSIlya Dryomov }
3652922dab61SIlya Dryomov 
3653922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3654922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3655b8d70035SAlex Elder {
3656922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3657ed95b21aSIlya Dryomov 	void *p = data;
3658ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3659d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3660ed95b21aSIlya Dryomov 	u32 len;
3661ed95b21aSIlya Dryomov 	u32 notify_op;
3662b8d70035SAlex Elder 	int ret;
3663b8d70035SAlex Elder 
3664ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3665ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3666ed95b21aSIlya Dryomov 	if (data_len) {
3667ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3668ed95b21aSIlya Dryomov 					  &struct_v, &len);
3669ed95b21aSIlya Dryomov 		if (ret) {
3670ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3671ed95b21aSIlya Dryomov 				 ret);
3672ed95b21aSIlya Dryomov 			return;
3673ed95b21aSIlya Dryomov 		}
367452bb1f9bSIlya Dryomov 
3675ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3676ed95b21aSIlya Dryomov 	} else {
3677ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3678ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3679ed95b21aSIlya Dryomov 		len = 0;
3680ed95b21aSIlya Dryomov 	}
3681ed95b21aSIlya Dryomov 
3682ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3683ed95b21aSIlya Dryomov 	switch (notify_op) {
3684ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3685ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3686ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3687ed95b21aSIlya Dryomov 		break;
3688ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3689ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3690ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3691ed95b21aSIlya Dryomov 		break;
3692ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
3693ed95b21aSIlya Dryomov 		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
369452bb1f9bSIlya Dryomov 			/*
3695ed95b21aSIlya Dryomov 			 * send ResponseMessage(0) back so the client
3696ed95b21aSIlya Dryomov 			 * can detect a missing owner
369752bb1f9bSIlya Dryomov 			 */
3698ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3699ed95b21aSIlya Dryomov 						      cookie, 0);
3700ed95b21aSIlya Dryomov 		else
3701ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3702ed95b21aSIlya Dryomov 		break;
3703ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3704e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3705e627db08SAlex Elder 		if (ret)
37069584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3707b8d70035SAlex Elder 
3708ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3709ed95b21aSIlya Dryomov 		break;
3710ed95b21aSIlya Dryomov 	default:
3711ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3712ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3713ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3714ed95b21aSIlya Dryomov 		else
3715ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3716ed95b21aSIlya Dryomov 		break;
3717b8d70035SAlex Elder 	}
3718b8d70035SAlex Elder }
3719b8d70035SAlex Elder 
372099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
37219969ebc5SAlex Elder 
3722922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3723bb040aa0SIlya Dryomov {
3724922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3725bb040aa0SIlya Dryomov 
3726922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3727bb040aa0SIlya Dryomov 
3728ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3729ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3730ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3731bb040aa0SIlya Dryomov 
373299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
373399d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
373499d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
373599d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3736bb040aa0SIlya Dryomov 
373799d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3738bb040aa0SIlya Dryomov 	}
373999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3740bb040aa0SIlya Dryomov }
3741bb040aa0SIlya Dryomov 
3742bb040aa0SIlya Dryomov /*
374399d16943SIlya Dryomov  * watch_mutex must be locked
37449969ebc5SAlex Elder  */
374599d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
37469969ebc5SAlex Elder {
37479969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3748922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
37499969ebc5SAlex Elder 
3750922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
375199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
37529969ebc5SAlex Elder 
3753922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3754922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3755922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3756922dab61SIlya Dryomov 	if (IS_ERR(handle))
3757922dab61SIlya Dryomov 		return PTR_ERR(handle);
37589969ebc5SAlex Elder 
3759922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
37608eb87565SAlex Elder 	return 0;
37619969ebc5SAlex Elder }
37629969ebc5SAlex Elder 
376399d16943SIlya Dryomov /*
376499d16943SIlya Dryomov  * watch_mutex must be locked
376599d16943SIlya Dryomov  */
376699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3767fca27065SIlya Dryomov {
3768922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3769922dab61SIlya Dryomov 	int ret;
3770b30a01f2SIlya Dryomov 
377199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
377299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3773b30a01f2SIlya Dryomov 
3774922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3775922dab61SIlya Dryomov 	if (ret)
3776922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3777b30a01f2SIlya Dryomov 
3778922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3779c525f036SIlya Dryomov }
3780c525f036SIlya Dryomov 
378199d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3782c525f036SIlya Dryomov {
378399d16943SIlya Dryomov 	int ret;
3784811c6688SIlya Dryomov 
378599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
378699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
378799d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
378899d16943SIlya Dryomov 	if (ret)
378999d16943SIlya Dryomov 		goto out;
379099d16943SIlya Dryomov 
379199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
379299d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
379399d16943SIlya Dryomov 
379499d16943SIlya Dryomov out:
379599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
379699d16943SIlya Dryomov 	return ret;
379799d16943SIlya Dryomov }
379899d16943SIlya Dryomov 
379999d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
380099d16943SIlya Dryomov {
380199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
380299d16943SIlya Dryomov 
380399d16943SIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3804ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3805ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3806ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3807ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
380899d16943SIlya Dryomov }
380999d16943SIlya Dryomov 
381099d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
381199d16943SIlya Dryomov {
3812ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
381399d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
381499d16943SIlya Dryomov 
381599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
381699d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
381799d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
381899d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
381999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
382099d16943SIlya Dryomov 
3821811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3822fca27065SIlya Dryomov }
3823fca27065SIlya Dryomov 
382499d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
382599d16943SIlya Dryomov {
382699d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
382799d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
3828ed95b21aSIlya Dryomov 	bool was_lock_owner = false;
382987c0fdedSIlya Dryomov 	bool need_to_wake = false;
383099d16943SIlya Dryomov 	int ret;
383199d16943SIlya Dryomov 
383299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
383399d16943SIlya Dryomov 
3834ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3835ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3836ed95b21aSIlya Dryomov 		was_lock_owner = rbd_release_lock(rbd_dev);
3837ed95b21aSIlya Dryomov 
383899d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
383987c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
384087c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
384187c0fdedSIlya Dryomov 		goto out;
384287c0fdedSIlya Dryomov 	}
384399d16943SIlya Dryomov 
384499d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
384599d16943SIlya Dryomov 	if (ret) {
384699d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
38474d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
384887c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
384987c0fdedSIlya Dryomov 			need_to_wake = true;
385087c0fdedSIlya Dryomov 		} else {
385199d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
385299d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
385399d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
385487c0fdedSIlya Dryomov 		}
385587c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
385687c0fdedSIlya Dryomov 		goto out;
385799d16943SIlya Dryomov 	}
385899d16943SIlya Dryomov 
385987c0fdedSIlya Dryomov 	need_to_wake = true;
386099d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
386199d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
386299d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
386399d16943SIlya Dryomov 
386499d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
386599d16943SIlya Dryomov 	if (ret)
386699d16943SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
386799d16943SIlya Dryomov 
3868ed95b21aSIlya Dryomov 	if (was_lock_owner) {
3869ed95b21aSIlya Dryomov 		ret = rbd_try_lock(rbd_dev);
3870ed95b21aSIlya Dryomov 		if (ret)
3871ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "reregisteration lock failed: %d",
3872ed95b21aSIlya Dryomov 				 ret);
3873ed95b21aSIlya Dryomov 	}
3874ed95b21aSIlya Dryomov 
387587c0fdedSIlya Dryomov out:
3876ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
387787c0fdedSIlya Dryomov 	if (need_to_wake)
3878ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, true);
387999d16943SIlya Dryomov }
388099d16943SIlya Dryomov 
388136be9a76SAlex Elder /*
3882f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3883f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
388436be9a76SAlex Elder  */
388536be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3886ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3887ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
388836be9a76SAlex Elder 			     const char *method_name,
38894157976bSAlex Elder 			     const void *outbound,
389036be9a76SAlex Elder 			     size_t outbound_size,
38914157976bSAlex Elder 			     void *inbound,
3892e2a58ee5SAlex Elder 			     size_t inbound_size)
389336be9a76SAlex Elder {
3894ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3895ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3896ecd4a68aSIlya Dryomov 	struct page *reply_page;
389736be9a76SAlex Elder 	int ret;
389836be9a76SAlex Elder 
389936be9a76SAlex Elder 	/*
39006010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
39016010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
39026010a451SAlex Elder 	 * also supply outbound data--parameters for the object
39036010a451SAlex Elder 	 * method.  Currently if this is present it will be a
39046010a451SAlex Elder 	 * snapshot id.
390536be9a76SAlex Elder 	 */
3906ecd4a68aSIlya Dryomov 	if (outbound) {
3907ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3908ecd4a68aSIlya Dryomov 			return -E2BIG;
390936be9a76SAlex Elder 
3910ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3911ecd4a68aSIlya Dryomov 		if (!req_page)
3912ecd4a68aSIlya Dryomov 			return -ENOMEM;
391336be9a76SAlex Elder 
3914ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
391504017e29SAlex Elder 	}
3916430c28c3SAlex Elder 
3917ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3918ecd4a68aSIlya Dryomov 	if (!reply_page) {
3919ecd4a68aSIlya Dryomov 		if (req_page)
3920ecd4a68aSIlya Dryomov 			__free_page(req_page);
3921ecd4a68aSIlya Dryomov 		return -ENOMEM;
3922ecd4a68aSIlya Dryomov 	}
392336be9a76SAlex Elder 
3924ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3925ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3926ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3927ecd4a68aSIlya Dryomov 	if (!ret) {
3928ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3929ecd4a68aSIlya Dryomov 		ret = inbound_size;
3930ecd4a68aSIlya Dryomov 	}
393157385b51SAlex Elder 
3932ecd4a68aSIlya Dryomov 	if (req_page)
3933ecd4a68aSIlya Dryomov 		__free_page(req_page);
3934ecd4a68aSIlya Dryomov 	__free_page(reply_page);
393536be9a76SAlex Elder 	return ret;
393636be9a76SAlex Elder }
393736be9a76SAlex Elder 
3938ed95b21aSIlya Dryomov /*
3939ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
3940ed95b21aSIlya Dryomov  */
3941ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3942ed95b21aSIlya Dryomov {
3943ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
3944ed95b21aSIlya Dryomov 
3945ed95b21aSIlya Dryomov 	do {
3946ed95b21aSIlya Dryomov 		/*
3947ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3948ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
3949ed95b21aSIlya Dryomov 		 */
3950ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3951ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3952ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3953ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
3954ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3955ed95b21aSIlya Dryomov 		schedule();
3956ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
395787c0fdedSIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
395887c0fdedSIlya Dryomov 		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
395987c0fdedSIlya Dryomov 
3960ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
3961ed95b21aSIlya Dryomov }
3962ed95b21aSIlya Dryomov 
39637ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3964bc1ecc65SIlya Dryomov {
39657ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
39667ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3967bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
39684e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3969bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3970bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
39716d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
39724e752f0aSJosh Durgin 	u64 mapping_size;
397380de1912SIlya Dryomov 	bool must_be_locked;
3974bc1ecc65SIlya Dryomov 	int result;
3975bc1ecc65SIlya Dryomov 
3976aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
3977aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
3978aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
3979aebf526bSChristoph Hellwig 		break;
3980aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
3981aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
3982aebf526bSChristoph Hellwig 		break;
3983aebf526bSChristoph Hellwig 	case REQ_OP_READ:
3984aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
3985aebf526bSChristoph Hellwig 		break;
3986aebf526bSChristoph Hellwig 	default:
3987aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
39887ad18afaSChristoph Hellwig 		result = -EIO;
39897ad18afaSChristoph Hellwig 		goto err;
39907ad18afaSChristoph Hellwig 	}
39917ad18afaSChristoph Hellwig 
3992bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3993bc1ecc65SIlya Dryomov 
3994bc1ecc65SIlya Dryomov 	if (!length) {
3995bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3996bc1ecc65SIlya Dryomov 		result = 0;
3997bc1ecc65SIlya Dryomov 		goto err_rq;
3998bc1ecc65SIlya Dryomov 	}
3999bc1ecc65SIlya Dryomov 
40006d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
4001bc1ecc65SIlya Dryomov 
40026d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
4003bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
4004bc1ecc65SIlya Dryomov 			result = -EROFS;
4005bc1ecc65SIlya Dryomov 			goto err_rq;
4006bc1ecc65SIlya Dryomov 		}
4007bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4008bc1ecc65SIlya Dryomov 	}
4009bc1ecc65SIlya Dryomov 
4010bc1ecc65SIlya Dryomov 	/*
4011bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4012bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4013bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4014bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4015bc1ecc65SIlya Dryomov 	 */
4016bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4017bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4018bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4019bc1ecc65SIlya Dryomov 		result = -ENXIO;
4020bc1ecc65SIlya Dryomov 		goto err_rq;
4021bc1ecc65SIlya Dryomov 	}
4022bc1ecc65SIlya Dryomov 
4023bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4024bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4025bc1ecc65SIlya Dryomov 			 length);
4026bc1ecc65SIlya Dryomov 		result = -EINVAL;
4027bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4028bc1ecc65SIlya Dryomov 	}
4029bc1ecc65SIlya Dryomov 
40307ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
40317ad18afaSChristoph Hellwig 
40324e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
40334e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
40346d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
40354e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
40364e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
4037ed95b21aSIlya Dryomov 		must_be_locked = rbd_is_lock_supported(rbd_dev);
403880de1912SIlya Dryomov 	} else {
403980de1912SIlya Dryomov 		must_be_locked = rbd_dev->opts->lock_on_read &&
404080de1912SIlya Dryomov 					rbd_is_lock_supported(rbd_dev);
40414e752f0aSJosh Durgin 	}
40424e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
40434e752f0aSJosh Durgin 
40444e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4045bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
40464e752f0aSJosh Durgin 			 length, mapping_size);
4047bc1ecc65SIlya Dryomov 		result = -EIO;
4048bc1ecc65SIlya Dryomov 		goto err_rq;
4049bc1ecc65SIlya Dryomov 	}
4050bc1ecc65SIlya Dryomov 
4051ed95b21aSIlya Dryomov 	if (must_be_locked) {
4052ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
405387c0fdedSIlya Dryomov 		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
405487c0fdedSIlya Dryomov 		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
4055ed95b21aSIlya Dryomov 			rbd_wait_state_locked(rbd_dev);
405687c0fdedSIlya Dryomov 
405787c0fdedSIlya Dryomov 		WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^
405887c0fdedSIlya Dryomov 			!test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
405987c0fdedSIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
406087c0fdedSIlya Dryomov 			result = -EBLACKLISTED;
406187c0fdedSIlya Dryomov 			goto err_unlock;
406287c0fdedSIlya Dryomov 		}
4063ed95b21aSIlya Dryomov 	}
4064ed95b21aSIlya Dryomov 
40656d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
40664e752f0aSJosh Durgin 					     snapc);
4067bc1ecc65SIlya Dryomov 	if (!img_request) {
4068bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4069ed95b21aSIlya Dryomov 		goto err_unlock;
4070bc1ecc65SIlya Dryomov 	}
4071bc1ecc65SIlya Dryomov 	img_request->rq = rq;
407270b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4073bc1ecc65SIlya Dryomov 
407490e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
407590e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
407690e98c52SGuangliang Zhao 					      NULL);
407790e98c52SGuangliang Zhao 	else
407890e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
407990e98c52SGuangliang Zhao 					      rq->bio);
4080bc1ecc65SIlya Dryomov 	if (result)
4081bc1ecc65SIlya Dryomov 		goto err_img_request;
4082bc1ecc65SIlya Dryomov 
4083bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
4084bc1ecc65SIlya Dryomov 	if (result)
4085bc1ecc65SIlya Dryomov 		goto err_img_request;
4086bc1ecc65SIlya Dryomov 
4087ed95b21aSIlya Dryomov 	if (must_be_locked)
4088ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4089bc1ecc65SIlya Dryomov 	return;
4090bc1ecc65SIlya Dryomov 
4091bc1ecc65SIlya Dryomov err_img_request:
4092bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4093ed95b21aSIlya Dryomov err_unlock:
4094ed95b21aSIlya Dryomov 	if (must_be_locked)
4095ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4096bc1ecc65SIlya Dryomov err_rq:
4097bc1ecc65SIlya Dryomov 	if (result)
4098bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
40996d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
41004e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
41017ad18afaSChristoph Hellwig err:
41027ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
4103bc1ecc65SIlya Dryomov }
4104bc1ecc65SIlya Dryomov 
41057ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
41067ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4107bc1ecc65SIlya Dryomov {
41087ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
41097ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4110bc1ecc65SIlya Dryomov 
41117ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
41127ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
4113bf0d5f50SAlex Elder }
4114bf0d5f50SAlex Elder 
4115602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4116602adf40SYehuda Sadeh {
4117602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
4118602adf40SYehuda Sadeh 
4119602adf40SYehuda Sadeh 	if (!disk)
4120602adf40SYehuda Sadeh 		return;
4121602adf40SYehuda Sadeh 
4122a0cab924SAlex Elder 	rbd_dev->disk = NULL;
4123a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
4124602adf40SYehuda Sadeh 		del_gendisk(disk);
4125602adf40SYehuda Sadeh 		if (disk->queue)
4126602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
41277ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
4128a0cab924SAlex Elder 	}
4129602adf40SYehuda Sadeh 	put_disk(disk);
4130602adf40SYehuda Sadeh }
4131602adf40SYehuda Sadeh 
4132788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4133fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
4134fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
4135fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
4136788e2df3SAlex Elder 
4137788e2df3SAlex Elder {
4138fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4139fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
4140fe5478e0SIlya Dryomov 	struct page **pages;
4141fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
4142788e2df3SAlex Elder 	int ret;
4143788e2df3SAlex Elder 
4144fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4145fe5478e0SIlya Dryomov 	if (!req)
4146fe5478e0SIlya Dryomov 		return -ENOMEM;
4147788e2df3SAlex Elder 
4148fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
4149fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4150fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
4151788e2df3SAlex Elder 
4152fe5478e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4153788e2df3SAlex Elder 	if (ret)
4154fe5478e0SIlya Dryomov 		goto out_req;
4155788e2df3SAlex Elder 
4156fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4157fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
4158fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
4159fe5478e0SIlya Dryomov 		goto out_req;
4160fe5478e0SIlya Dryomov 	}
41611ceae7efSAlex Elder 
4162fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4163fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4164fe5478e0SIlya Dryomov 					 true);
4165788e2df3SAlex Elder 
4166fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4167fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4168fe5478e0SIlya Dryomov 	if (ret >= 0)
4169fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4170fe5478e0SIlya Dryomov 
4171fe5478e0SIlya Dryomov out_req:
4172fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4173788e2df3SAlex Elder 	return ret;
4174788e2df3SAlex Elder }
4175788e2df3SAlex Elder 
4176602adf40SYehuda Sadeh /*
4177662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4178662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4179662518b1SAlex Elder  * information about the image.
41804156d998SAlex Elder  */
418199a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
41824156d998SAlex Elder {
41834156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
41844156d998SAlex Elder 	u32 snap_count = 0;
41854156d998SAlex Elder 	u64 names_size = 0;
41864156d998SAlex Elder 	u32 want_count;
41874156d998SAlex Elder 	int ret;
41884156d998SAlex Elder 
41894156d998SAlex Elder 	/*
41904156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
41914156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
41924156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
41934156d998SAlex Elder 	 * the number of snapshots could change by the time we read
41944156d998SAlex Elder 	 * it in, in which case we re-read it.
41954156d998SAlex Elder 	 */
41964156d998SAlex Elder 	do {
41974156d998SAlex Elder 		size_t size;
41984156d998SAlex Elder 
41994156d998SAlex Elder 		kfree(ondisk);
42004156d998SAlex Elder 
42014156d998SAlex Elder 		size = sizeof (*ondisk);
42024156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
42034156d998SAlex Elder 		size += names_size;
42044156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
42054156d998SAlex Elder 		if (!ondisk)
4206662518b1SAlex Elder 			return -ENOMEM;
42074156d998SAlex Elder 
4208fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4209fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
42104156d998SAlex Elder 		if (ret < 0)
4211662518b1SAlex Elder 			goto out;
4212c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
42134156d998SAlex Elder 			ret = -ENXIO;
421406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
421506ecc6cbSAlex Elder 				size, ret);
4216662518b1SAlex Elder 			goto out;
42174156d998SAlex Elder 		}
42184156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
42194156d998SAlex Elder 			ret = -ENXIO;
422006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4221662518b1SAlex Elder 			goto out;
42224156d998SAlex Elder 		}
42234156d998SAlex Elder 
42244156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
42254156d998SAlex Elder 		want_count = snap_count;
42264156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
42274156d998SAlex Elder 	} while (snap_count != want_count);
42284156d998SAlex Elder 
4229662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4230662518b1SAlex Elder out:
42314156d998SAlex Elder 	kfree(ondisk);
42324156d998SAlex Elder 
4233dfc5606dSYehuda Sadeh 	return ret;
4234602adf40SYehuda Sadeh }
4235602adf40SYehuda Sadeh 
423615228edeSAlex Elder /*
423715228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
423815228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
423915228edeSAlex Elder  */
424015228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
424115228edeSAlex Elder {
424215228edeSAlex Elder 	u64 snap_id;
424315228edeSAlex Elder 
424415228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
424515228edeSAlex Elder 		return;
424615228edeSAlex Elder 
424715228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
424815228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
424915228edeSAlex Elder 		return;
425015228edeSAlex Elder 
425115228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
425215228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
425315228edeSAlex Elder }
425415228edeSAlex Elder 
42559875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
42569875201eSJosh Durgin {
42579875201eSJosh Durgin 	sector_t size;
42589875201eSJosh Durgin 
42599875201eSJosh Durgin 	/*
4260811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4261811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4262811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
42639875201eSJosh Durgin 	 */
4264811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4265811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
42669875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
42679875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
42689875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
42699875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
42709875201eSJosh Durgin 	}
42719875201eSJosh Durgin }
42729875201eSJosh Durgin 
4273cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
42741fe5e993SAlex Elder {
4275e627db08SAlex Elder 	u64 mapping_size;
42761fe5e993SAlex Elder 	int ret;
42771fe5e993SAlex Elder 
4278cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
42793b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4280a720ae09SIlya Dryomov 
4281a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
428252bb1f9bSIlya Dryomov 	if (ret)
428373e39e4dSIlya Dryomov 		goto out;
428415228edeSAlex Elder 
4285e8f59b59SIlya Dryomov 	/*
4286e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4287e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4288e8f59b59SIlya Dryomov 	 */
4289e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4290e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4291e8f59b59SIlya Dryomov 		if (ret)
429273e39e4dSIlya Dryomov 			goto out;
4293e8f59b59SIlya Dryomov 	}
4294e8f59b59SIlya Dryomov 
42955ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
42965ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
42975ff1108cSIlya Dryomov 	} else {
42985ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
429915228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
43005ff1108cSIlya Dryomov 	}
43015ff1108cSIlya Dryomov 
430273e39e4dSIlya Dryomov out:
4303cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
430473e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
43059875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
43061fe5e993SAlex Elder 
430773e39e4dSIlya Dryomov 	return ret;
43081fe5e993SAlex Elder }
43091fe5e993SAlex Elder 
43107ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
43117ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
43127ad18afaSChristoph Hellwig 		unsigned int numa_node)
43137ad18afaSChristoph Hellwig {
43147ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
43157ad18afaSChristoph Hellwig 
43167ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
43177ad18afaSChristoph Hellwig 	return 0;
43187ad18afaSChristoph Hellwig }
43197ad18afaSChristoph Hellwig 
43207ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
43217ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
43227ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
43237ad18afaSChristoph Hellwig };
43247ad18afaSChristoph Hellwig 
4325602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4326602adf40SYehuda Sadeh {
4327602adf40SYehuda Sadeh 	struct gendisk *disk;
4328602adf40SYehuda Sadeh 	struct request_queue *q;
4329593a9e7bSAlex Elder 	u64 segment_size;
43307ad18afaSChristoph Hellwig 	int err;
4331602adf40SYehuda Sadeh 
4332602adf40SYehuda Sadeh 	/* create gendisk info */
43337e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
43347e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
43357e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4336602adf40SYehuda Sadeh 	if (!disk)
43371fcdb8aaSAlex Elder 		return -ENOMEM;
4338602adf40SYehuda Sadeh 
4339f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4340de71a297SAlex Elder 		 rbd_dev->dev_id);
4341602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4342dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
43437e513d43SIlya Dryomov 	if (single_major)
43447e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4345602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4346602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4347602adf40SYehuda Sadeh 
43487ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
43497ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4350b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
43517ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4352b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
43537ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
43547ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
43557ad18afaSChristoph Hellwig 
43567ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
43577ad18afaSChristoph Hellwig 	if (err)
4358602adf40SYehuda Sadeh 		goto out_disk;
4359029bcbd8SJosh Durgin 
43607ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
43617ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
43627ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
43637ad18afaSChristoph Hellwig 		goto out_tag_set;
43647ad18afaSChristoph Hellwig 	}
43657ad18afaSChristoph Hellwig 
4366d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4367d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4368593a9e7bSAlex Elder 
4369029bcbd8SJosh Durgin 	/* set io sizes to object size */
4370593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
4371593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
43720d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
4373d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
4374593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
4375593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
4376593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
4377029bcbd8SJosh Durgin 
437890e98c52SGuangliang Zhao 	/* enable the discard support */
437990e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
438090e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
438190e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
43822bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
4383b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
438490e98c52SGuangliang Zhao 
4385bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4386dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
4387bae818eeSRonny Hegewald 
4388602adf40SYehuda Sadeh 	disk->queue = q;
4389602adf40SYehuda Sadeh 
4390602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4391602adf40SYehuda Sadeh 
4392602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4393602adf40SYehuda Sadeh 
4394602adf40SYehuda Sadeh 	return 0;
43957ad18afaSChristoph Hellwig out_tag_set:
43967ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4397602adf40SYehuda Sadeh out_disk:
4398602adf40SYehuda Sadeh 	put_disk(disk);
43997ad18afaSChristoph Hellwig 	return err;
4400602adf40SYehuda Sadeh }
4401602adf40SYehuda Sadeh 
4402dfc5606dSYehuda Sadeh /*
4403dfc5606dSYehuda Sadeh   sysfs
4404dfc5606dSYehuda Sadeh */
4405602adf40SYehuda Sadeh 
4406593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4407593a9e7bSAlex Elder {
4408593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4409593a9e7bSAlex Elder }
4410593a9e7bSAlex Elder 
4411dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4412dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4413602adf40SYehuda Sadeh {
4414593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4415dfc5606dSYehuda Sadeh 
4416fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4417fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4418602adf40SYehuda Sadeh }
4419602adf40SYehuda Sadeh 
442034b13184SAlex Elder /*
442134b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
442234b13184SAlex Elder  * necessarily the base image.
442334b13184SAlex Elder  */
442434b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
442534b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
442634b13184SAlex Elder {
442734b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
442834b13184SAlex Elder 
442934b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
443034b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
443134b13184SAlex Elder }
443234b13184SAlex Elder 
4433dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4434dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4435602adf40SYehuda Sadeh {
4436593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4437dfc5606dSYehuda Sadeh 
4438fc71d833SAlex Elder 	if (rbd_dev->major)
4439dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4440fc71d833SAlex Elder 
4441fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4442dd82fff1SIlya Dryomov }
4443fc71d833SAlex Elder 
4444dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4445dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4446dd82fff1SIlya Dryomov {
4447dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4448dd82fff1SIlya Dryomov 
4449dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4450dfc5606dSYehuda Sadeh }
4451dfc5606dSYehuda Sadeh 
4452005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4453005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4454005a07bfSIlya Dryomov {
4455005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4456005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4457005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4458005a07bfSIlya Dryomov 
4459005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4460005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4461005a07bfSIlya Dryomov }
4462005a07bfSIlya Dryomov 
4463dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4464dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4465dfc5606dSYehuda Sadeh {
4466593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4467dfc5606dSYehuda Sadeh 
44681dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4469033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4470dfc5606dSYehuda Sadeh }
4471dfc5606dSYehuda Sadeh 
4472267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4473267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4474267fb90bSMike Christie {
4475267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4476267fb90bSMike Christie 
4477267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4478267fb90bSMike Christie }
4479267fb90bSMike Christie 
44800d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
44810d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
44820d6d1e9cSMike Christie {
44830d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
44840d6d1e9cSMike Christie 
44850d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4486dfc5606dSYehuda Sadeh }
4487dfc5606dSYehuda Sadeh 
4488dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4489dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4490dfc5606dSYehuda Sadeh {
4491593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4492dfc5606dSYehuda Sadeh 
44930d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4494dfc5606dSYehuda Sadeh }
4495dfc5606dSYehuda Sadeh 
44969bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
44979bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
44989bb2f334SAlex Elder {
44999bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45009bb2f334SAlex Elder 
45010d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
45020d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
45039bb2f334SAlex Elder }
45049bb2f334SAlex Elder 
4505dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4506dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4507dfc5606dSYehuda Sadeh {
4508593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4509dfc5606dSYehuda Sadeh 
4510a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
45110d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4512a92ffdf8SAlex Elder 
4513a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4514dfc5606dSYehuda Sadeh }
4515dfc5606dSYehuda Sadeh 
4516589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4517589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4518589d30e0SAlex Elder {
4519589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4520589d30e0SAlex Elder 
45210d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4522589d30e0SAlex Elder }
4523589d30e0SAlex Elder 
452434b13184SAlex Elder /*
452534b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
452634b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
452734b13184SAlex Elder  */
4528dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4529dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4530dfc5606dSYehuda Sadeh 			     char *buf)
4531dfc5606dSYehuda Sadeh {
4532593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4533dfc5606dSYehuda Sadeh 
45340d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4535dfc5606dSYehuda Sadeh }
4536dfc5606dSYehuda Sadeh 
453792a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
453892a58671SMike Christie 				struct device_attribute *attr, char *buf)
453992a58671SMike Christie {
454092a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
454192a58671SMike Christie 
454292a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
454392a58671SMike Christie }
454492a58671SMike Christie 
454586b00e0dSAlex Elder /*
4546ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4547ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4548ff96128fSIlya Dryomov  * image)".
454986b00e0dSAlex Elder  */
455086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
455186b00e0dSAlex Elder 			       struct device_attribute *attr,
455286b00e0dSAlex Elder 			       char *buf)
455386b00e0dSAlex Elder {
455486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4555ff96128fSIlya Dryomov 	ssize_t count = 0;
455686b00e0dSAlex Elder 
4557ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
455886b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
455986b00e0dSAlex Elder 
4560ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4561ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
456286b00e0dSAlex Elder 
4563ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4564ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4565ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4566ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4567ff96128fSIlya Dryomov 			    "overlap %llu\n",
4568ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4569ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4570ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4571ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4572ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4573ff96128fSIlya Dryomov 	}
457486b00e0dSAlex Elder 
457586b00e0dSAlex Elder 	return count;
457686b00e0dSAlex Elder }
457786b00e0dSAlex Elder 
4578dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4579dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4580dfc5606dSYehuda Sadeh 				 const char *buf,
4581dfc5606dSYehuda Sadeh 				 size_t size)
4582dfc5606dSYehuda Sadeh {
4583593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4584b813623aSAlex Elder 	int ret;
4585602adf40SYehuda Sadeh 
4586cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4587e627db08SAlex Elder 	if (ret)
458852bb1f9bSIlya Dryomov 		return ret;
4589b813623aSAlex Elder 
459052bb1f9bSIlya Dryomov 	return size;
4591dfc5606dSYehuda Sadeh }
4592602adf40SYehuda Sadeh 
/* Read-only attributes describing the mapping. */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);

/* Read-only attributes describing the ceph client. */
static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);

/* Readable by the owner only. */
static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);

/* Read-only attributes taken from the image spec. */
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only trigger, writable by the owner only. */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4609dfc5606dSYehuda Sadeh 
4610dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4611dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
461234b13184SAlex Elder 	&dev_attr_features.attr,
4613dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4614dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4615005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4616dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4617267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
46180d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4619dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
46209bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4621dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4622589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4623dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
462492a58671SMike Christie 	&dev_attr_snap_id.attr,
462586b00e0dSAlex Elder 	&dev_attr_parent.attr,
4626dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4627dfc5606dSYehuda Sadeh 	NULL
4628dfc5606dSYehuda Sadeh };
4629dfc5606dSYehuda Sadeh 
4630dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4631dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4632dfc5606dSYehuda Sadeh };
4633dfc5606dSYehuda Sadeh 
4634dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4635dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4636dfc5606dSYehuda Sadeh 	NULL
4637dfc5606dSYehuda Sadeh };
4638dfc5606dSYehuda Sadeh 
46396cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4640dfc5606dSYehuda Sadeh 
4641b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
4642dfc5606dSYehuda Sadeh 	.name		= "rbd",
4643dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
46446cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4645dfc5606dSYehuda Sadeh };
4646dfc5606dSYehuda Sadeh 
46478b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
46488b8fb99cSAlex Elder {
46498b8fb99cSAlex Elder 	kref_get(&spec->kref);
46508b8fb99cSAlex Elder 
46518b8fb99cSAlex Elder 	return spec;
46528b8fb99cSAlex Elder }
46538b8fb99cSAlex Elder 
46548b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
46558b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
46568b8fb99cSAlex Elder {
46578b8fb99cSAlex Elder 	if (spec)
46588b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
46598b8fb99cSAlex Elder }
46608b8fb99cSAlex Elder 
46618b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
46628b8fb99cSAlex Elder {
46638b8fb99cSAlex Elder 	struct rbd_spec *spec;
46648b8fb99cSAlex Elder 
46658b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
46668b8fb99cSAlex Elder 	if (!spec)
46678b8fb99cSAlex Elder 		return NULL;
466804077599SIlya Dryomov 
466904077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
467004077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
46718b8fb99cSAlex Elder 	kref_init(&spec->kref);
46728b8fb99cSAlex Elder 
46738b8fb99cSAlex Elder 	return spec;
46748b8fb99cSAlex Elder }
46758b8fb99cSAlex Elder 
46768b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
46778b8fb99cSAlex Elder {
46788b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
46798b8fb99cSAlex Elder 
46808b8fb99cSAlex Elder 	kfree(spec->pool_name);
46818b8fb99cSAlex Elder 	kfree(spec->image_id);
46828b8fb99cSAlex Elder 	kfree(spec->image_name);
46838b8fb99cSAlex Elder 	kfree(spec->snap_name);
46848b8fb99cSAlex Elder 	kfree(spec);
46858b8fb99cSAlex Elder }
46868b8fb99cSAlex Elder 
46871643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4688dd5ac32dSIlya Dryomov {
468999d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4690ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4691dd5ac32dSIlya Dryomov 
4692c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
46936b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
46940d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4695c41d13a3SIlya Dryomov 
4696dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4697dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4698dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4699dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
47001643dfa4SIlya Dryomov }
47011643dfa4SIlya Dryomov 
47021643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
47031643dfa4SIlya Dryomov {
47041643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
47051643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
47061643dfa4SIlya Dryomov 
47071643dfa4SIlya Dryomov 	if (need_put) {
47081643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
47091643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
47101643dfa4SIlya Dryomov 	}
47111643dfa4SIlya Dryomov 
47121643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4713dd5ac32dSIlya Dryomov 
4714dd5ac32dSIlya Dryomov 	/*
4715dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4716dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4717dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4718dd5ac32dSIlya Dryomov 	 */
4719dd5ac32dSIlya Dryomov 	if (need_put)
4720dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4721dd5ac32dSIlya Dryomov }
4722dd5ac32dSIlya Dryomov 
47231643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
47241643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4725c53d5893SAlex Elder {
4726c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4727c53d5893SAlex Elder 
4728c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4729c53d5893SAlex Elder 	if (!rbd_dev)
4730c53d5893SAlex Elder 		return NULL;
4731c53d5893SAlex Elder 
4732c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4733c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4734c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4735c53d5893SAlex Elder 
47367e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4737c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4738431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4739c41d13a3SIlya Dryomov 
474099d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
474199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
474299d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
474399d16943SIlya Dryomov 
4744ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4745ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4746ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4747ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4748ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4749ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4750ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4751ed95b21aSIlya Dryomov 
4752dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4753dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4754dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4755dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4756dd5ac32dSIlya Dryomov 
4757c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4758d147543dSIlya Dryomov 	rbd_dev->spec = spec;
47590903e875SAlex Elder 
47601643dfa4SIlya Dryomov 	return rbd_dev;
47611643dfa4SIlya Dryomov }
47621643dfa4SIlya Dryomov 
4763dd5ac32dSIlya Dryomov /*
47641643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4765dd5ac32dSIlya Dryomov  */
47661643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
47671643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
47681643dfa4SIlya Dryomov 					 struct rbd_options *opts)
47691643dfa4SIlya Dryomov {
47701643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
47711643dfa4SIlya Dryomov 
47721643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
47731643dfa4SIlya Dryomov 	if (!rbd_dev)
47741643dfa4SIlya Dryomov 		return NULL;
47751643dfa4SIlya Dryomov 
47761643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
47771643dfa4SIlya Dryomov 
47781643dfa4SIlya Dryomov 	/* get an id and fill in device name */
47791643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
47801643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
47811643dfa4SIlya Dryomov 					 GFP_KERNEL);
47821643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
47831643dfa4SIlya Dryomov 		goto fail_rbd_dev;
47841643dfa4SIlya Dryomov 
47851643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
47861643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
47871643dfa4SIlya Dryomov 						   rbd_dev->name);
47881643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
47891643dfa4SIlya Dryomov 		goto fail_dev_id;
47901643dfa4SIlya Dryomov 
47911643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4792dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4793dd5ac32dSIlya Dryomov 
47941643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4795c53d5893SAlex Elder 	return rbd_dev;
47961643dfa4SIlya Dryomov 
47971643dfa4SIlya Dryomov fail_dev_id:
47981643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
47991643dfa4SIlya Dryomov fail_rbd_dev:
48001643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
48011643dfa4SIlya Dryomov 	return NULL;
4802c53d5893SAlex Elder }
4803c53d5893SAlex Elder 
4804c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4805c53d5893SAlex Elder {
4806dd5ac32dSIlya Dryomov 	if (rbd_dev)
4807dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4808c53d5893SAlex Elder }
4809c53d5893SAlex Elder 
4810dfc5606dSYehuda Sadeh /*
48119d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
48129d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
48139d475de5SAlex Elder  * image.
48149d475de5SAlex Elder  */
48159d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
48169d475de5SAlex Elder 				u8 *order, u64 *snap_size)
48179d475de5SAlex Elder {
48189d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
48199d475de5SAlex Elder 	int ret;
48209d475de5SAlex Elder 	struct {
48219d475de5SAlex Elder 		u8 order;
48229d475de5SAlex Elder 		__le64 size;
48239d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
48249d475de5SAlex Elder 
4825ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4826ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
48274157976bSAlex Elder 				  &snapid, sizeof(snapid),
4828e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
482936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
48309d475de5SAlex Elder 	if (ret < 0)
48319d475de5SAlex Elder 		return ret;
483257385b51SAlex Elder 	if (ret < sizeof (size_buf))
483357385b51SAlex Elder 		return -ERANGE;
48349d475de5SAlex Elder 
4835c3545579SJosh Durgin 	if (order) {
48369d475de5SAlex Elder 		*order = size_buf.order;
4837c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4838c3545579SJosh Durgin 	}
48399d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
48409d475de5SAlex Elder 
4841c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4842c3545579SJosh Durgin 		(unsigned long long)snap_id,
48439d475de5SAlex Elder 		(unsigned long long)*snap_size);
48449d475de5SAlex Elder 
48459d475de5SAlex Elder 	return 0;
48469d475de5SAlex Elder }
48479d475de5SAlex Elder 
48489d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
48499d475de5SAlex Elder {
48509d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
48519d475de5SAlex Elder 					&rbd_dev->header.obj_order,
48529d475de5SAlex Elder 					&rbd_dev->header.image_size);
48539d475de5SAlex Elder }
48549d475de5SAlex Elder 
48551e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
48561e130199SAlex Elder {
48571e130199SAlex Elder 	void *reply_buf;
48581e130199SAlex Elder 	int ret;
48591e130199SAlex Elder 	void *p;
48601e130199SAlex Elder 
48611e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
48621e130199SAlex Elder 	if (!reply_buf)
48631e130199SAlex Elder 		return -ENOMEM;
48641e130199SAlex Elder 
4865ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4866ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4867ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
486836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
48691e130199SAlex Elder 	if (ret < 0)
48701e130199SAlex Elder 		goto out;
48711e130199SAlex Elder 
48721e130199SAlex Elder 	p = reply_buf;
48731e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
487457385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
487557385b51SAlex Elder 	ret = 0;
48761e130199SAlex Elder 
48771e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
48781e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
48791e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
48801e130199SAlex Elder 	} else {
48811e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
48821e130199SAlex Elder 	}
48831e130199SAlex Elder out:
48841e130199SAlex Elder 	kfree(reply_buf);
48851e130199SAlex Elder 
48861e130199SAlex Elder 	return ret;
48871e130199SAlex Elder }
48881e130199SAlex Elder 
4889b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4890b1b5402aSAlex Elder 		u64 *snap_features)
4891b1b5402aSAlex Elder {
4892b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4893b1b5402aSAlex Elder 	struct {
4894b1b5402aSAlex Elder 		__le64 features;
4895b1b5402aSAlex Elder 		__le64 incompat;
48964157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4897d3767f0fSIlya Dryomov 	u64 unsup;
4898b1b5402aSAlex Elder 	int ret;
4899b1b5402aSAlex Elder 
4900ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4901ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
49024157976bSAlex Elder 				  &snapid, sizeof(snapid),
4903e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
490436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4905b1b5402aSAlex Elder 	if (ret < 0)
4906b1b5402aSAlex Elder 		return ret;
490757385b51SAlex Elder 	if (ret < sizeof (features_buf))
490857385b51SAlex Elder 		return -ERANGE;
4909d889140cSAlex Elder 
4910d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4911d3767f0fSIlya Dryomov 	if (unsup) {
4912d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4913d3767f0fSIlya Dryomov 			 unsup);
4914b8f5c6edSAlex Elder 		return -ENXIO;
4915d3767f0fSIlya Dryomov 	}
4916d889140cSAlex Elder 
4917b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4918b1b5402aSAlex Elder 
4919b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4920b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4921b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4922b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4923b1b5402aSAlex Elder 
4924b1b5402aSAlex Elder 	return 0;
4925b1b5402aSAlex Elder }
4926b1b5402aSAlex Elder 
4927b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4928b1b5402aSAlex Elder {
4929b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4930b1b5402aSAlex Elder 						&rbd_dev->header.features);
4931b1b5402aSAlex Elder }
4932b1b5402aSAlex Elder 
493386b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
493486b00e0dSAlex Elder {
493586b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
493686b00e0dSAlex Elder 	size_t size;
493786b00e0dSAlex Elder 	void *reply_buf = NULL;
493886b00e0dSAlex Elder 	__le64 snapid;
493986b00e0dSAlex Elder 	void *p;
494086b00e0dSAlex Elder 	void *end;
4941642a2537SAlex Elder 	u64 pool_id;
494286b00e0dSAlex Elder 	char *image_id;
49433b5cf2a2SAlex Elder 	u64 snap_id;
494486b00e0dSAlex Elder 	u64 overlap;
494586b00e0dSAlex Elder 	int ret;
494686b00e0dSAlex Elder 
494786b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
494886b00e0dSAlex Elder 	if (!parent_spec)
494986b00e0dSAlex Elder 		return -ENOMEM;
495086b00e0dSAlex Elder 
495186b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
495286b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
495386b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
495486b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
495586b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
495686b00e0dSAlex Elder 	if (!reply_buf) {
495786b00e0dSAlex Elder 		ret = -ENOMEM;
495886b00e0dSAlex Elder 		goto out_err;
495986b00e0dSAlex Elder 	}
496086b00e0dSAlex Elder 
49614d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
4962ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4963ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_parent",
4964ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
496536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
496686b00e0dSAlex Elder 	if (ret < 0)
496786b00e0dSAlex Elder 		goto out_err;
496886b00e0dSAlex Elder 
496986b00e0dSAlex Elder 	p = reply_buf;
497057385b51SAlex Elder 	end = reply_buf + ret;
497157385b51SAlex Elder 	ret = -ERANGE;
4972642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4973392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4974392a9dadSAlex Elder 		/*
4975392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4976392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4977392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4978392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4979392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4980392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4981392a9dadSAlex Elder 		 * parent.
4982392a9dadSAlex Elder 		 */
4983392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4984392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4985392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4986392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4987392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4988392a9dadSAlex Elder 		}
4989392a9dadSAlex Elder 
499086b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4991392a9dadSAlex Elder 	}
499286b00e0dSAlex Elder 
49930903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49940903e875SAlex Elder 
49950903e875SAlex Elder 	ret = -EIO;
4996642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
49979584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4998642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
499957385b51SAlex Elder 		goto out_err;
5000c0cd10dbSAlex Elder 	}
50010903e875SAlex Elder 
5002979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
500386b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
500486b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
500586b00e0dSAlex Elder 		goto out_err;
500686b00e0dSAlex Elder 	}
50073b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
500886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
500986b00e0dSAlex Elder 
50103b5cf2a2SAlex Elder 	/*
50113b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
50123b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
50133b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
50143b5cf2a2SAlex Elder 	 */
50153b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
50163b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
50173b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
50183b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
501986b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
502086b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
5021fbba11b3SIlya Dryomov 	} else {
5022fbba11b3SIlya Dryomov 		kfree(image_id);
50233b5cf2a2SAlex Elder 	}
50243b5cf2a2SAlex Elder 
50253b5cf2a2SAlex Elder 	/*
5026cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5027cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
50283b5cf2a2SAlex Elder 	 */
50293b5cf2a2SAlex Elder 	if (!overlap) {
50303b5cf2a2SAlex Elder 		if (parent_spec) {
5031cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5032cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5033cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5034cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
503570cf49cfSAlex Elder 		} else {
5036cf32bd9cSIlya Dryomov 			/* initial probe */
5037cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
50383b5cf2a2SAlex Elder 		}
503970cf49cfSAlex Elder 	}
5040cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
5041cf32bd9cSIlya Dryomov 
504286b00e0dSAlex Elder out:
504386b00e0dSAlex Elder 	ret = 0;
504486b00e0dSAlex Elder out_err:
504586b00e0dSAlex Elder 	kfree(reply_buf);
504686b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
504786b00e0dSAlex Elder 
504886b00e0dSAlex Elder 	return ret;
504986b00e0dSAlex Elder }
505086b00e0dSAlex Elder 
5051cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5052cc070d59SAlex Elder {
5053cc070d59SAlex Elder 	struct {
5054cc070d59SAlex Elder 		__le64 stripe_unit;
5055cc070d59SAlex Elder 		__le64 stripe_count;
5056cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5057cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5058cc070d59SAlex Elder 	void *p;
5059cc070d59SAlex Elder 	u64 obj_size;
5060cc070d59SAlex Elder 	u64 stripe_unit;
5061cc070d59SAlex Elder 	u64 stripe_count;
5062cc070d59SAlex Elder 	int ret;
5063cc070d59SAlex Elder 
5064ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5065ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5066ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5067cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5068cc070d59SAlex Elder 	if (ret < 0)
5069cc070d59SAlex Elder 		return ret;
5070cc070d59SAlex Elder 	if (ret < size)
5071cc070d59SAlex Elder 		return -ERANGE;
5072cc070d59SAlex Elder 
5073cc070d59SAlex Elder 	/*
5074cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
5075cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
5076cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
5077cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
5078cc070d59SAlex Elder 	 */
5079cc070d59SAlex Elder 	ret = -EINVAL;
50805bc3fb17SIlya Dryomov 	obj_size = rbd_obj_bytes(&rbd_dev->header);
5081cc070d59SAlex Elder 	p = &striping_info_buf;
5082cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
5083cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
5084cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
5085cc070d59SAlex Elder 				"(got %llu want %llu)",
5086cc070d59SAlex Elder 				stripe_unit, obj_size);
5087cc070d59SAlex Elder 		return -EINVAL;
5088cc070d59SAlex Elder 	}
5089cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
5090cc070d59SAlex Elder 	if (stripe_count != 1) {
5091cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
5092cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
5093cc070d59SAlex Elder 		return -EINVAL;
5094cc070d59SAlex Elder 	}
5095500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
5096500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
5097cc070d59SAlex Elder 
5098cc070d59SAlex Elder 	return 0;
5099cc070d59SAlex Elder }
5100cc070d59SAlex Elder 
51017e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
51027e97332eSIlya Dryomov {
51037e97332eSIlya Dryomov 	__le64 data_pool_id;
51047e97332eSIlya Dryomov 	int ret;
51057e97332eSIlya Dryomov 
51067e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
51077e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
51087e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
51097e97332eSIlya Dryomov 	if (ret < 0)
51107e97332eSIlya Dryomov 		return ret;
51117e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
51127e97332eSIlya Dryomov 		return -EBADMSG;
51137e97332eSIlya Dryomov 
51147e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
51157e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
51167e97332eSIlya Dryomov 	return 0;
51177e97332eSIlya Dryomov }
51187e97332eSIlya Dryomov 
51199e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
51209e15b77dSAlex Elder {
5121ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
51229e15b77dSAlex Elder 	size_t image_id_size;
51239e15b77dSAlex Elder 	char *image_id;
51249e15b77dSAlex Elder 	void *p;
51259e15b77dSAlex Elder 	void *end;
51269e15b77dSAlex Elder 	size_t size;
51279e15b77dSAlex Elder 	void *reply_buf = NULL;
51289e15b77dSAlex Elder 	size_t len = 0;
51299e15b77dSAlex Elder 	char *image_name = NULL;
51309e15b77dSAlex Elder 	int ret;
51319e15b77dSAlex Elder 
51329e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
51339e15b77dSAlex Elder 
513469e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
513569e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
51369e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
51379e15b77dSAlex Elder 	if (!image_id)
51389e15b77dSAlex Elder 		return NULL;
51399e15b77dSAlex Elder 
51409e15b77dSAlex Elder 	p = image_id;
51414157976bSAlex Elder 	end = image_id + image_id_size;
514269e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
51439e15b77dSAlex Elder 
51449e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
51459e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
51469e15b77dSAlex Elder 	if (!reply_buf)
51479e15b77dSAlex Elder 		goto out;
51489e15b77dSAlex Elder 
5149ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5150ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5151ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5152e2a58ee5SAlex Elder 				  reply_buf, size);
51539e15b77dSAlex Elder 	if (ret < 0)
51549e15b77dSAlex Elder 		goto out;
51559e15b77dSAlex Elder 	p = reply_buf;
5156f40eb349SAlex Elder 	end = reply_buf + ret;
5157f40eb349SAlex Elder 
51589e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
51599e15b77dSAlex Elder 	if (IS_ERR(image_name))
51609e15b77dSAlex Elder 		image_name = NULL;
51619e15b77dSAlex Elder 	else
51629e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
51639e15b77dSAlex Elder out:
51649e15b77dSAlex Elder 	kfree(reply_buf);
51659e15b77dSAlex Elder 	kfree(image_id);
51669e15b77dSAlex Elder 
51679e15b77dSAlex Elder 	return image_name;
51689e15b77dSAlex Elder }
51699e15b77dSAlex Elder 
51702ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51712ad3d716SAlex Elder {
51722ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51732ad3d716SAlex Elder 	const char *snap_name;
51742ad3d716SAlex Elder 	u32 which = 0;
51752ad3d716SAlex Elder 
51762ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
51772ad3d716SAlex Elder 
51782ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
51792ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
51802ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
51812ad3d716SAlex Elder 			return snapc->snaps[which];
51822ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
51832ad3d716SAlex Elder 		which++;
51842ad3d716SAlex Elder 	}
51852ad3d716SAlex Elder 	return CEPH_NOSNAP;
51862ad3d716SAlex Elder }
51872ad3d716SAlex Elder 
51882ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51892ad3d716SAlex Elder {
51902ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51912ad3d716SAlex Elder 	u32 which;
51922ad3d716SAlex Elder 	bool found = false;
51932ad3d716SAlex Elder 	u64 snap_id;
51942ad3d716SAlex Elder 
51952ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
51962ad3d716SAlex Elder 		const char *snap_name;
51972ad3d716SAlex Elder 
51982ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
51992ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5200efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5201efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5202efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5203efadc98aSJosh Durgin 				continue;
5204efadc98aSJosh Durgin 			else
52052ad3d716SAlex Elder 				break;
5206efadc98aSJosh Durgin 		}
52072ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
52082ad3d716SAlex Elder 		kfree(snap_name);
52092ad3d716SAlex Elder 	}
52102ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
52112ad3d716SAlex Elder }
52122ad3d716SAlex Elder 
52132ad3d716SAlex Elder /*
52142ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
52152ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
52162ad3d716SAlex Elder  */
52172ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52182ad3d716SAlex Elder {
52192ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
52202ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
52212ad3d716SAlex Elder 
52222ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
52232ad3d716SAlex Elder }
52242ad3d716SAlex Elder 
52259e15b77dSAlex Elder /*
522604077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
52279e15b77dSAlex Elder  */
522804077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
522904077599SIlya Dryomov {
523004077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
523104077599SIlya Dryomov 
523204077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
523304077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
523404077599SIlya Dryomov 	rbd_assert(spec->snap_name);
523504077599SIlya Dryomov 
523604077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
523704077599SIlya Dryomov 		u64 snap_id;
523804077599SIlya Dryomov 
523904077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
524004077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
524104077599SIlya Dryomov 			return -ENOENT;
524204077599SIlya Dryomov 
524304077599SIlya Dryomov 		spec->snap_id = snap_id;
524404077599SIlya Dryomov 	} else {
524504077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
524604077599SIlya Dryomov 	}
524704077599SIlya Dryomov 
524804077599SIlya Dryomov 	return 0;
524904077599SIlya Dryomov }
525004077599SIlya Dryomov 
525104077599SIlya Dryomov /*
525204077599SIlya Dryomov  * A parent image will have all ids but none of the names.
525304077599SIlya Dryomov  *
525404077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
525504077599SIlya Dryomov  * can't figure out the name for an image id.
525604077599SIlya Dryomov  */
525704077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
52589e15b77dSAlex Elder {
52592e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
52602e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
52612e9f7f1cSAlex Elder 	const char *pool_name;
52622e9f7f1cSAlex Elder 	const char *image_name;
52632e9f7f1cSAlex Elder 	const char *snap_name;
52649e15b77dSAlex Elder 	int ret;
52659e15b77dSAlex Elder 
526604077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
526704077599SIlya Dryomov 	rbd_assert(spec->image_id);
526804077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
52699e15b77dSAlex Elder 
52702e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
52719e15b77dSAlex Elder 
52722e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
52732e9f7f1cSAlex Elder 	if (!pool_name) {
52742e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5275935dc89fSAlex Elder 		return -EIO;
5276935dc89fSAlex Elder 	}
52772e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
52782e9f7f1cSAlex Elder 	if (!pool_name)
52799e15b77dSAlex Elder 		return -ENOMEM;
52809e15b77dSAlex Elder 
52819e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
52829e15b77dSAlex Elder 
52832e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
52842e9f7f1cSAlex Elder 	if (!image_name)
528506ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
52869e15b77dSAlex Elder 
528704077599SIlya Dryomov 	/* Fetch the snapshot name */
52889e15b77dSAlex Elder 
52892e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5290da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5291da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
52929e15b77dSAlex Elder 		goto out_err;
52932e9f7f1cSAlex Elder 	}
52942e9f7f1cSAlex Elder 
52952e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
52962e9f7f1cSAlex Elder 	spec->image_name = image_name;
52972e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
52989e15b77dSAlex Elder 
52999e15b77dSAlex Elder 	return 0;
530004077599SIlya Dryomov 
53019e15b77dSAlex Elder out_err:
53022e9f7f1cSAlex Elder 	kfree(image_name);
53032e9f7f1cSAlex Elder 	kfree(pool_name);
53049e15b77dSAlex Elder 	return ret;
53059e15b77dSAlex Elder }
53069e15b77dSAlex Elder 
5307cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
530835d489f9SAlex Elder {
530935d489f9SAlex Elder 	size_t size;
531035d489f9SAlex Elder 	int ret;
531135d489f9SAlex Elder 	void *reply_buf;
531235d489f9SAlex Elder 	void *p;
531335d489f9SAlex Elder 	void *end;
531435d489f9SAlex Elder 	u64 seq;
531535d489f9SAlex Elder 	u32 snap_count;
531635d489f9SAlex Elder 	struct ceph_snap_context *snapc;
531735d489f9SAlex Elder 	u32 i;
531835d489f9SAlex Elder 
531935d489f9SAlex Elder 	/*
532035d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
532135d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
532235d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
532335d489f9SAlex Elder 	 * prepared to receive.
532435d489f9SAlex Elder 	 */
532535d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
532635d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
532735d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
532835d489f9SAlex Elder 	if (!reply_buf)
532935d489f9SAlex Elder 		return -ENOMEM;
533035d489f9SAlex Elder 
5331ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5332ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
5333ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
533436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
533535d489f9SAlex Elder 	if (ret < 0)
533635d489f9SAlex Elder 		goto out;
533735d489f9SAlex Elder 
533835d489f9SAlex Elder 	p = reply_buf;
533957385b51SAlex Elder 	end = reply_buf + ret;
534057385b51SAlex Elder 	ret = -ERANGE;
534135d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
534235d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
534335d489f9SAlex Elder 
534435d489f9SAlex Elder 	/*
534535d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
534635d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
534735d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
534835d489f9SAlex Elder 	 * allocate is representable in a size_t.
534935d489f9SAlex Elder 	 */
535035d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
535135d489f9SAlex Elder 				 / sizeof (u64)) {
535235d489f9SAlex Elder 		ret = -EINVAL;
535335d489f9SAlex Elder 		goto out;
535435d489f9SAlex Elder 	}
535535d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
535635d489f9SAlex Elder 		goto out;
5357468521c1SAlex Elder 	ret = 0;
535835d489f9SAlex Elder 
5359812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
536035d489f9SAlex Elder 	if (!snapc) {
536135d489f9SAlex Elder 		ret = -ENOMEM;
536235d489f9SAlex Elder 		goto out;
536335d489f9SAlex Elder 	}
536435d489f9SAlex Elder 	snapc->seq = seq;
536535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
536635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
536735d489f9SAlex Elder 
536849ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
536935d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
537035d489f9SAlex Elder 
537135d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
537235d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
537335d489f9SAlex Elder out:
537435d489f9SAlex Elder 	kfree(reply_buf);
537535d489f9SAlex Elder 
537657385b51SAlex Elder 	return ret;
537735d489f9SAlex Elder }
537835d489f9SAlex Elder 
537954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
538054cac61fSAlex Elder 					u64 snap_id)
5381b8b1e2dbSAlex Elder {
5382b8b1e2dbSAlex Elder 	size_t size;
5383b8b1e2dbSAlex Elder 	void *reply_buf;
538454cac61fSAlex Elder 	__le64 snapid;
5385b8b1e2dbSAlex Elder 	int ret;
5386b8b1e2dbSAlex Elder 	void *p;
5387b8b1e2dbSAlex Elder 	void *end;
5388b8b1e2dbSAlex Elder 	char *snap_name;
5389b8b1e2dbSAlex Elder 
5390b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5391b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5392b8b1e2dbSAlex Elder 	if (!reply_buf)
5393b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5394b8b1e2dbSAlex Elder 
539554cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5396ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5397ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5398ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
539936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5400f40eb349SAlex Elder 	if (ret < 0) {
5401f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5402b8b1e2dbSAlex Elder 		goto out;
5403f40eb349SAlex Elder 	}
5404b8b1e2dbSAlex Elder 
5405b8b1e2dbSAlex Elder 	p = reply_buf;
5406f40eb349SAlex Elder 	end = reply_buf + ret;
5407e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5408f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5409b8b1e2dbSAlex Elder 		goto out;
5410f40eb349SAlex Elder 
5411b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
541254cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5413b8b1e2dbSAlex Elder out:
5414b8b1e2dbSAlex Elder 	kfree(reply_buf);
5415b8b1e2dbSAlex Elder 
5416f40eb349SAlex Elder 	return snap_name;
5417b8b1e2dbSAlex Elder }
5418b8b1e2dbSAlex Elder 
54192df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5420117973fbSAlex Elder {
54212df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5422117973fbSAlex Elder 	int ret;
5423117973fbSAlex Elder 
54241617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
54251617e40cSJosh Durgin 	if (ret)
5426cfbf6377SAlex Elder 		return ret;
54271617e40cSJosh Durgin 
54282df3fac7SAlex Elder 	if (first_time) {
54292df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
54302df3fac7SAlex Elder 		if (ret)
5431cfbf6377SAlex Elder 			return ret;
54322df3fac7SAlex Elder 	}
54332df3fac7SAlex Elder 
5434cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5435d194cd1dSIlya Dryomov 	if (ret && first_time) {
5436d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5437d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5438d194cd1dSIlya Dryomov 	}
5439117973fbSAlex Elder 
5440117973fbSAlex Elder 	return ret;
5441117973fbSAlex Elder }
5442117973fbSAlex Elder 
5443a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5444a720ae09SIlya Dryomov {
5445a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5446a720ae09SIlya Dryomov 
5447a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5448a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5449a720ae09SIlya Dryomov 
5450a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5451a720ae09SIlya Dryomov }
5452a720ae09SIlya Dryomov 
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and returns the length of the token -- the run of
 * non-white space characters -- that starts there.  *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
5471e28fff26SAlex Elder 
5472e28fff26SAlex Elder /*
5473ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5474ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5475ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5476ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5477ea3352f4SAlex Elder  *
5478ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5479ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5480ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5481ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5482ea3352f4SAlex Elder  *
5483ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5484ea3352f4SAlex Elder  * the end of the found token.
5485ea3352f4SAlex Elder  *
5486ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5487ea3352f4SAlex Elder  */
5488ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5489ea3352f4SAlex Elder {
5490ea3352f4SAlex Elder 	char *dup;
5491ea3352f4SAlex Elder 	size_t len;
5492ea3352f4SAlex Elder 
5493ea3352f4SAlex Elder 	len = next_token(buf);
54944caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5495ea3352f4SAlex Elder 	if (!dup)
5496ea3352f4SAlex Elder 		return NULL;
5497ea3352f4SAlex Elder 	*(dup + len) = '\0';
5498ea3352f4SAlex Elder 	*buf += len;
5499ea3352f4SAlex Elder 
5500ea3352f4SAlex Elder 	if (lenp)
5501ea3352f4SAlex Elder 		*lenp = len;
5502ea3352f4SAlex Elder 
5503ea3352f4SAlex Elder 	return dup;
5504ea3352f4SAlex Elder }
5505ea3352f4SAlex Elder 
5506ea3352f4SAlex Elder /*
5507859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5508859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5509859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5510859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5511d22f76e7SAlex Elder  *
5512859c31dfSAlex Elder  * The information extracted from these options is recorded in
5513859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5514859c31dfSAlex Elder  * structures:
5515859c31dfSAlex Elder  *  ceph_opts
5516859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5517859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5518859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5519859c31dfSAlex Elder  *  rbd_opts
5520859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5521859c31dfSAlex Elder  *	this function; caller must release with kfree().
5522859c31dfSAlex Elder  *  spec
5523859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5524859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5525859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5526859c31dfSAlex Elder  *
5527859c31dfSAlex Elder  * The options passed take this form:
5528859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5529859c31dfSAlex Elder  * where:
5530859c31dfSAlex Elder  *  <mon_addrs>
5531859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
5532859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
5533859c31dfSAlex Elder  *      by a port number (separated by a colon).
5534859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
5535859c31dfSAlex Elder  *  <options>
5536859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
5537859c31dfSAlex Elder  *  <pool_name>
5538859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
5539859c31dfSAlex Elder  *  <image_name>
5540859c31dfSAlex Elder  *      The name of the image in that pool to map.
5541859c31dfSAlex Elder  *  <snap_id>
5542859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
5543859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
5544859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
5545859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
5546a725f65eSAlex Elder  */
5547859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5548dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5549859c31dfSAlex Elder 				struct rbd_options **opts,
5550859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5551a725f65eSAlex Elder {
5552e28fff26SAlex Elder 	size_t len;
5553859c31dfSAlex Elder 	char *options;
55540ddebc0cSAlex Elder 	const char *mon_addrs;
5555ecb4dc22SAlex Elder 	char *snap_name;
55560ddebc0cSAlex Elder 	size_t mon_addrs_size;
5557859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
55584e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5559859c31dfSAlex Elder 	struct ceph_options *copts;
5560dc79b113SAlex Elder 	int ret;
5561e28fff26SAlex Elder 
5562e28fff26SAlex Elder 	/* The first four tokens are required */
5563e28fff26SAlex Elder 
55647ef3214aSAlex Elder 	len = next_token(&buf);
55654fb5d671SAlex Elder 	if (!len) {
55664fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
55674fb5d671SAlex Elder 		return -EINVAL;
55684fb5d671SAlex Elder 	}
55690ddebc0cSAlex Elder 	mon_addrs = buf;
5570f28e565aSAlex Elder 	mon_addrs_size = len + 1;
55717ef3214aSAlex Elder 	buf += len;
5572a725f65eSAlex Elder 
5573dc79b113SAlex Elder 	ret = -EINVAL;
5574f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5575f28e565aSAlex Elder 	if (!options)
5576dc79b113SAlex Elder 		return -ENOMEM;
55774fb5d671SAlex Elder 	if (!*options) {
55784fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
55794fb5d671SAlex Elder 		goto out_err;
55804fb5d671SAlex Elder 	}
5581a725f65eSAlex Elder 
5582859c31dfSAlex Elder 	spec = rbd_spec_alloc();
5583859c31dfSAlex Elder 	if (!spec)
5584f28e565aSAlex Elder 		goto out_mem;
5585859c31dfSAlex Elder 
5586859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
5587859c31dfSAlex Elder 	if (!spec->pool_name)
5588859c31dfSAlex Elder 		goto out_mem;
55894fb5d671SAlex Elder 	if (!*spec->pool_name) {
55904fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
55914fb5d671SAlex Elder 		goto out_err;
55924fb5d671SAlex Elder 	}
5593e28fff26SAlex Elder 
559469e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
5595859c31dfSAlex Elder 	if (!spec->image_name)
5596f28e565aSAlex Elder 		goto out_mem;
55974fb5d671SAlex Elder 	if (!*spec->image_name) {
55984fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
55994fb5d671SAlex Elder 		goto out_err;
56004fb5d671SAlex Elder 	}
5601e28fff26SAlex Elder 
5602f28e565aSAlex Elder 	/*
5603f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5604f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5605f28e565aSAlex Elder 	 */
56063feeb894SAlex Elder 	len = next_token(&buf);
5607820a5f3eSAlex Elder 	if (!len) {
56083feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
56093feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5610f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5611dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5612f28e565aSAlex Elder 		goto out_err;
5613849b4260SAlex Elder 	}
5614ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5615ecb4dc22SAlex Elder 	if (!snap_name)
5616f28e565aSAlex Elder 		goto out_mem;
5617ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5618ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
5619e5c35534SAlex Elder 
56200ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5621e28fff26SAlex Elder 
56224e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
56234e9afebaSAlex Elder 	if (!rbd_opts)
56244e9afebaSAlex Elder 		goto out_mem;
56254e9afebaSAlex Elder 
56264e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5627b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
562880de1912SIlya Dryomov 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5629d22f76e7SAlex Elder 
5630859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
56310ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
56324e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
5633859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5634859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5635dc79b113SAlex Elder 		goto out_err;
5636dc79b113SAlex Elder 	}
5637859c31dfSAlex Elder 	kfree(options);
5638859c31dfSAlex Elder 
5639859c31dfSAlex Elder 	*ceph_opts = copts;
56404e9afebaSAlex Elder 	*opts = rbd_opts;
5641859c31dfSAlex Elder 	*rbd_spec = spec;
56420ddebc0cSAlex Elder 
5643dc79b113SAlex Elder 	return 0;
5644f28e565aSAlex Elder out_mem:
5645dc79b113SAlex Elder 	ret = -ENOMEM;
5646d22f76e7SAlex Elder out_err:
5647859c31dfSAlex Elder 	kfree(rbd_opts);
5648859c31dfSAlex Elder 	rbd_spec_put(spec);
5649f28e565aSAlex Elder 	kfree(options);
5650d22f76e7SAlex Elder 
5651dc79b113SAlex Elder 	return ret;
5652a725f65eSAlex Elder }
5653a725f65eSAlex Elder 
5654589d30e0SAlex Elder /*
565530ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
565630ba1f02SIlya Dryomov  */
565730ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
565830ba1f02SIlya Dryomov {
5659a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
566030ba1f02SIlya Dryomov 	u64 newest_epoch;
566130ba1f02SIlya Dryomov 	int tries = 0;
566230ba1f02SIlya Dryomov 	int ret;
566330ba1f02SIlya Dryomov 
566430ba1f02SIlya Dryomov again:
566530ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
566630ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
5667d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
566830ba1f02SIlya Dryomov 					    &newest_epoch);
566930ba1f02SIlya Dryomov 		if (ret < 0)
567030ba1f02SIlya Dryomov 			return ret;
567130ba1f02SIlya Dryomov 
567230ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
56737cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
567430ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5675a319bf56SIlya Dryomov 						     newest_epoch,
5676a319bf56SIlya Dryomov 						     opts->mount_timeout);
567730ba1f02SIlya Dryomov 			goto again;
567830ba1f02SIlya Dryomov 		} else {
567930ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
568030ba1f02SIlya Dryomov 			return -ENOENT;
568130ba1f02SIlya Dryomov 		}
568230ba1f02SIlya Dryomov 	}
568330ba1f02SIlya Dryomov 
568430ba1f02SIlya Dryomov 	return ret;
568530ba1f02SIlya Dryomov }
568630ba1f02SIlya Dryomov 
568730ba1f02SIlya Dryomov /*
5688589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5689589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5690589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5691589d30e0SAlex Elder  *
5692589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5693589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5694589d30e0SAlex Elder  * with the supplied name.
5695589d30e0SAlex Elder  *
5696589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5697589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5698589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5699589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5700589d30e0SAlex Elder  */
5701589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5702589d30e0SAlex Elder {
5703589d30e0SAlex Elder 	int ret;
5704589d30e0SAlex Elder 	size_t size;
5705ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5706589d30e0SAlex Elder 	void *response;
5707c0fba368SAlex Elder 	char *image_id;
57082f82ee54SAlex Elder 
5709589d30e0SAlex Elder 	/*
57102c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
57112c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5712c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5713c0fba368SAlex Elder 	 * do still need to set the image format though.
57142c0d0a10SAlex Elder 	 */
5715c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5716c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5717c0fba368SAlex Elder 
57182c0d0a10SAlex Elder 		return 0;
5719c0fba368SAlex Elder 	}
57202c0d0a10SAlex Elder 
57212c0d0a10SAlex Elder 	/*
5722589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5723589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5724589d30e0SAlex Elder 	 */
5725ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5726ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5727ecd4a68aSIlya Dryomov 	if (ret)
5728ecd4a68aSIlya Dryomov 		return ret;
5729ecd4a68aSIlya Dryomov 
5730ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5731589d30e0SAlex Elder 
5732589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5733589d30e0SAlex Elder 
5734589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5735589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5736589d30e0SAlex Elder 	if (!response) {
5737589d30e0SAlex Elder 		ret = -ENOMEM;
5738589d30e0SAlex Elder 		goto out;
5739589d30e0SAlex Elder 	}
5740589d30e0SAlex Elder 
5741c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5742c0fba368SAlex Elder 
5743ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5744ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5745e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
574636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5747c0fba368SAlex Elder 	if (ret == -ENOENT) {
5748c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5749c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5750c0fba368SAlex Elder 		if (!ret)
5751c0fba368SAlex Elder 			rbd_dev->image_format = 1;
57527dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5753c0fba368SAlex Elder 		void *p = response;
5754589d30e0SAlex Elder 
5755c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5756979ed480SAlex Elder 						NULL, GFP_NOIO);
5757461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5758c0fba368SAlex Elder 		if (!ret)
5759c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5760c0fba368SAlex Elder 	}
5761c0fba368SAlex Elder 
5762c0fba368SAlex Elder 	if (!ret) {
5763c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5764c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5765589d30e0SAlex Elder 	}
5766589d30e0SAlex Elder out:
5767589d30e0SAlex Elder 	kfree(response);
5768ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5769589d30e0SAlex Elder 	return ret;
5770589d30e0SAlex Elder }
5771589d30e0SAlex Elder 
57723abef3b3SAlex Elder /*
57733abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
57743abef3b3SAlex Elder  * call.
57753abef3b3SAlex Elder  */
57766fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
57776fd48b3bSAlex Elder {
57786fd48b3bSAlex Elder 	struct rbd_image_header	*header;
57796fd48b3bSAlex Elder 
5780a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
57816fd48b3bSAlex Elder 
57826fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
57836fd48b3bSAlex Elder 
57846fd48b3bSAlex Elder 	header = &rbd_dev->header;
5785812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
57866fd48b3bSAlex Elder 	kfree(header->snap_sizes);
57876fd48b3bSAlex Elder 	kfree(header->snap_names);
57886fd48b3bSAlex Elder 	kfree(header->object_prefix);
57896fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
57906fd48b3bSAlex Elder }
57916fd48b3bSAlex Elder 
57922df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5793a30b71b9SAlex Elder {
5794a30b71b9SAlex Elder 	int ret;
5795a30b71b9SAlex Elder 
57961e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
579757385b51SAlex Elder 	if (ret)
57981e130199SAlex Elder 		goto out_err;
5799b1b5402aSAlex Elder 
58002df3fac7SAlex Elder 	/*
58012df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
58022df3fac7SAlex Elder 	 * features are assumed to never change.
58032df3fac7SAlex Elder 	 */
5804b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
580557385b51SAlex Elder 	if (ret)
5806b1b5402aSAlex Elder 		goto out_err;
580735d489f9SAlex Elder 
5808cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5809cc070d59SAlex Elder 
5810cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5811cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5812cc070d59SAlex Elder 		if (ret < 0)
5813cc070d59SAlex Elder 			goto out_err;
5814cc070d59SAlex Elder 	}
5815a30b71b9SAlex Elder 
58167e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
58177e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
58187e97332eSIlya Dryomov 		if (ret)
58197e97332eSIlya Dryomov 			goto out_err;
58207e97332eSIlya Dryomov 	}
58217e97332eSIlya Dryomov 
5822263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
582335152979SAlex Elder 	return 0;
5824263423f8SIlya Dryomov 
58259d475de5SAlex Elder out_err:
5826642a2537SAlex Elder 	rbd_dev->header.features = 0;
58271e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
58281e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
58299d475de5SAlex Elder 	return ret;
5830a30b71b9SAlex Elder }
5831a30b71b9SAlex Elder 
58326d69bb53SIlya Dryomov /*
58336d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
58346d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
58356d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
58366d69bb53SIlya Dryomov  */
58376d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
583883a06263SAlex Elder {
58392f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5840124afba2SAlex Elder 	int ret;
5841124afba2SAlex Elder 
5842124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5843124afba2SAlex Elder 		return 0;
5844124afba2SAlex Elder 
58456d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
58466d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
58476d69bb53SIlya Dryomov 		ret = -EINVAL;
58486d69bb53SIlya Dryomov 		goto out_err;
58496d69bb53SIlya Dryomov 	}
58506d69bb53SIlya Dryomov 
58511643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
58521f2c6651SIlya Dryomov 	if (!parent) {
5853124afba2SAlex Elder 		ret = -ENOMEM;
5854124afba2SAlex Elder 		goto out_err;
58551f2c6651SIlya Dryomov 	}
58561f2c6651SIlya Dryomov 
58571f2c6651SIlya Dryomov 	/*
58581f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
58591f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
58601f2c6651SIlya Dryomov 	 */
58611f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
58621f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5863124afba2SAlex Elder 
58646d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5865124afba2SAlex Elder 	if (ret < 0)
5866124afba2SAlex Elder 		goto out_err;
58671f2c6651SIlya Dryomov 
5868124afba2SAlex Elder 	rbd_dev->parent = parent;
5869a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5870124afba2SAlex Elder 	return 0;
5871124afba2SAlex Elder 
58721f2c6651SIlya Dryomov out_err:
58731f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
58741f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5875124afba2SAlex Elder 	return ret;
5876124afba2SAlex Elder }
5877124afba2SAlex Elder 
5878811c6688SIlya Dryomov /*
5879811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5880811c6688SIlya Dryomov  * upon return.
5881811c6688SIlya Dryomov  */
5882200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5883124afba2SAlex Elder {
588483a06263SAlex Elder 	int ret;
588583a06263SAlex Elder 
58869b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
588783a06263SAlex Elder 
58889b60e70bSIlya Dryomov 	if (!single_major) {
588983a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
589083a06263SAlex Elder 		if (ret < 0)
58911643dfa4SIlya Dryomov 			goto err_out_unlock;
58929b60e70bSIlya Dryomov 
589383a06263SAlex Elder 		rbd_dev->major = ret;
5894dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
58959b60e70bSIlya Dryomov 	} else {
58969b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
58979b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
58989b60e70bSIlya Dryomov 	}
589983a06263SAlex Elder 
590083a06263SAlex Elder 	/* Set up the blkdev mapping. */
590183a06263SAlex Elder 
590283a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
590383a06263SAlex Elder 	if (ret)
590483a06263SAlex Elder 		goto err_out_blkdev;
590583a06263SAlex Elder 
5906f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
590783a06263SAlex Elder 	if (ret)
590883a06263SAlex Elder 		goto err_out_disk;
5909bc1ecc65SIlya Dryomov 
5910f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
591122001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5912f35a4deeSAlex Elder 
5913dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5914dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
5915f35a4deeSAlex Elder 	if (ret)
5916f5ee37bdSIlya Dryomov 		goto err_out_mapping;
591783a06263SAlex Elder 
591883a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
591983a06263SAlex Elder 
5920129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5921811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
592283a06263SAlex Elder 
59231643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
59241643dfa4SIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
59251643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
59261643dfa4SIlya Dryomov 
5927811c6688SIlya Dryomov 	add_disk(rbd_dev->disk);
5928ca7909e8SIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5929ca7909e8SIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5930ca7909e8SIlya Dryomov 		rbd_dev->header.features);
593183a06263SAlex Elder 
593283a06263SAlex Elder 	return ret;
59332f82ee54SAlex Elder 
5934f35a4deeSAlex Elder err_out_mapping:
5935f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
593683a06263SAlex Elder err_out_disk:
593783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
593883a06263SAlex Elder err_out_blkdev:
59399b60e70bSIlya Dryomov 	if (!single_major)
594083a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5941811c6688SIlya Dryomov err_out_unlock:
5942811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
594383a06263SAlex Elder 	return ret;
594483a06263SAlex Elder }
594583a06263SAlex Elder 
5946332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5947332bb12dSAlex Elder {
5948332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5949c41d13a3SIlya Dryomov 	int ret;
5950332bb12dSAlex Elder 
5951332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5952332bb12dSAlex Elder 
5953332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5954332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5955c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5956332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5957332bb12dSAlex Elder 	else
5958c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5959332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5960c41d13a3SIlya Dryomov 
5961c41d13a3SIlya Dryomov 	return ret;
5962332bb12dSAlex Elder }
5963332bb12dSAlex Elder 
5964200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5965200a6a8bSAlex Elder {
59666fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
59676fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
59686fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
59696fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
5970200a6a8bSAlex Elder }
5971200a6a8bSAlex Elder 
5972a30b71b9SAlex Elder /*
5973a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
59741f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
59751f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
59761f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5977a30b71b9SAlex Elder  */
59786d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5979a30b71b9SAlex Elder {
5980a30b71b9SAlex Elder 	int ret;
5981a30b71b9SAlex Elder 
5982a30b71b9SAlex Elder 	/*
59833abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
59843abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
59853abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
59863abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5987a30b71b9SAlex Elder 	 */
5988a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5989a30b71b9SAlex Elder 	if (ret)
5990c0fba368SAlex Elder 		return ret;
5991c0fba368SAlex Elder 
5992332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5993332bb12dSAlex Elder 	if (ret)
5994332bb12dSAlex Elder 		goto err_out_format;
5995332bb12dSAlex Elder 
59966d69bb53SIlya Dryomov 	if (!depth) {
599799d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
59981fe48023SIlya Dryomov 		if (ret) {
59991fe48023SIlya Dryomov 			if (ret == -ENOENT)
60001fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
60011fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
60021fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6003c41d13a3SIlya Dryomov 			goto err_out_format;
60041f3ef788SAlex Elder 		}
60051fe48023SIlya Dryomov 	}
6006b644de2bSAlex Elder 
6007a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
60085655c4d9SAlex Elder 	if (ret)
6009b644de2bSAlex Elder 		goto err_out_watch;
6010a30b71b9SAlex Elder 
601104077599SIlya Dryomov 	/*
601204077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
601304077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
601404077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
601504077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
601604077599SIlya Dryomov 	 */
60176d69bb53SIlya Dryomov 	if (!depth)
601804077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
601904077599SIlya Dryomov 	else
602004077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
60211fe48023SIlya Dryomov 	if (ret) {
60221fe48023SIlya Dryomov 		if (ret == -ENOENT)
60231fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
60241fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
60251fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
60261fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
602733dca39fSAlex Elder 		goto err_out_probe;
60281fe48023SIlya Dryomov 	}
60299bb81c9bSAlex Elder 
6030e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6031e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6032e8f59b59SIlya Dryomov 		if (ret)
6033e8f59b59SIlya Dryomov 			goto err_out_probe;
6034e8f59b59SIlya Dryomov 
6035e8f59b59SIlya Dryomov 		/*
6036e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
6037e8f59b59SIlya Dryomov 		 * mapped and has a parent.
6038e8f59b59SIlya Dryomov 		 */
60396d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
6040e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
6041e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
6042e8f59b59SIlya Dryomov 	}
6043e8f59b59SIlya Dryomov 
60446d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
604530d60ba2SAlex Elder 	if (ret)
604630d60ba2SAlex Elder 		goto err_out_probe;
604783a06263SAlex Elder 
604830d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6049c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
605030d60ba2SAlex Elder 	return 0;
6051e8f59b59SIlya Dryomov 
60526fd48b3bSAlex Elder err_out_probe:
60536fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6054b644de2bSAlex Elder err_out_watch:
60556d69bb53SIlya Dryomov 	if (!depth)
605699d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6057332bb12dSAlex Elder err_out_format:
6058332bb12dSAlex Elder 	rbd_dev->image_format = 0;
60595655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
60605655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
60615655c4d9SAlex Elder 	return ret;
606283a06263SAlex Elder }
606383a06263SAlex Elder 
60649b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
606559c2be1eSYehuda Sadeh 			  const char *buf,
606659c2be1eSYehuda Sadeh 			  size_t count)
6067602adf40SYehuda Sadeh {
6068cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6069dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
60704e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6071859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
60729d3997fdSAlex Elder 	struct rbd_client *rbdc;
607351344a38SAlex Elder 	bool read_only;
6074b51c83c2SIlya Dryomov 	int rc;
6075602adf40SYehuda Sadeh 
6076602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6077602adf40SYehuda Sadeh 		return -ENODEV;
6078602adf40SYehuda Sadeh 
6079a725f65eSAlex Elder 	/* parse add command */
6080859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6081dc79b113SAlex Elder 	if (rc < 0)
6082dd5ac32dSIlya Dryomov 		goto out;
6083a725f65eSAlex Elder 
60849d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
60859d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
60869d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
60870ddebc0cSAlex Elder 		goto err_out_args;
60889d3997fdSAlex Elder 	}
6089602adf40SYehuda Sadeh 
6090602adf40SYehuda Sadeh 	/* pick the pool */
609130ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
60921fe48023SIlya Dryomov 	if (rc < 0) {
60931fe48023SIlya Dryomov 		if (rc == -ENOENT)
60941fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6095602adf40SYehuda Sadeh 		goto err_out_client;
60961fe48023SIlya Dryomov 	}
6097859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6098859c31dfSAlex Elder 
6099d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6100b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6101b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6102bd4ba655SAlex Elder 		goto err_out_client;
6103b51c83c2SIlya Dryomov 	}
6104c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6105c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6106d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6107602adf40SYehuda Sadeh 
61080d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
61090d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
61100d6d1e9cSMike Christie 		rc = -ENOMEM;
61110d6d1e9cSMike Christie 		goto err_out_rbd_dev;
61120d6d1e9cSMike Christie 	}
61130d6d1e9cSMike Christie 
6114811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
61156d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
61160d6d1e9cSMike Christie 	if (rc < 0) {
61170d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6118c53d5893SAlex Elder 		goto err_out_rbd_dev;
61190d6d1e9cSMike Christie 	}
612005fd6f6fSAlex Elder 
61217ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
61227ce4eef7SAlex Elder 
6123d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
61247ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
61257ce4eef7SAlex Elder 		read_only = true;
61267ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
61277ce4eef7SAlex Elder 
6128b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
61293abef3b3SAlex Elder 	if (rc) {
6130e37180c0SIlya Dryomov 		/*
613199d16943SIlya Dryomov 		 * rbd_unregister_watch() can't be moved into
6132e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
6133e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
6134e37180c0SIlya Dryomov 		 */
613599d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
61368b679ec5SIlya Dryomov 		goto err_out_image_probe;
61373abef3b3SAlex Elder 	}
61383abef3b3SAlex Elder 
6139dd5ac32dSIlya Dryomov 	rc = count;
6140dd5ac32dSIlya Dryomov out:
6141dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6142dd5ac32dSIlya Dryomov 	return rc;
6143b536f69aSAlex Elder 
61448b679ec5SIlya Dryomov err_out_image_probe:
61458b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
6146c53d5893SAlex Elder err_out_rbd_dev:
6147c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6148bd4ba655SAlex Elder err_out_client:
61499d3997fdSAlex Elder 	rbd_put_client(rbdc);
61500ddebc0cSAlex Elder err_out_args:
6151859c31dfSAlex Elder 	rbd_spec_put(spec);
6152d147543dSIlya Dryomov 	kfree(rbd_opts);
6153dd5ac32dSIlya Dryomov 	goto out;
6154602adf40SYehuda Sadeh }
6155602adf40SYehuda Sadeh 
61569b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
61579b60e70bSIlya Dryomov 		       const char *buf,
61589b60e70bSIlya Dryomov 		       size_t count)
61599b60e70bSIlya Dryomov {
61609b60e70bSIlya Dryomov 	if (single_major)
61619b60e70bSIlya Dryomov 		return -EINVAL;
61629b60e70bSIlya Dryomov 
61639b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61649b60e70bSIlya Dryomov }
61659b60e70bSIlya Dryomov 
61669b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
61679b60e70bSIlya Dryomov 				    const char *buf,
61689b60e70bSIlya Dryomov 				    size_t count)
61699b60e70bSIlya Dryomov {
61709b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61719b60e70bSIlya Dryomov }
61729b60e70bSIlya Dryomov 
6173dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6174602adf40SYehuda Sadeh {
6175602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
61761643dfa4SIlya Dryomov 
61771643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
61781643dfa4SIlya Dryomov 	list_del_init(&rbd_dev->node);
61791643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
61801643dfa4SIlya Dryomov 
6181200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6182dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
61836d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
61849b60e70bSIlya Dryomov 	if (!single_major)
6185602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6186602adf40SYehuda Sadeh }
6187602adf40SYehuda Sadeh 
618805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
618905a46afdSAlex Elder {
6190ad945fc1SAlex Elder 	while (rbd_dev->parent) {
619105a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
619205a46afdSAlex Elder 		struct rbd_device *second = first->parent;
619305a46afdSAlex Elder 		struct rbd_device *third;
619405a46afdSAlex Elder 
619505a46afdSAlex Elder 		/*
619605a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
619705a46afdSAlex Elder 		 * remove it.
619805a46afdSAlex Elder 		 */
619905a46afdSAlex Elder 		while (second && (third = second->parent)) {
620005a46afdSAlex Elder 			first = second;
620105a46afdSAlex Elder 			second = third;
620205a46afdSAlex Elder 		}
6203ad945fc1SAlex Elder 		rbd_assert(second);
62048ad42cd0SAlex Elder 		rbd_dev_image_release(second);
62058b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
6206ad945fc1SAlex Elder 		first->parent = NULL;
6207ad945fc1SAlex Elder 		first->parent_overlap = 0;
6208ad945fc1SAlex Elder 
6209ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
621005a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
621105a46afdSAlex Elder 		first->parent_spec = NULL;
621205a46afdSAlex Elder 	}
621305a46afdSAlex Elder }
621405a46afdSAlex Elder 
62159b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6216602adf40SYehuda Sadeh 			     const char *buf,
6217602adf40SYehuda Sadeh 			     size_t count)
6218602adf40SYehuda Sadeh {
6219602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6220751cc0e3SAlex Elder 	struct list_head *tmp;
6221751cc0e3SAlex Elder 	int dev_id;
62220276dca6SMike Christie 	char opt_buf[6];
622382a442d2SAlex Elder 	bool already = false;
62240276dca6SMike Christie 	bool force = false;
62250d8189e1SAlex Elder 	int ret;
6226602adf40SYehuda Sadeh 
62270276dca6SMike Christie 	dev_id = -1;
62280276dca6SMike Christie 	opt_buf[0] = '\0';
62290276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
62300276dca6SMike Christie 	if (dev_id < 0) {
62310276dca6SMike Christie 		pr_err("dev_id out of range\n");
6232602adf40SYehuda Sadeh 		return -EINVAL;
62330276dca6SMike Christie 	}
62340276dca6SMike Christie 	if (opt_buf[0] != '\0') {
62350276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
62360276dca6SMike Christie 			force = true;
62370276dca6SMike Christie 		} else {
62380276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
62390276dca6SMike Christie 			return -EINVAL;
62400276dca6SMike Christie 		}
62410276dca6SMike Christie 	}
6242602adf40SYehuda Sadeh 
6243602adf40SYehuda Sadeh 	ret = -ENOENT;
6244751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6245751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6246751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6247751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6248751cc0e3SAlex Elder 			ret = 0;
6249751cc0e3SAlex Elder 			break;
6250602adf40SYehuda Sadeh 		}
6251751cc0e3SAlex Elder 	}
6252751cc0e3SAlex Elder 	if (!ret) {
6253a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
62540276dca6SMike Christie 		if (rbd_dev->open_count && !force)
625542382b70SAlex Elder 			ret = -EBUSY;
6256b82d167bSAlex Elder 		else
625782a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
625882a442d2SAlex Elder 							&rbd_dev->flags);
6259a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6260751cc0e3SAlex Elder 	}
6261751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
626282a442d2SAlex Elder 	if (ret < 0 || already)
62631ba0f1e7SAlex Elder 		return ret;
6264751cc0e3SAlex Elder 
62650276dca6SMike Christie 	if (force) {
62660276dca6SMike Christie 		/*
62670276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
62680276dca6SMike Christie 		 * IO to complete/fail.
62690276dca6SMike Christie 		 */
62700276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
62710276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
62720276dca6SMike Christie 	}
62730276dca6SMike Christie 
6274ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6275ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6276ed95b21aSIlya Dryomov 		rbd_unlock(rbd_dev);
6277ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
627899d16943SIlya Dryomov 	rbd_unregister_watch(rbd_dev);
6279fca27065SIlya Dryomov 
62809875201eSJosh Durgin 	/*
62819875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
62829875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
62839875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
62849875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
62859875201eSJosh Durgin 	 */
6286dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
62878ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
62888b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
62891ba0f1e7SAlex Elder 	return count;
6290602adf40SYehuda Sadeh }
6291602adf40SYehuda Sadeh 
62929b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
62939b60e70bSIlya Dryomov 			  const char *buf,
62949b60e70bSIlya Dryomov 			  size_t count)
62959b60e70bSIlya Dryomov {
62969b60e70bSIlya Dryomov 	if (single_major)
62979b60e70bSIlya Dryomov 		return -EINVAL;
62989b60e70bSIlya Dryomov 
62999b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
63009b60e70bSIlya Dryomov }
63019b60e70bSIlya Dryomov 
/*
 * Removal entry point without any single_major check: the request is
 * always forwarded to do_rbd_remove() and its result returned as is.
 *
 * NOTE(review): presumably the counterpart of rbd_remove() that is
 * exposed for single-major mode -- confirm against the bus attribute
 * definitions elsewhere in this file.
 */
63029b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
63039b60e70bSIlya Dryomov 				       const char *buf,
63049b60e70bSIlya Dryomov 				       size_t count)
63059b60e70bSIlya Dryomov {
63069b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
63079b60e70bSIlya Dryomov }
63089b60e70bSIlya Dryomov 
6309602adf40SYehuda Sadeh /*
6310602adf40SYehuda Sadeh  * create control files in sysfs
6311dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6312602adf40SYehuda Sadeh  */
6313602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
6314602adf40SYehuda Sadeh {
6315dfc5606dSYehuda Sadeh 	int ret;
6316602adf40SYehuda Sadeh 
6317fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6318dfc5606dSYehuda Sadeh 	if (ret < 0)
6319dfc5606dSYehuda Sadeh 		return ret;
6320602adf40SYehuda Sadeh 
6321fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6322fed4c143SAlex Elder 	if (ret < 0)
6323fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6324602adf40SYehuda Sadeh 
6325602adf40SYehuda Sadeh 	return ret;
6326602adf40SYehuda Sadeh }
6327602adf40SYehuda Sadeh 
/*
 * Remove the rbd sysfs entries: unregister the rbd bus type, then the
 * rbd root device, i.e. the reverse of the order in which
 * rbd_sysfs_init() registers them.
 */
6328602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
6329602adf40SYehuda Sadeh {
6330dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6331fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6332602adf40SYehuda Sadeh }
6333602adf40SYehuda Sadeh 
63341c2a9dfeSAlex Elder static int rbd_slab_init(void)
63351c2a9dfeSAlex Elder {
63361c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
633703d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6338868311b1SAlex Elder 	if (!rbd_img_request_cache)
6339868311b1SAlex Elder 		return -ENOMEM;
6340868311b1SAlex Elder 
6341868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
634203d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
634378c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
634478c2a44aSAlex Elder 		goto out_err;
634578c2a44aSAlex Elder 
63461c2a9dfeSAlex Elder 	return 0;
63471c2a9dfeSAlex Elder 
63486c696d85SIlya Dryomov out_err:
6349868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6350868311b1SAlex Elder 	rbd_img_request_cache = NULL;
63511c2a9dfeSAlex Elder 	return -ENOMEM;
63521c2a9dfeSAlex Elder }
63531c2a9dfeSAlex Elder 
/*
 * Destroy the slab caches created by rbd_slab_init(): the object
 * request cache first, then the image request cache.  Each cache
 * pointer is asserted to be set before it is destroyed and is cleared
 * to NULL afterwards.
 */
63541c2a9dfeSAlex Elder static void rbd_slab_exit(void)
63551c2a9dfeSAlex Elder {
6356868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6357868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6358868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6359868311b1SAlex Elder 
63601c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
63611c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
63621c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
63631c2a9dfeSAlex Elder }
63641c2a9dfeSAlex Elder 
6365cc344fa1SAlex Elder static int __init rbd_init(void)
6366602adf40SYehuda Sadeh {
6367602adf40SYehuda Sadeh 	int rc;
6368602adf40SYehuda Sadeh 
63691e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
63701e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
63711e32d34cSAlex Elder 		return -EINVAL;
63721e32d34cSAlex Elder 	}
6373e1b4d96dSIlya Dryomov 
63741c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6375602adf40SYehuda Sadeh 	if (rc)
6376602adf40SYehuda Sadeh 		return rc;
6377e1b4d96dSIlya Dryomov 
6378f5ee37bdSIlya Dryomov 	/*
6379f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6380f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6381f5ee37bdSIlya Dryomov 	 */
6382f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6383f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6384f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6385f5ee37bdSIlya Dryomov 		goto err_out_slab;
6386f5ee37bdSIlya Dryomov 	}
6387f5ee37bdSIlya Dryomov 
63889b60e70bSIlya Dryomov 	if (single_major) {
63899b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
63909b60e70bSIlya Dryomov 		if (rbd_major < 0) {
63919b60e70bSIlya Dryomov 			rc = rbd_major;
6392f5ee37bdSIlya Dryomov 			goto err_out_wq;
63939b60e70bSIlya Dryomov 		}
63949b60e70bSIlya Dryomov 	}
63959b60e70bSIlya Dryomov 
63961c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
63971c2a9dfeSAlex Elder 	if (rc)
63989b60e70bSIlya Dryomov 		goto err_out_blkdev;
63991c2a9dfeSAlex Elder 
64009b60e70bSIlya Dryomov 	if (single_major)
64019b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
64029b60e70bSIlya Dryomov 	else
6403e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
64049b60e70bSIlya Dryomov 
6405e1b4d96dSIlya Dryomov 	return 0;
6406e1b4d96dSIlya Dryomov 
64079b60e70bSIlya Dryomov err_out_blkdev:
64089b60e70bSIlya Dryomov 	if (single_major)
64099b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6410f5ee37bdSIlya Dryomov err_out_wq:
6411f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6412e1b4d96dSIlya Dryomov err_out_slab:
6413e1b4d96dSIlya Dryomov 	rbd_slab_exit();
64141c2a9dfeSAlex Elder 	return rc;
6415602adf40SYehuda Sadeh }
6416602adf40SYehuda Sadeh 
/*
 * Module unload entry point.
 *
 * Destroys the device id IDA, removes the sysfs entries, unregisters
 * the block major (only registered by rbd_init() when single_major is
 * set), destroys the rbd workqueue and finally frees the slab caches.
 */
6417cc344fa1SAlex Elder static void __exit rbd_exit(void)
6418602adf40SYehuda Sadeh {
6419ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6420602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
64219b60e70bSIlya Dryomov 	if (single_major)
64229b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6423f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
64241c2a9dfeSAlex Elder 	rbd_slab_exit();
6425602adf40SYehuda Sadeh }
6426602adf40SYehuda Sadeh 
/* Hook rbd_init() and rbd_exit() up as the module's init and exit functions. */
6427602adf40SYehuda Sadeh module_init(rbd_init);
6428602adf40SYehuda Sadeh module_exit(rbd_exit);
6429602adf40SYehuda Sadeh 
/* Module metadata: authors, description and license. */
6430d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6431602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6432602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6433602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6434602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6435602adf40SYehuda Sadeh 
643690da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6437602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6438