xref: /openbmc/linux/drivers/block/rbd.c (revision 1643dfa4)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
417ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
42602adf40SYehuda Sadeh #include <linux/fs.h>
43602adf40SYehuda Sadeh #include <linux/blkdev.h>
441c2a9dfeSAlex Elder #include <linux/slab.h>
45f8a22fc2SIlya Dryomov #include <linux/idr.h>
46bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
47602adf40SYehuda Sadeh 
48602adf40SYehuda Sadeh #include "rbd_types.h"
49602adf40SYehuda Sadeh 
50aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
51aafb230eSAlex Elder 
/*
 * Block I/O is expressed in sectors.  Linux uses the sector in several
 * contexts (blk, bio, genhd) and the default is 512 bytes everywhere;
 * these names just read better than the bare numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
60593a9e7bSAlex Elder 
61a2acd00eSAlex Elder /*
62a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
63a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
64a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
65a2acd00eSAlex Elder  * -EINVAL without updating it.
66a2acd00eSAlex Elder  */
67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
68a2acd00eSAlex Elder {
69a2acd00eSAlex Elder 	unsigned int counter;
70a2acd00eSAlex Elder 
71a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
72a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
73a2acd00eSAlex Elder 		return (int)counter;
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	atomic_dec(v);
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder 	return -EINVAL;
78a2acd00eSAlex Elder }
79a2acd00eSAlex Elder 
80a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
82a2acd00eSAlex Elder {
83a2acd00eSAlex Elder 	int counter;
84a2acd00eSAlex Elder 
85a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
86a2acd00eSAlex Elder 	if (counter >= 0)
87a2acd00eSAlex Elder 		return counter;
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	atomic_inc(v);
90a2acd00eSAlex Elder 
91a2acd00eSAlex Elder 	return -EINVAL;
92a2acd00eSAlex Elder }
93a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* A device id is shifted left by this many bits to form its minor */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* Snapshot names must leave room for the prefix within NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* Sized so that an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
116589d30e0SAlex Elder 
/*
 * Feature bits.
 *
 * Feature masks are kept in u64 fields throughout this file, so the
 * individual bits are defined as 64-bit constants as well.  This keeps
 * expressions such as ~RBD_FEATURES_SUPPORTED 64 bits wide without
 * relying on sign extension, and leaves room for bits above 30.
 */

#define RBD_FEATURE_LAYERING	(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2	(1ULL<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
127d889140cSAlex Elder 
/*
 * Size of the buffer holding an RBD device name.  The name has the
 * form "rbd#": the RBD_DRV_NAME string followed by a unique integer
 * identifier.
 */
#define DEV_NAME_LEN		32
133602adf40SYehuda Sadeh 
134602adf40SYehuda Sadeh /*
135602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
136602adf40SYehuda Sadeh  */
137602adf40SYehuda Sadeh struct rbd_image_header {
138f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
139849b4260SAlex Elder 	char *object_prefix;
140602adf40SYehuda Sadeh 	__u8 obj_order;
141602adf40SYehuda Sadeh 	__u8 crypt_type;
142602adf40SYehuda Sadeh 	__u8 comp_type;
143f35a4deeSAlex Elder 	u64 stripe_unit;
144f35a4deeSAlex Elder 	u64 stripe_count;
145f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
146602adf40SYehuda Sadeh 
147f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
148f84344f3SAlex Elder 	u64 image_size;
149f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
150f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
151f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15259c2be1eSYehuda Sadeh };
15359c2be1eSYehuda Sadeh 
1540d7dbfceSAlex Elder /*
1550d7dbfceSAlex Elder  * An rbd image specification.
1560d7dbfceSAlex Elder  *
1570d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
158c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
159c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
160c66c6e0cSAlex Elder  *
161c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
162c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
163c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
164c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
165c66c6e0cSAlex Elder  *
166c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
167c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
168c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
169c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
170c66c6e0cSAlex Elder  * is shared between the parent and child).
171c66c6e0cSAlex Elder  *
172c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
173c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
174c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
175c66c6e0cSAlex Elder  *
176c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
177c66c6e0cSAlex Elder  * could be a null pointer).
1780d7dbfceSAlex Elder  */
1790d7dbfceSAlex Elder struct rbd_spec {
1800d7dbfceSAlex Elder 	u64		pool_id;
181ecb4dc22SAlex Elder 	const char	*pool_name;
1820d7dbfceSAlex Elder 
183ecb4dc22SAlex Elder 	const char	*image_id;
184ecb4dc22SAlex Elder 	const char	*image_name;
1850d7dbfceSAlex Elder 
1860d7dbfceSAlex Elder 	u64		snap_id;
187ecb4dc22SAlex Elder 	const char	*snap_name;
1880d7dbfceSAlex Elder 
1890d7dbfceSAlex Elder 	struct kref	kref;
1900d7dbfceSAlex Elder };
1910d7dbfceSAlex Elder 
192602adf40SYehuda Sadeh /*
193f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
194602adf40SYehuda Sadeh  */
195602adf40SYehuda Sadeh struct rbd_client {
196602adf40SYehuda Sadeh 	struct ceph_client	*client;
197602adf40SYehuda Sadeh 	struct kref		kref;
198602adf40SYehuda Sadeh 	struct list_head	node;
199602adf40SYehuda Sadeh };
200602adf40SYehuda Sadeh 
/* Callback invoked with the image request it belongs to */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Value of rbd_obj_request->which for a request with no list position */
#define	BAD_WHICH	U32_MAX

/* Callback invoked with the object request it belongs to */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
208bf0d5f50SAlex Elder 
/* Kind of data buffer attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
212bf0d5f50SAlex Elder 
/* Operation an object request carries out */
enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2186d2940c8SGuangliang Zhao 
/* Flag values recorded in rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
225926f9b3fSAlex Elder 
226bf0d5f50SAlex Elder struct rbd_obj_request {
227bf0d5f50SAlex Elder 	const char		*object_name;
228bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
229bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
230926f9b3fSAlex Elder 	unsigned long		flags;
231bf0d5f50SAlex Elder 
232c5b5ef6cSAlex Elder 	/*
233c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
234c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
235c5b5ef6cSAlex Elder 	 *
236c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
237c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
238c5b5ef6cSAlex Elder 	 *
239c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
240c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
241c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
242c5b5ef6cSAlex Elder 	 *
243c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
244c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
245c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
246c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
247c5b5ef6cSAlex Elder 	 */
248c5b5ef6cSAlex Elder 	union {
249c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
250c5b5ef6cSAlex Elder 		struct {
251bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
252c5b5ef6cSAlex Elder 			u64			img_offset;
253c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
254c5b5ef6cSAlex Elder 			struct list_head	links;
255c5b5ef6cSAlex Elder 		};
256c5b5ef6cSAlex Elder 	};
257bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
258bf0d5f50SAlex Elder 
259bf0d5f50SAlex Elder 	enum obj_request_type	type;
260788e2df3SAlex Elder 	union {
261bf0d5f50SAlex Elder 		struct bio	*bio_list;
262788e2df3SAlex Elder 		struct {
263788e2df3SAlex Elder 			struct page	**pages;
264788e2df3SAlex Elder 			u32		page_count;
265788e2df3SAlex Elder 		};
266788e2df3SAlex Elder 	};
2670eefd470SAlex Elder 	struct page		**copyup_pages;
268ebda6408SAlex Elder 	u32			copyup_page_count;
269bf0d5f50SAlex Elder 
270bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
271bf0d5f50SAlex Elder 
272bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2731b83bef2SSage Weil 	int			result;
274bf0d5f50SAlex Elder 
275bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
276788e2df3SAlex Elder 	struct completion	completion;
277bf0d5f50SAlex Elder 
278bf0d5f50SAlex Elder 	struct kref		kref;
279bf0d5f50SAlex Elder };
280bf0d5f50SAlex Elder 
/* Flag values recorded in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};
2870c425248SAlex Elder 
288bf0d5f50SAlex Elder struct rbd_img_request {
289bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
290bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
291bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2920c425248SAlex Elder 	unsigned long		flags;
293bf0d5f50SAlex Elder 	union {
294bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2959849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2969849e986SAlex Elder 	};
2979849e986SAlex Elder 	union {
2989849e986SAlex Elder 		struct request		*rq;		/* block request */
2999849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
300bf0d5f50SAlex Elder 	};
3013d7efd18SAlex Elder 	struct page		**copyup_pages;
302ebda6408SAlex Elder 	u32			copyup_page_count;
303bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
304bf0d5f50SAlex Elder 	u32			next_completion;
305bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
30655f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
307a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
308bf0d5f50SAlex Elder 
309bf0d5f50SAlex Elder 	u32			obj_request_count;
310bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
311bf0d5f50SAlex Elder 
312bf0d5f50SAlex Elder 	struct kref		kref;
313bf0d5f50SAlex Elder };
314bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests of an image request (linked
 * through rbd_obj_request->links).
 *
 *   for_each_obj_request()       walk the whole list
 *   for_each_obj_request_from()  continue from the current oreq
 *   for_each_obj_request_safe()  walk in REVERSE order, using n as the
 *                                lookahead cursor
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
321bf0d5f50SAlex Elder 
322f84344f3SAlex Elder struct rbd_mapping {
32399c1f08fSAlex Elder 	u64                     size;
32434b13184SAlex Elder 	u64                     features;
325f84344f3SAlex Elder 	bool			read_only;
326f84344f3SAlex Elder };
327f84344f3SAlex Elder 
328602adf40SYehuda Sadeh /*
329602adf40SYehuda Sadeh  * a single device
330602adf40SYehuda Sadeh  */
331602adf40SYehuda Sadeh struct rbd_device {
332de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
333602adf40SYehuda Sadeh 
334602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
335dd82fff1SIlya Dryomov 	int			minor;
336602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
337602adf40SYehuda Sadeh 
338a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
339602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
340602adf40SYehuda Sadeh 
341602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
342602adf40SYehuda Sadeh 
343b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
344602adf40SYehuda Sadeh 
345602adf40SYehuda Sadeh 	struct rbd_image_header	header;
346b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3470d7dbfceSAlex Elder 	struct rbd_spec		*spec;
348d147543dSIlya Dryomov 	struct rbd_options	*opts;
349602adf40SYehuda Sadeh 
350c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
351922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
352971f839aSAlex Elder 
3531643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3540903e875SAlex Elder 
355922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
35659c2be1eSYehuda Sadeh 
3571643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
3581643dfa4SIlya Dryomov 
35986b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
36086b00e0dSAlex Elder 	u64			parent_overlap;
361a2acd00eSAlex Elder 	atomic_t		parent_ref;
3622f82ee54SAlex Elder 	struct rbd_device	*parent;
36386b00e0dSAlex Elder 
3647ad18afaSChristoph Hellwig 	/* Block layer tags. */
3657ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
3667ad18afaSChristoph Hellwig 
367c666601aSJosh Durgin 	/* protects updating the header */
368c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
369f84344f3SAlex Elder 
370f84344f3SAlex Elder 	struct rbd_mapping	mapping;
371602adf40SYehuda Sadeh 
372602adf40SYehuda Sadeh 	struct list_head	node;
373dfc5606dSYehuda Sadeh 
374dfc5606dSYehuda Sadeh 	/* sysfs related */
375dfc5606dSYehuda Sadeh 	struct device		dev;
376b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
377dfc5606dSYehuda Sadeh };
378dfc5606dSYehuda Sadeh 
/*
 * Flag values recorded in rbd_dev->flags.  Where atomicity matters,
 * access is protected by rbd_dev->lock.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3906d292906SAlex Elder 
/* Serializes client creation */
static DEFINE_MUTEX(client_mutex);

/* List of rbd devices, with its lock */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* List of rbd clients, with its lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;
static struct kmem_cache	*rbd_segment_name_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;
409f5ee37bdSIlya Dryomov 
4109b60e70bSIlya Dryomov /*
4119b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4129b60e70bSIlya Dryomov  * userspace rbd utility.
4139b60e70bSIlya Dryomov  */
4149b60e70bSIlya Dryomov static bool single_major = false;
4159b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4169b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4179b60e70bSIlya Dryomov 
4183d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4193d7efd18SAlex Elder 
420f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
421f0f8cef5SAlex Elder 		       size_t count);
422f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
423f0f8cef5SAlex Elder 			  size_t count);
4249b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4259b60e70bSIlya Dryomov 				    size_t count);
4269b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4279b60e70bSIlya Dryomov 				       size_t count);
4286d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
429a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
430f0f8cef5SAlex Elder 
4319b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4329b60e70bSIlya Dryomov {
4337e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4349b60e70bSIlya Dryomov }
4359b60e70bSIlya Dryomov 
4369b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4379b60e70bSIlya Dryomov {
4387e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4399b60e70bSIlya Dryomov }
4409b60e70bSIlya Dryomov 
441b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
442b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
4439b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
4449b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
445b15a21ddSGreg Kroah-Hartman 
446b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
447b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
448b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4499b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4509b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
451b15a21ddSGreg Kroah-Hartman 	NULL,
452f0f8cef5SAlex Elder };
45392c76dc0SIlya Dryomov 
45492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
45592c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
45692c76dc0SIlya Dryomov {
4579b60e70bSIlya Dryomov 	if (!single_major &&
4589b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4599b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4609b60e70bSIlya Dryomov 		return 0;
4619b60e70bSIlya Dryomov 
46292c76dc0SIlya Dryomov 	return attr->mode;
46392c76dc0SIlya Dryomov }
46492c76dc0SIlya Dryomov 
46592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
46692c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
46792c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
46892c76dc0SIlya Dryomov };
46992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
470f0f8cef5SAlex Elder 
471f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
472f0f8cef5SAlex Elder 	.name		= "rbd",
473b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
474f0f8cef5SAlex Elder };
475f0f8cef5SAlex Elder 
476f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
477f0f8cef5SAlex Elder {
478f0f8cef5SAlex Elder }
479f0f8cef5SAlex Elder 
480f0f8cef5SAlex Elder static struct device rbd_root_dev = {
481f0f8cef5SAlex Elder 	.init_name =    "rbd",
482f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
483f0f8cef5SAlex Elder };
484f0f8cef5SAlex Elder 
48506ecc6cbSAlex Elder static __printf(2, 3)
48606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
48706ecc6cbSAlex Elder {
48806ecc6cbSAlex Elder 	struct va_format vaf;
48906ecc6cbSAlex Elder 	va_list args;
49006ecc6cbSAlex Elder 
49106ecc6cbSAlex Elder 	va_start(args, fmt);
49206ecc6cbSAlex Elder 	vaf.fmt = fmt;
49306ecc6cbSAlex Elder 	vaf.va = &args;
49406ecc6cbSAlex Elder 
49506ecc6cbSAlex Elder 	if (!rbd_dev)
49606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
49706ecc6cbSAlex Elder 	else if (rbd_dev->disk)
49806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
49906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
50006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
50106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
50206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
50306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
50406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
50506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
50606ecc6cbSAlex Elder 	else	/* punt */
50706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
50806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
50906ecc6cbSAlex Elder 	va_end(args);
51006ecc6cbSAlex Elder }
51106ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, log the failed
 * expression with its function and line, then BUG().  Without
 * RBD_DEBUG the macro expands to a no-op and expr is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement; in particular it can be used as the body of
 * an if that has an else branch.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
524dfc5606dSYehuda Sadeh 
5252761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
526b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
52705a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
52805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5298b3e1a56SAlex Elder 
530cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5312df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
532a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
533e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
53454cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
53554cac61fSAlex Elder 					u64 snap_id);
5362ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5372ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5382ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5392ad3d716SAlex Elder 		u64 *snap_features);
54059c2be1eSYehuda Sadeh 
541602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
542602adf40SYehuda Sadeh {
543f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
544b82d167bSAlex Elder 	bool removing = false;
545602adf40SYehuda Sadeh 
546f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
547602adf40SYehuda Sadeh 		return -EROFS;
548602adf40SYehuda Sadeh 
549a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
550b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
551b82d167bSAlex Elder 		removing = true;
552b82d167bSAlex Elder 	else
553b82d167bSAlex Elder 		rbd_dev->open_count++;
554a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
555b82d167bSAlex Elder 	if (removing)
556b82d167bSAlex Elder 		return -ENOENT;
557b82d167bSAlex Elder 
558c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
559340c7a2bSAlex Elder 
560602adf40SYehuda Sadeh 	return 0;
561602adf40SYehuda Sadeh }
562602adf40SYehuda Sadeh 
563db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
564dfc5606dSYehuda Sadeh {
565dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
566b82d167bSAlex Elder 	unsigned long open_count_before;
567b82d167bSAlex Elder 
568a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
569b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
570a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
571b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
572dfc5606dSYehuda Sadeh 
573c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
574dfc5606dSYehuda Sadeh }
575dfc5606dSYehuda Sadeh 
576131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
577131fd9f6SGuangliang Zhao {
57877f33c03SJosh Durgin 	int ret = 0;
579131fd9f6SGuangliang Zhao 	int val;
580131fd9f6SGuangliang Zhao 	bool ro;
58177f33c03SJosh Durgin 	bool ro_changed = false;
582131fd9f6SGuangliang Zhao 
58377f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
584131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
585131fd9f6SGuangliang Zhao 		return -EFAULT;
586131fd9f6SGuangliang Zhao 
587131fd9f6SGuangliang Zhao 	ro = val ? true : false;
588131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
589131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
590131fd9f6SGuangliang Zhao 		return -EROFS;
591131fd9f6SGuangliang Zhao 
59277f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
59377f33c03SJosh Durgin 	/* prevent others open this device */
59477f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
59577f33c03SJosh Durgin 		ret = -EBUSY;
59677f33c03SJosh Durgin 		goto out;
597131fd9f6SGuangliang Zhao 	}
598131fd9f6SGuangliang Zhao 
59977f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
60077f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
60177f33c03SJosh Durgin 		ro_changed = true;
60277f33c03SJosh Durgin 	}
60377f33c03SJosh Durgin 
60477f33c03SJosh Durgin out:
60577f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
60677f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
60777f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
60877f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
60977f33c03SJosh Durgin 
61077f33c03SJosh Durgin 	return ret;
611131fd9f6SGuangliang Zhao }
612131fd9f6SGuangliang Zhao 
613131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
614131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
615131fd9f6SGuangliang Zhao {
616131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
617131fd9f6SGuangliang Zhao 	int ret = 0;
618131fd9f6SGuangliang Zhao 
619131fd9f6SGuangliang Zhao 	switch (cmd) {
620131fd9f6SGuangliang Zhao 	case BLKROSET:
621131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
622131fd9f6SGuangliang Zhao 		break;
623131fd9f6SGuangliang Zhao 	default:
624131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
625131fd9f6SGuangliang Zhao 	}
626131fd9f6SGuangliang Zhao 
627131fd9f6SGuangliang Zhao 	return ret;
628131fd9f6SGuangliang Zhao }
629131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl handler: hands the request, unmodified, to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
637131fd9f6SGuangliang Zhao 
638602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
639602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
640602adf40SYehuda Sadeh 	.open			= rbd_open,
641dfc5606dSYehuda Sadeh 	.release		= rbd_release,
642131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
643131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
644131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
645131fd9f6SGuangliang Zhao #endif
646602adf40SYehuda Sadeh };
647602adf40SYehuda Sadeh 
648602adf40SYehuda Sadeh /*
6497262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
650cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
651602adf40SYehuda Sadeh  */
652f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
653602adf40SYehuda Sadeh {
654602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
655602adf40SYehuda Sadeh 	int ret = -ENOMEM;
656602adf40SYehuda Sadeh 
65737206ee5SAlex Elder 	dout("%s:\n", __func__);
658602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
659602adf40SYehuda Sadeh 	if (!rbdc)
660602adf40SYehuda Sadeh 		goto out_opt;
661602adf40SYehuda Sadeh 
662602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
663602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
664602adf40SYehuda Sadeh 
66543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
666602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
66708f75463SAlex Elder 		goto out_rbdc;
66843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
671602adf40SYehuda Sadeh 	if (ret < 0)
67208f75463SAlex Elder 		goto out_client;
673602adf40SYehuda Sadeh 
674432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
675602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
676432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
677602adf40SYehuda Sadeh 
67837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
679bc534d86SAlex Elder 
680602adf40SYehuda Sadeh 	return rbdc;
68108f75463SAlex Elder out_client:
682602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
68308f75463SAlex Elder out_rbdc:
684602adf40SYehuda Sadeh 	kfree(rbdc);
685602adf40SYehuda Sadeh out_opt:
68643ae4701SAlex Elder 	if (ceph_opts)
68743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
68837206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
68937206ee5SAlex Elder 
69028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
691602adf40SYehuda Sadeh }
692602adf40SYehuda Sadeh 
6932f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6942f82ee54SAlex Elder {
6952f82ee54SAlex Elder 	kref_get(&rbdc->kref);
6962f82ee54SAlex Elder 
6972f82ee54SAlex Elder 	return rbdc;
6982f82ee54SAlex Elder }
6992f82ee54SAlex Elder 
700602adf40SYehuda Sadeh /*
7011f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7021f7ba331SAlex Elder  * found, bump its reference count.
703602adf40SYehuda Sadeh  */
7041f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
705602adf40SYehuda Sadeh {
706602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7071f7ba331SAlex Elder 	bool found = false;
708602adf40SYehuda Sadeh 
70943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
710602adf40SYehuda Sadeh 		return NULL;
711602adf40SYehuda Sadeh 
7121f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7131f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7141f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7152f82ee54SAlex Elder 			__rbd_get_client(client_node);
7162f82ee54SAlex Elder 
7171f7ba331SAlex Elder 			found = true;
7181f7ba331SAlex Elder 			break;
7191f7ba331SAlex Elder 		}
7201f7ba331SAlex Elder 	}
7211f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7221f7ba331SAlex Elder 
7231f7ba331SAlex Elder 	return found ? client_node : NULL;
724602adf40SYehuda Sadeh }
725602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * The ordering is significant: parse_rbd_opts_token() treats tokens
 * below Opt_last_int as integer-valued and tokens between Opt_last_int
 * and Opt_last_string as string-valued.
 */
enum {
	/* options taking an integer argument */
	Opt_queue_depth,
	Opt_last_int,
	/* options taking a string argument (none at present) */
	Opt_last_string,
	/* flag options, no argument */
	Opt_read_only,
	Opt_read_write,
	Opt_err
};
73959c2be1eSYehuda Sadeh 
74043ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
741b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
74259c2be1eSYehuda Sadeh 	/* int args above */
74359c2be1eSYehuda Sadeh 	/* string args above */
744be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
745cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
746cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
747cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
748210c104cSIlya Dryomov 	{Opt_err, NULL}
74959c2be1eSYehuda Sadeh };
75059c2be1eSYehuda Sadeh 
75198571b5aSAlex Elder struct rbd_options {
752b5584180SIlya Dryomov 	int	queue_depth;
75398571b5aSAlex Elder 	bool	read_only;
75498571b5aSAlex Elder };
75598571b5aSAlex Elder 
756b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
75798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
75898571b5aSAlex Elder 
75959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
76059c2be1eSYehuda Sadeh {
76143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
76259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
76359c2be1eSYehuda Sadeh 	int token, intval, ret;
76459c2be1eSYehuda Sadeh 
76543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
76659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
76759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
76859c2be1eSYehuda Sadeh 		if (ret < 0) {
769210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
77059c2be1eSYehuda Sadeh 			return ret;
77159c2be1eSYehuda Sadeh 		}
77259c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
77359c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
774210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
77559c2be1eSYehuda Sadeh 	} else {
77659c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
77759c2be1eSYehuda Sadeh 	}
77859c2be1eSYehuda Sadeh 
77959c2be1eSYehuda Sadeh 	switch (token) {
780b5584180SIlya Dryomov 	case Opt_queue_depth:
781b5584180SIlya Dryomov 		if (intval < 1) {
782b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
783b5584180SIlya Dryomov 			return -EINVAL;
784b5584180SIlya Dryomov 		}
785b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
786b5584180SIlya Dryomov 		break;
787cc0538b6SAlex Elder 	case Opt_read_only:
788cc0538b6SAlex Elder 		rbd_opts->read_only = true;
789cc0538b6SAlex Elder 		break;
790cc0538b6SAlex Elder 	case Opt_read_write:
791cc0538b6SAlex Elder 		rbd_opts->read_only = false;
792cc0538b6SAlex Elder 		break;
79359c2be1eSYehuda Sadeh 	default:
794210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
795210c104cSIlya Dryomov 		return -EINVAL;
79659c2be1eSYehuda Sadeh 	}
797210c104cSIlya Dryomov 
79859c2be1eSYehuda Sadeh 	return 0;
79959c2be1eSYehuda Sadeh }
80059c2be1eSYehuda Sadeh 
8016d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8026d2940c8SGuangliang Zhao {
8036d2940c8SGuangliang Zhao 	switch (op_type) {
8046d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8056d2940c8SGuangliang Zhao 		return "read";
8066d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8076d2940c8SGuangliang Zhao 		return "write";
80890e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
80990e98c52SGuangliang Zhao 		return "discard";
8106d2940c8SGuangliang Zhao 	default:
8116d2940c8SGuangliang Zhao 		return "???";
8126d2940c8SGuangliang Zhao 	}
8136d2940c8SGuangliang Zhao }
8146d2940c8SGuangliang Zhao 
81559c2be1eSYehuda Sadeh /*
816602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8177262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8187262cfcaSAlex Elder  * function.
819602adf40SYehuda Sadeh  */
8209d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
821602adf40SYehuda Sadeh {
822f8c38929SAlex Elder 	struct rbd_client *rbdc;
82359c2be1eSYehuda Sadeh 
824cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8251f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8269d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
82743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8289d3997fdSAlex Elder 	else
829f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
830cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
831d720bcb0SAlex Elder 
8329d3997fdSAlex Elder 	return rbdc;
833602adf40SYehuda Sadeh }
834602adf40SYehuda Sadeh 
835602adf40SYehuda Sadeh /*
836602adf40SYehuda Sadeh  * Destroy ceph client
837d23a4b3fSAlex Elder  *
838432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
839602adf40SYehuda Sadeh  */
840602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
841602adf40SYehuda Sadeh {
842602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
843602adf40SYehuda Sadeh 
84437206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
845cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
846602adf40SYehuda Sadeh 	list_del(&rbdc->node);
847cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
848602adf40SYehuda Sadeh 
849602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
850602adf40SYehuda Sadeh 	kfree(rbdc);
851602adf40SYehuda Sadeh }
852602adf40SYehuda Sadeh 
853602adf40SYehuda Sadeh /*
854602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
855602adf40SYehuda Sadeh  * it.
856602adf40SYehuda Sadeh  */
8579d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
858602adf40SYehuda Sadeh {
859c53d5893SAlex Elder 	if (rbdc)
8609d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
861602adf40SYehuda Sadeh }
862602adf40SYehuda Sadeh 
863a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
864a30b71b9SAlex Elder {
865a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
866a30b71b9SAlex Elder }
867a30b71b9SAlex Elder 
8688e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8698e94af8eSAlex Elder {
870103a150fSAlex Elder 	size_t size;
871103a150fSAlex Elder 	u32 snap_count;
872103a150fSAlex Elder 
873103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
874103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
875103a150fSAlex Elder 		return false;
876103a150fSAlex Elder 
877db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
878db2388b6SAlex Elder 
879db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
880db2388b6SAlex Elder 		return false;
881db2388b6SAlex Elder 
882db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
883db2388b6SAlex Elder 
884db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
885db2388b6SAlex Elder 		return false;
886db2388b6SAlex Elder 
887103a150fSAlex Elder 	/*
888103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
889103a150fSAlex Elder 	 * that limits the number of snapshots.
890103a150fSAlex Elder 	 */
891103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
892103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
893103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
894103a150fSAlex Elder 		return false;
895103a150fSAlex Elder 
896103a150fSAlex Elder 	/*
897103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
898103a150fSAlex Elder 	 * header must also be representable in a size_t.
899103a150fSAlex Elder 	 */
900103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
901103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
902103a150fSAlex Elder 		return false;
903103a150fSAlex Elder 
904103a150fSAlex Elder 	return true;
9058e94af8eSAlex Elder }
9068e94af8eSAlex Elder 
907602adf40SYehuda Sadeh /*
908bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
909bb23e37aSAlex Elder  * on-disk header.
910602adf40SYehuda Sadeh  */
911662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9124156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
913602adf40SYehuda Sadeh {
914662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
915bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
916bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
917bb23e37aSAlex Elder 	char *object_prefix = NULL;
918bb23e37aSAlex Elder 	char *snap_names = NULL;
919bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
920ccece235SAlex Elder 	u32 snap_count;
921d2bb24e5SAlex Elder 	size_t size;
922bb23e37aSAlex Elder 	int ret = -ENOMEM;
923621901d6SAlex Elder 	u32 i;
924602adf40SYehuda Sadeh 
925bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
926103a150fSAlex Elder 
927bb23e37aSAlex Elder 	if (first_time) {
928bb23e37aSAlex Elder 		size_t len;
929bb23e37aSAlex Elder 
930bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
931bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
932bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
933bb23e37aSAlex Elder 		if (!object_prefix)
934602adf40SYehuda Sadeh 			return -ENOMEM;
935bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
936bb23e37aSAlex Elder 		object_prefix[len] = '\0';
937bb23e37aSAlex Elder 	}
93800f1f36fSAlex Elder 
939bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
940d2bb24e5SAlex Elder 
941602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
942bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
943bb23e37aSAlex Elder 	if (!snapc)
944bb23e37aSAlex Elder 		goto out_err;
945bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
946602adf40SYehuda Sadeh 	if (snap_count) {
947bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
948f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
949f785cc1dSAlex Elder 
950bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
951621901d6SAlex Elder 
952f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
953bb23e37aSAlex Elder 			goto out_2big;
954bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
955bb23e37aSAlex Elder 		if (!snap_names)
956602adf40SYehuda Sadeh 			goto out_err;
957bb23e37aSAlex Elder 
958bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
959bb23e37aSAlex Elder 
960bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
961bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
962bb23e37aSAlex Elder 		if (!snap_sizes)
963bb23e37aSAlex Elder 			goto out_err;
964bb23e37aSAlex Elder 
965f785cc1dSAlex Elder 		/*
966bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
967bb23e37aSAlex Elder 		 * and size.
968bb23e37aSAlex Elder 		 *
96999a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
970bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
971f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
972f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
973f785cc1dSAlex Elder 		 */
974bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
975bb23e37aSAlex Elder 		snaps = ondisk->snaps;
976bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
977bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
978bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
979bb23e37aSAlex Elder 		}
980602adf40SYehuda Sadeh 	}
981849b4260SAlex Elder 
982bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
983bb23e37aSAlex Elder 
984bb23e37aSAlex Elder 	if (first_time) {
985bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
986602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
987602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
988602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
989bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
990bb23e37aSAlex Elder 		header->stripe_unit = 0;
991bb23e37aSAlex Elder 		header->stripe_count = 0;
992bb23e37aSAlex Elder 		header->features = 0;
993662518b1SAlex Elder 	} else {
994662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
995662518b1SAlex Elder 		kfree(header->snap_names);
996662518b1SAlex Elder 		kfree(header->snap_sizes);
997bb23e37aSAlex Elder 	}
9986a52325fSAlex Elder 
999bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1000621901d6SAlex Elder 
1001f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1002bb23e37aSAlex Elder 	header->snapc = snapc;
1003bb23e37aSAlex Elder 	header->snap_names = snap_names;
1004bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1005468521c1SAlex Elder 
1006602adf40SYehuda Sadeh 	return 0;
1007bb23e37aSAlex Elder out_2big:
1008bb23e37aSAlex Elder 	ret = -EIO;
10096a52325fSAlex Elder out_err:
1010bb23e37aSAlex Elder 	kfree(snap_sizes);
1011bb23e37aSAlex Elder 	kfree(snap_names);
1012bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1013bb23e37aSAlex Elder 	kfree(object_prefix);
1014ccece235SAlex Elder 
1015bb23e37aSAlex Elder 	return ret;
1016602adf40SYehuda Sadeh }
1017602adf40SYehuda Sadeh 
10189682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10199682fc6dSAlex Elder {
10209682fc6dSAlex Elder 	const char *snap_name;
10219682fc6dSAlex Elder 
10229682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10239682fc6dSAlex Elder 
10249682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10259682fc6dSAlex Elder 
10269682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10279682fc6dSAlex Elder 	while (which--)
10289682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10299682fc6dSAlex Elder 
10309682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10319682fc6dSAlex Elder }
10329682fc6dSAlex Elder 
103330d1cff8SAlex Elder /*
103430d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
103530d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
103630d1cff8SAlex Elder  */
103730d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
103830d1cff8SAlex Elder {
103930d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
104030d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
104130d1cff8SAlex Elder 
104230d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
104330d1cff8SAlex Elder 		return 1;
104430d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
104530d1cff8SAlex Elder }
104630d1cff8SAlex Elder 
104730d1cff8SAlex Elder /*
104830d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
104930d1cff8SAlex Elder  * present.
105030d1cff8SAlex Elder  *
105130d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
105230d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
105330d1cff8SAlex Elder  *
105430d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
105530d1cff8SAlex Elder  * reverse order, highest snapshot id first.
105630d1cff8SAlex Elder  */
10579682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10589682fc6dSAlex Elder {
10599682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
106030d1cff8SAlex Elder 	u64 *found;
10619682fc6dSAlex Elder 
106230d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
106330d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10649682fc6dSAlex Elder 
106530d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10669682fc6dSAlex Elder }
10679682fc6dSAlex Elder 
10682ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10692ad3d716SAlex Elder 					u64 snap_id)
107054cac61fSAlex Elder {
107154cac61fSAlex Elder 	u32 which;
1072da6a6b63SJosh Durgin 	const char *snap_name;
107354cac61fSAlex Elder 
107454cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
107554cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1076da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
107754cac61fSAlex Elder 
1078da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1079da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
108054cac61fSAlex Elder }
108154cac61fSAlex Elder 
10829e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10839e15b77dSAlex Elder {
10849e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10859e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10869e15b77dSAlex Elder 
108754cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
108854cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
108954cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10909e15b77dSAlex Elder 
109154cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10929e15b77dSAlex Elder }
10939e15b77dSAlex Elder 
10942ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10952ad3d716SAlex Elder 				u64 *snap_size)
1096602adf40SYehuda Sadeh {
10972ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10982ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10992ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11002ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11012ad3d716SAlex Elder 		u32 which;
110200f1f36fSAlex Elder 
11032ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11042ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11052ad3d716SAlex Elder 			return -ENOENT;
110600f1f36fSAlex Elder 
11072ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11082ad3d716SAlex Elder 	} else {
11092ad3d716SAlex Elder 		u64 size = 0;
11102ad3d716SAlex Elder 		int ret;
11112ad3d716SAlex Elder 
11122ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11132ad3d716SAlex Elder 		if (ret)
11142ad3d716SAlex Elder 			return ret;
11152ad3d716SAlex Elder 
11162ad3d716SAlex Elder 		*snap_size = size;
11172ad3d716SAlex Elder 	}
11182ad3d716SAlex Elder 	return 0;
11192ad3d716SAlex Elder }
11202ad3d716SAlex Elder 
11212ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11222ad3d716SAlex Elder 			u64 *snap_features)
11232ad3d716SAlex Elder {
11242ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11252ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11262ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11272ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11282ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11292ad3d716SAlex Elder 	} else {
11302ad3d716SAlex Elder 		u64 features = 0;
11312ad3d716SAlex Elder 		int ret;
11322ad3d716SAlex Elder 
11332ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11342ad3d716SAlex Elder 		if (ret)
11352ad3d716SAlex Elder 			return ret;
11362ad3d716SAlex Elder 
11372ad3d716SAlex Elder 		*snap_features = features;
11382ad3d716SAlex Elder 	}
11392ad3d716SAlex Elder 	return 0;
114000f1f36fSAlex Elder }
1141602adf40SYehuda Sadeh 
1142d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1143602adf40SYehuda Sadeh {
11448f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11452ad3d716SAlex Elder 	u64 size = 0;
11462ad3d716SAlex Elder 	u64 features = 0;
11472ad3d716SAlex Elder 	int ret;
11488b0241f8SAlex Elder 
11492ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11502ad3d716SAlex Elder 	if (ret)
11512ad3d716SAlex Elder 		return ret;
11522ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11532ad3d716SAlex Elder 	if (ret)
11542ad3d716SAlex Elder 		return ret;
11552ad3d716SAlex Elder 
11562ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11572ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11582ad3d716SAlex Elder 
11598b0241f8SAlex Elder 	return 0;
1160602adf40SYehuda Sadeh }
1161602adf40SYehuda Sadeh 
1162d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1163d1cf5788SAlex Elder {
1164d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1165d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1166200a6a8bSAlex Elder }
1167200a6a8bSAlex Elder 
11687d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
11697d5079aaSHimangi Saraogi {
11707d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
11717d5079aaSHimangi Saraogi 
11727d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
11737d5079aaSHimangi Saraogi }
11747d5079aaSHimangi Saraogi 
117598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1176602adf40SYehuda Sadeh {
117765ccfe21SAlex Elder 	char *name;
117865ccfe21SAlex Elder 	u64 segment;
117965ccfe21SAlex Elder 	int ret;
11803a96d5cdSJosh Durgin 	char *name_format;
1181602adf40SYehuda Sadeh 
118278c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
118365ccfe21SAlex Elder 	if (!name)
118465ccfe21SAlex Elder 		return NULL;
118565ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11863a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11873a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11883a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11892d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
119065ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11912d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
119265ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
119365ccfe21SAlex Elder 			segment, ret);
11947d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
119565ccfe21SAlex Elder 		name = NULL;
119665ccfe21SAlex Elder 	}
1197602adf40SYehuda Sadeh 
119865ccfe21SAlex Elder 	return name;
119965ccfe21SAlex Elder }
1200602adf40SYehuda Sadeh 
120165ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
120265ccfe21SAlex Elder {
120365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1204602adf40SYehuda Sadeh 
120565ccfe21SAlex Elder 	return offset & (segment_size - 1);
120665ccfe21SAlex Elder }
120765ccfe21SAlex Elder 
120865ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
120965ccfe21SAlex Elder 				u64 offset, u64 length)
121065ccfe21SAlex Elder {
121165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
121265ccfe21SAlex Elder 
121365ccfe21SAlex Elder 	offset &= segment_size - 1;
121465ccfe21SAlex Elder 
1215aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
121665ccfe21SAlex Elder 	if (offset + length > segment_size)
121765ccfe21SAlex Elder 		length = segment_size - offset;
121865ccfe21SAlex Elder 
121965ccfe21SAlex Elder 	return length;
1220602adf40SYehuda Sadeh }
1221602adf40SYehuda Sadeh 
1222602adf40SYehuda Sadeh /*
1223029bcbd8SJosh Durgin  * returns the size of an object in the image
1224029bcbd8SJosh Durgin  */
1225029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1226029bcbd8SJosh Durgin {
1227029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1228029bcbd8SJosh Durgin }
1229029bcbd8SJosh Durgin 
1230029bcbd8SJosh Durgin /*
1231602adf40SYehuda Sadeh  * bio helpers
1232602adf40SYehuda Sadeh  */
1233602adf40SYehuda Sadeh 
1234602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1235602adf40SYehuda Sadeh {
1236602adf40SYehuda Sadeh 	struct bio *tmp;
1237602adf40SYehuda Sadeh 
1238602adf40SYehuda Sadeh 	while (chain) {
1239602adf40SYehuda Sadeh 		tmp = chain;
1240602adf40SYehuda Sadeh 		chain = chain->bi_next;
1241602adf40SYehuda Sadeh 		bio_put(tmp);
1242602adf40SYehuda Sadeh 	}
1243602adf40SYehuda Sadeh }
1244602adf40SYehuda Sadeh 
1245602adf40SYehuda Sadeh /*
1246602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1247602adf40SYehuda Sadeh  */
1248602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1249602adf40SYehuda Sadeh {
12507988613bSKent Overstreet 	struct bio_vec bv;
12517988613bSKent Overstreet 	struct bvec_iter iter;
1252602adf40SYehuda Sadeh 	unsigned long flags;
1253602adf40SYehuda Sadeh 	void *buf;
1254602adf40SYehuda Sadeh 	int pos = 0;
1255602adf40SYehuda Sadeh 
1256602adf40SYehuda Sadeh 	while (chain) {
12577988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12587988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1259602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12607988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1261602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12627988613bSKent Overstreet 				       bv.bv_len - remainder);
12637988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
126485b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1265602adf40SYehuda Sadeh 			}
12667988613bSKent Overstreet 			pos += bv.bv_len;
1267602adf40SYehuda Sadeh 		}
1268602adf40SYehuda Sadeh 
1269602adf40SYehuda Sadeh 		chain = chain->bi_next;
1270602adf40SYehuda Sadeh 	}
1271602adf40SYehuda Sadeh }
1272602adf40SYehuda Sadeh 
1273602adf40SYehuda Sadeh /*
1274b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1275b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1276b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1277b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1278b9434c5bSAlex Elder  */
1279b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1280b9434c5bSAlex Elder {
1281b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1282b9434c5bSAlex Elder 
1283b9434c5bSAlex Elder 	rbd_assert(end > offset);
1284b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1285b9434c5bSAlex Elder 	while (offset < end) {
1286b9434c5bSAlex Elder 		size_t page_offset;
1287b9434c5bSAlex Elder 		size_t length;
1288b9434c5bSAlex Elder 		unsigned long flags;
1289b9434c5bSAlex Elder 		void *kaddr;
1290b9434c5bSAlex Elder 
1291491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1292491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1293b9434c5bSAlex Elder 		local_irq_save(flags);
1294b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1295b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1296e2156054SAlex Elder 		flush_dcache_page(*page);
1297b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1298b9434c5bSAlex Elder 		local_irq_restore(flags);
1299b9434c5bSAlex Elder 
1300b9434c5bSAlex Elder 		offset += length;
1301b9434c5bSAlex Elder 		page++;
1302b9434c5bSAlex Elder 	}
1303b9434c5bSAlex Elder }
1304b9434c5bSAlex Elder 
1305b9434c5bSAlex Elder /*
1306f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1307f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1308602adf40SYehuda Sadeh  */
1309f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1310f7760dadSAlex Elder 					unsigned int offset,
1311f7760dadSAlex Elder 					unsigned int len,
1312f7760dadSAlex Elder 					gfp_t gfpmask)
1313602adf40SYehuda Sadeh {
1314f7760dadSAlex Elder 	struct bio *bio;
1315602adf40SYehuda Sadeh 
13165341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1317f7760dadSAlex Elder 	if (!bio)
1318f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1319f7760dadSAlex Elder 
13205341a627SKent Overstreet 	bio_advance(bio, offset);
13214f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1322602adf40SYehuda Sadeh 
1323f7760dadSAlex Elder 	return bio;
1324602adf40SYehuda Sadeh }
1325602adf40SYehuda Sadeh 
1326f7760dadSAlex Elder /*
1327f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1328f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1329f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1330f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1331f7760dadSAlex Elder  *
1332f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1333f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1334f7760dadSAlex Elder  * the start of data to be cloned is located.
1335f7760dadSAlex Elder  *
1336f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1337f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1338f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1339f7760dadSAlex Elder  */
1340f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1341f7760dadSAlex Elder 					unsigned int *offset,
1342f7760dadSAlex Elder 					unsigned int len,
1343f7760dadSAlex Elder 					gfp_t gfpmask)
1344f7760dadSAlex Elder {
1345f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1346f7760dadSAlex Elder 	unsigned int off = *offset;
1347f7760dadSAlex Elder 	struct bio *chain = NULL;
1348f7760dadSAlex Elder 	struct bio **end;
1349602adf40SYehuda Sadeh 
1350f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1351602adf40SYehuda Sadeh 
13524f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1353f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1354602adf40SYehuda Sadeh 
1355f7760dadSAlex Elder 	end = &chain;
1356f7760dadSAlex Elder 	while (len) {
1357f7760dadSAlex Elder 		unsigned int bi_size;
1358f7760dadSAlex Elder 		struct bio *bio;
1359f7760dadSAlex Elder 
1360f5400b7aSAlex Elder 		if (!bi) {
1361f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1362f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1363f5400b7aSAlex Elder 		}
13644f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1365f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1366f7760dadSAlex Elder 		if (!bio)
1367f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1368f7760dadSAlex Elder 
1369f7760dadSAlex Elder 		*end = bio;
1370f7760dadSAlex Elder 		end = &bio->bi_next;
1371f7760dadSAlex Elder 
1372f7760dadSAlex Elder 		off += bi_size;
13734f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1374f7760dadSAlex Elder 			bi = bi->bi_next;
1375f7760dadSAlex Elder 			off = 0;
1376f7760dadSAlex Elder 		}
1377f7760dadSAlex Elder 		len -= bi_size;
1378f7760dadSAlex Elder 	}
1379f7760dadSAlex Elder 	*bio_src = bi;
1380f7760dadSAlex Elder 	*offset = off;
1381f7760dadSAlex Elder 
1382f7760dadSAlex Elder 	return chain;
1383f7760dadSAlex Elder out_err:
1384f7760dadSAlex Elder 	bio_chain_put(chain);
1385f7760dadSAlex Elder 
1386602adf40SYehuda Sadeh 	return NULL;
1387602adf40SYehuda Sadeh }
1388602adf40SYehuda Sadeh 
1389926f9b3fSAlex Elder /*
1390926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1391926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1392926f9b3fSAlex Elder  * again.
1393926f9b3fSAlex Elder  */
13946365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13956365d33aSAlex Elder {
13966365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13976365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13986365d33aSAlex Elder 
139957acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14009584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14016365d33aSAlex Elder 			obj_request);
14026365d33aSAlex Elder 	}
14036365d33aSAlex Elder }
14046365d33aSAlex Elder 
14056365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14066365d33aSAlex Elder {
14076365d33aSAlex Elder 	smp_mb();
14086365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14096365d33aSAlex Elder }
14106365d33aSAlex Elder 
141157acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
141257acbaa7SAlex Elder {
141357acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
141457acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
141557acbaa7SAlex Elder 
141657acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
141757acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14189584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
141957acbaa7SAlex Elder 			obj_request);
142057acbaa7SAlex Elder 	}
142157acbaa7SAlex Elder }
142257acbaa7SAlex Elder 
142357acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
142457acbaa7SAlex Elder {
142557acbaa7SAlex Elder 	smp_mb();
142657acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
142757acbaa7SAlex Elder }
142857acbaa7SAlex Elder 
14295679c59fSAlex Elder /*
14305679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14315679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14325679c59fSAlex Elder  *
14335679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14345679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14355679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14365679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14375679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14385679c59fSAlex Elder  */
14395679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14405679c59fSAlex Elder 				bool exists)
14415679c59fSAlex Elder {
14425679c59fSAlex Elder 	if (exists)
14435679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14445679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14455679c59fSAlex Elder 	smp_mb();
14465679c59fSAlex Elder }
14475679c59fSAlex Elder 
14485679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14495679c59fSAlex Elder {
14505679c59fSAlex Elder 	smp_mb();
14515679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14525679c59fSAlex Elder }
14535679c59fSAlex Elder 
14545679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14555679c59fSAlex Elder {
14565679c59fSAlex Elder 	smp_mb();
14575679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14585679c59fSAlex Elder }
14595679c59fSAlex Elder 
14609638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
14619638556aSIlya Dryomov {
14629638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
14639638556aSIlya Dryomov 
14649638556aSIlya Dryomov 	return obj_request->img_offset <
14659638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
14669638556aSIlya Dryomov }
14679638556aSIlya Dryomov 
1468bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1469bf0d5f50SAlex Elder {
147037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
147137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1472bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1473bf0d5f50SAlex Elder }
1474bf0d5f50SAlex Elder 
1475bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1476bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1477bf0d5f50SAlex Elder {
1478bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
147937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
148037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1481bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1482bf0d5f50SAlex Elder }
1483bf0d5f50SAlex Elder 
14840f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14850f2d5be7SAlex Elder {
14860f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14870f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14880f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14890f2d5be7SAlex Elder }
14900f2d5be7SAlex Elder 
1491e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1492e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1493bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1494bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1495bf0d5f50SAlex Elder {
1496bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
149737206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
149837206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1499e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1500e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1501e93f3152SAlex Elder 	else
1502bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1503bf0d5f50SAlex Elder }
1504bf0d5f50SAlex Elder 
1505bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1506bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1507bf0d5f50SAlex Elder {
150825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
150925dcf954SAlex Elder 
1510b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1511bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
151225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15136365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15146365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1515bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
151625dcf954SAlex Elder 	img_request->obj_request_count++;
151725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
151837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
151937206ee5SAlex Elder 		obj_request->which);
1520bf0d5f50SAlex Elder }
1521bf0d5f50SAlex Elder 
1522bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1523bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1524bf0d5f50SAlex Elder {
1525bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
152625dcf954SAlex Elder 
152737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
152837206ee5SAlex Elder 		obj_request->which);
1529bf0d5f50SAlex Elder 	list_del(&obj_request->links);
153025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
153125dcf954SAlex Elder 	img_request->obj_request_count--;
153225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
153325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15346365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1535bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1536bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
153725dcf954SAlex Elder 	obj_request->callback = NULL;
1538bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1539bf0d5f50SAlex Elder }
1540bf0d5f50SAlex Elder 
1541bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1542bf0d5f50SAlex Elder {
1543bf0d5f50SAlex Elder 	switch (type) {
15449969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1545bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1546788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1547bf0d5f50SAlex Elder 		return true;
1548bf0d5f50SAlex Elder 	default:
1549bf0d5f50SAlex Elder 		return false;
1550bf0d5f50SAlex Elder 	}
1551bf0d5f50SAlex Elder }
1552bf0d5f50SAlex Elder 
1553bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1554bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1555bf0d5f50SAlex Elder {
155671c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1557bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1558bf0d5f50SAlex Elder }
1559bf0d5f50SAlex Elder 
156071c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
156171c20a06SIlya Dryomov {
156271c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
156371c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
156471c20a06SIlya Dryomov }
156571c20a06SIlya Dryomov 
156671c20a06SIlya Dryomov /*
156771c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
156871c20a06SIlya Dryomov  * underlying osd request.
15692894e1d7SIlya Dryomov  *
15702894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
157171c20a06SIlya Dryomov  */
15722894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
15732894e1d7SIlya Dryomov 				  unsigned long timeout)
157471c20a06SIlya Dryomov {
15752894e1d7SIlya Dryomov 	long ret;
157671c20a06SIlya Dryomov 
157771c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
15782894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
15792894e1d7SIlya Dryomov 					&obj_request->completion,
15802894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
15812894e1d7SIlya Dryomov 	if (ret <= 0) {
15822894e1d7SIlya Dryomov 		if (ret == 0)
15832894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
158471c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
15852894e1d7SIlya Dryomov 	} else {
15862894e1d7SIlya Dryomov 		ret = 0;
15872894e1d7SIlya Dryomov 	}
15882894e1d7SIlya Dryomov 
15892894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
159071c20a06SIlya Dryomov 	return ret;
159171c20a06SIlya Dryomov }
159271c20a06SIlya Dryomov 
/* Wait for an object request with a timeout of 0 ("wait forever"). */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	const unsigned long no_timeout = 0;

	return __rbd_obj_request_wait(obj_request, no_timeout);
}
15972894e1d7SIlya Dryomov 
1598bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1599bf0d5f50SAlex Elder {
160055f27e09SAlex Elder 
160137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
160255f27e09SAlex Elder 
160355f27e09SAlex Elder 	/*
160455f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
160555f27e09SAlex Elder 	 * count for the image request.  We could instead use
160655f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
160755f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
160855f27e09SAlex Elder 	 */
160955f27e09SAlex Elder 	if (!img_request->result) {
161055f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
161155f27e09SAlex Elder 		u64 xferred = 0;
161255f27e09SAlex Elder 
161355f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
161455f27e09SAlex Elder 			xferred += obj_request->xferred;
161555f27e09SAlex Elder 		img_request->xferred = xferred;
161655f27e09SAlex Elder 	}
161755f27e09SAlex Elder 
1618bf0d5f50SAlex Elder 	if (img_request->callback)
1619bf0d5f50SAlex Elder 		img_request->callback(img_request);
1620bf0d5f50SAlex Elder 	else
1621bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1622bf0d5f50SAlex Elder }
1623bf0d5f50SAlex Elder 
16240c425248SAlex Elder /*
16250c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16260c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16270c425248SAlex Elder  * and currently never change thereafter.
16280c425248SAlex Elder  */
16290c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16300c425248SAlex Elder {
16310c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16320c425248SAlex Elder 	smp_mb();
16330c425248SAlex Elder }
16340c425248SAlex Elder 
16350c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16360c425248SAlex Elder {
16370c425248SAlex Elder 	smp_mb();
16380c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16390c425248SAlex Elder }
16400c425248SAlex Elder 
164190e98c52SGuangliang Zhao /*
164290e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
164390e98c52SGuangliang Zhao  */
164490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
164590e98c52SGuangliang Zhao {
164690e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
164790e98c52SGuangliang Zhao 	smp_mb();
164890e98c52SGuangliang Zhao }
164990e98c52SGuangliang Zhao 
165090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
165190e98c52SGuangliang Zhao {
165290e98c52SGuangliang Zhao 	smp_mb();
165390e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
165490e98c52SGuangliang Zhao }
165590e98c52SGuangliang Zhao 
16569849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16579849e986SAlex Elder {
16589849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16599849e986SAlex Elder 	smp_mb();
16609849e986SAlex Elder }
16619849e986SAlex Elder 
1662e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1663e93f3152SAlex Elder {
1664e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1665e93f3152SAlex Elder 	smp_mb();
1666e93f3152SAlex Elder }
1667e93f3152SAlex Elder 
16689849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16699849e986SAlex Elder {
16709849e986SAlex Elder 	smp_mb();
16719849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16729849e986SAlex Elder }
16739849e986SAlex Elder 
1674d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1675d0b2e944SAlex Elder {
1676d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1677d0b2e944SAlex Elder 	smp_mb();
1678d0b2e944SAlex Elder }
1679d0b2e944SAlex Elder 
1680a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1681a2acd00eSAlex Elder {
1682a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1683a2acd00eSAlex Elder 	smp_mb();
1684a2acd00eSAlex Elder }
1685a2acd00eSAlex Elder 
1686d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1687d0b2e944SAlex Elder {
1688d0b2e944SAlex Elder 	smp_mb();
1689d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1690d0b2e944SAlex Elder }
1691d0b2e944SAlex Elder 
16923b434a2aSJosh Durgin static enum obj_operation_type
16933b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
16943b434a2aSJosh Durgin {
16953b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
16963b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
16973b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
16983b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
16993b434a2aSJosh Durgin 	else
17003b434a2aSJosh Durgin 		return OBJ_OP_READ;
17013b434a2aSJosh Durgin }
17023b434a2aSJosh Durgin 
17036e2a4505SAlex Elder static void
17046e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17056e2a4505SAlex Elder {
1706b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1707b9434c5bSAlex Elder 	u64 length = obj_request->length;
1708b9434c5bSAlex Elder 
17096e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17106e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1711b9434c5bSAlex Elder 		xferred, length);
17126e2a4505SAlex Elder 	/*
171317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
171417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
171517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
171617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
171717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
171817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17196e2a4505SAlex Elder 	 */
1720b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17216e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1722b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17236e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1724b9434c5bSAlex Elder 		else
1725b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17266e2a4505SAlex Elder 		obj_request->result = 0;
1727b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1728b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1729b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1730b9434c5bSAlex Elder 		else
1731b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17326e2a4505SAlex Elder 	}
173317c1cc1dSJosh Durgin 	obj_request->xferred = length;
17346e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17356e2a4505SAlex Elder }
17366e2a4505SAlex Elder 
1737bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1738bf0d5f50SAlex Elder {
173937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
174037206ee5SAlex Elder 		obj_request->callback);
1741bf0d5f50SAlex Elder 	if (obj_request->callback)
1742bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1743788e2df3SAlex Elder 	else
1744788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1745bf0d5f50SAlex Elder }
1746bf0d5f50SAlex Elder 
1747c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1748bf0d5f50SAlex Elder {
174957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1750a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
175157acbaa7SAlex Elder 	bool layered = false;
175257acbaa7SAlex Elder 
175357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
175457acbaa7SAlex Elder 		img_request = obj_request->img_request;
175557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1756a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
175757acbaa7SAlex Elder 	}
17588b3e1a56SAlex Elder 
17598b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17608b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17618b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1762a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1763a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17648b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17658b3e1a56SAlex Elder 	else if (img_request)
17666e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17676e2a4505SAlex Elder 	else
176807741308SAlex Elder 		obj_request_done_set(obj_request);
1769bf0d5f50SAlex Elder }
1770bf0d5f50SAlex Elder 
1771c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1772bf0d5f50SAlex Elder {
17731b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
17741b83bef2SSage Weil 		obj_request->result, obj_request->length);
17751b83bef2SSage Weil 	/*
17768b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
17778b3e1a56SAlex Elder 	 * it to our originally-requested length.
17781b83bef2SSage Weil 	 */
17791b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
178007741308SAlex Elder 	obj_request_done_set(obj_request);
1781bf0d5f50SAlex Elder }
1782bf0d5f50SAlex Elder 
178390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
178490e98c52SGuangliang Zhao {
178590e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
178690e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
178790e98c52SGuangliang Zhao 	/*
178890e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
178990e98c52SGuangliang Zhao 	 * it to our originally-requested length.
179090e98c52SGuangliang Zhao 	 */
179190e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1792d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1793d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1794d0265de7SJosh Durgin 		obj_request->result = 0;
179590e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
179690e98c52SGuangliang Zhao }
179790e98c52SGuangliang Zhao 
/*
 * Handle completion of an osd stat op.  For a simple stat call
 * there's nothing to do beyond marking the request done.  We'll do
 * more if this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1807fbfab539SAlex Elder 
/*
 * Handle completion of an osd call op.  A request flagged as image
 * data is passed to rbd_osd_copyup_callback(); any other request is
 * simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18172761713dSIlya Dryomov 
181885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1819bf0d5f50SAlex Elder {
1820bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1821bf0d5f50SAlex Elder 	u16 opcode;
1822bf0d5f50SAlex Elder 
182385e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1824bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
182557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
182657acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
182757acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
182857acbaa7SAlex Elder 	} else {
182957acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
183057acbaa7SAlex Elder 	}
1831bf0d5f50SAlex Elder 
18321b83bef2SSage Weil 	if (osd_req->r_result < 0)
18331b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1834bf0d5f50SAlex Elder 
1835c47f9371SAlex Elder 	/*
1836c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18377ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18387ad18afaSChristoph Hellwig 	 * length field.
1839c47f9371SAlex Elder 	 */
18407665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1841c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18420ccd5926SIlya Dryomov 
184379528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1844bf0d5f50SAlex Elder 	switch (opcode) {
1845bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1846c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1847bf0d5f50SAlex Elder 		break;
18480ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1849e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1850e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
18510ccd5926SIlya Dryomov 		/* fall through */
1852bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1853e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1854c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1855bf0d5f50SAlex Elder 		break;
1856fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1857c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1858fbfab539SAlex Elder 		break;
185990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
186090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
186190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
186290e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
186390e98c52SGuangliang Zhao 		break;
186436be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
18652761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
18662761713dSIlya Dryomov 		break;
1867bf0d5f50SAlex Elder 	default:
18689584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1869bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1870bf0d5f50SAlex Elder 		break;
1871bf0d5f50SAlex Elder 	}
1872bf0d5f50SAlex Elder 
187307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1874bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1875bf0d5f50SAlex Elder }
1876bf0d5f50SAlex Elder 
18779d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1878430c28c3SAlex Elder {
1879430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
18808c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1881430c28c3SAlex Elder 
1882bb873b53SIlya Dryomov 	if (img_request)
1883bb873b53SIlya Dryomov 		osd_req->r_snapid = img_request->snap_id;
18849d4df01fSAlex Elder }
18859d4df01fSAlex Elder 
18869d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
18879d4df01fSAlex Elder {
18889d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
18899d4df01fSAlex Elder 
1890bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1891bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1892430c28c3SAlex Elder }
1893430c28c3SAlex Elder 
18940ccd5926SIlya Dryomov /*
18950ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
18960ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
18970ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
18980ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
18990ccd5926SIlya Dryomov  */
1900bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1901bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19026d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1903deb236b3SIlya Dryomov 					unsigned int num_ops,
1904430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1905bf0d5f50SAlex Elder {
1906bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1907bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1908bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1909bf0d5f50SAlex Elder 
191090e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
191190e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19126365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
191390e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19146d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
191590e98c52SGuangliang Zhao 		} else {
191690e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
191790e98c52SGuangliang Zhao 		}
1918bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1919bf0d5f50SAlex Elder 	}
1920bf0d5f50SAlex Elder 
19216d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1922deb236b3SIlya Dryomov 
1923deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1924bf0d5f50SAlex Elder 
1925bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1926deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
19272224d879SDavid Disseldorp 					  GFP_NOIO);
1928bf0d5f50SAlex Elder 	if (!osd_req)
192913d1ad16SIlya Dryomov 		goto fail;
1930bf0d5f50SAlex Elder 
193190e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1932bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1933430c28c3SAlex Elder 	else
1934bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1935bf0d5f50SAlex Elder 
1936bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1937bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1938bf0d5f50SAlex Elder 
19397627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1940d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
1941d30291b9SIlya Dryomov 			     obj_request->object_name))
1942d30291b9SIlya Dryomov 		goto fail;
1943bf0d5f50SAlex Elder 
194413d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
194513d1ad16SIlya Dryomov 		goto fail;
194613d1ad16SIlya Dryomov 
1947bf0d5f50SAlex Elder 	return osd_req;
194813d1ad16SIlya Dryomov 
194913d1ad16SIlya Dryomov fail:
195013d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
195113d1ad16SIlya Dryomov 	return NULL;
1952bf0d5f50SAlex Elder }
1953bf0d5f50SAlex Elder 
19540eefd470SAlex Elder /*
1955d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1956d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1957d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1958d3246fb0SJosh Durgin  * or zero op.
19590eefd470SAlex Elder  */
19600eefd470SAlex Elder static struct ceph_osd_request *
19610eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
19620eefd470SAlex Elder {
19630eefd470SAlex Elder 	struct rbd_img_request *img_request;
19640eefd470SAlex Elder 	struct ceph_snap_context *snapc;
19650eefd470SAlex Elder 	struct rbd_device *rbd_dev;
19660eefd470SAlex Elder 	struct ceph_osd_client *osdc;
19670eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
1968d3246fb0SJosh Durgin 	int num_osd_ops = 3;
19690eefd470SAlex Elder 
19700eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19710eefd470SAlex Elder 	img_request = obj_request->img_request;
19720eefd470SAlex Elder 	rbd_assert(img_request);
1973d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
1974d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
19750eefd470SAlex Elder 
1976d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
1977d3246fb0SJosh Durgin 		num_osd_ops = 2;
1978d3246fb0SJosh Durgin 
1979d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
19800eefd470SAlex Elder 
19810eefd470SAlex Elder 	snapc = img_request->snapc;
19820eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
19830eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1984d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
19852224d879SDavid Disseldorp 						false, GFP_NOIO);
19860eefd470SAlex Elder 	if (!osd_req)
198713d1ad16SIlya Dryomov 		goto fail;
19880eefd470SAlex Elder 
19890eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
19900eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
19910eefd470SAlex Elder 	osd_req->r_priv = obj_request;
19920eefd470SAlex Elder 
19937627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1994d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
1995d30291b9SIlya Dryomov 			     obj_request->object_name))
1996d30291b9SIlya Dryomov 		goto fail;
19970eefd470SAlex Elder 
199813d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
199913d1ad16SIlya Dryomov 		goto fail;
200013d1ad16SIlya Dryomov 
20010eefd470SAlex Elder 	return osd_req;
200213d1ad16SIlya Dryomov 
200313d1ad16SIlya Dryomov fail:
200413d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
200513d1ad16SIlya Dryomov 	return NULL;
20060eefd470SAlex Elder }
20070eefd470SAlex Elder 
20080eefd470SAlex Elder 
/*
 * Release a reference on an osd request.  Thin wrapper around
 * ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2013bf0d5f50SAlex Elder 
2014bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2015bf0d5f50SAlex Elder 
2016bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2017bf0d5f50SAlex Elder 						u64 offset, u64 length,
2018bf0d5f50SAlex Elder 						enum obj_request_type type)
2019bf0d5f50SAlex Elder {
2020bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2021bf0d5f50SAlex Elder 	size_t size;
2022bf0d5f50SAlex Elder 	char *name;
2023bf0d5f50SAlex Elder 
2024bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2025bf0d5f50SAlex Elder 
2026bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
20275a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2028f907ad55SAlex Elder 	if (!name)
2029bf0d5f50SAlex Elder 		return NULL;
2030bf0d5f50SAlex Elder 
20315a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2032f907ad55SAlex Elder 	if (!obj_request) {
2033f907ad55SAlex Elder 		kfree(name);
2034f907ad55SAlex Elder 		return NULL;
2035f907ad55SAlex Elder 	}
2036f907ad55SAlex Elder 
2037bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2038bf0d5f50SAlex Elder 	obj_request->offset = offset;
2039bf0d5f50SAlex Elder 	obj_request->length = length;
2040926f9b3fSAlex Elder 	obj_request->flags = 0;
2041bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2042bf0d5f50SAlex Elder 	obj_request->type = type;
2043bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2044788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2045bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2046bf0d5f50SAlex Elder 
204737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
204837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
204937206ee5SAlex Elder 
2050bf0d5f50SAlex Elder 	return obj_request;
2051bf0d5f50SAlex Elder }
2052bf0d5f50SAlex Elder 
2053bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2054bf0d5f50SAlex Elder {
2055bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2056bf0d5f50SAlex Elder 
2057bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2058bf0d5f50SAlex Elder 
205937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
206037206ee5SAlex Elder 
2061bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2062bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2063bf0d5f50SAlex Elder 
2064bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2065bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2066bf0d5f50SAlex Elder 
2067bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2068bf0d5f50SAlex Elder 	switch (obj_request->type) {
20699969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20709969ebc5SAlex Elder 		break;		/* Nothing to do */
2071bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2072bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2073bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2074bf0d5f50SAlex Elder 		break;
2075788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
2076788e2df3SAlex Elder 		if (obj_request->pages)
2077788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2078788e2df3SAlex Elder 						obj_request->page_count);
2079788e2df3SAlex Elder 		break;
2080bf0d5f50SAlex Elder 	}
2081bf0d5f50SAlex Elder 
2082f907ad55SAlex Elder 	kfree(obj_request->object_name);
2083868311b1SAlex Elder 	obj_request->object_name = NULL;
2084868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2085bf0d5f50SAlex Elder }
2086bf0d5f50SAlex Elder 
2087fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2088fb65d228SAlex Elder 
2089fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2090fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2091fb65d228SAlex Elder {
2092fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2093fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2094fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2095fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2096fb65d228SAlex Elder }
2097fb65d228SAlex Elder 
2098bf0d5f50SAlex Elder /*
2099a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2100a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2101a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2102a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2103a2acd00eSAlex Elder  */
2104a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2105a2acd00eSAlex Elder {
2106a2acd00eSAlex Elder 	int counter;
2107a2acd00eSAlex Elder 
2108a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2109a2acd00eSAlex Elder 		return;
2110a2acd00eSAlex Elder 
2111a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2112a2acd00eSAlex Elder 	if (counter > 0)
2113a2acd00eSAlex Elder 		return;
2114a2acd00eSAlex Elder 
2115a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2116a2acd00eSAlex Elder 
2117a2acd00eSAlex Elder 	if (!counter)
2118a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2119a2acd00eSAlex Elder 	else
21209584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2121a2acd00eSAlex Elder }
2122a2acd00eSAlex Elder 
2123a2acd00eSAlex Elder /*
2124a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2125a2acd00eSAlex Elder  * parent.
2126a2acd00eSAlex Elder  *
2127a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2128a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2129a2acd00eSAlex Elder  * false otherwise.
2130a2acd00eSAlex Elder  */
2131a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2132a2acd00eSAlex Elder {
2133ae43e9d0SIlya Dryomov 	int counter = 0;
2134a2acd00eSAlex Elder 
2135a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2136a2acd00eSAlex Elder 		return false;
2137a2acd00eSAlex Elder 
2138ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2139ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2140a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2141ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2142a2acd00eSAlex Elder 
2143a2acd00eSAlex Elder 	if (counter < 0)
21449584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2145a2acd00eSAlex Elder 
2146ae43e9d0SIlya Dryomov 	return counter > 0;
2147a2acd00eSAlex Elder }
2148a2acd00eSAlex Elder 
2149bf0d5f50SAlex Elder /*
2150bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2151bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2152bf0d5f50SAlex Elder  * (if there is one).
2153bf0d5f50SAlex Elder  */
2154cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2155cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2156bf0d5f50SAlex Elder 					u64 offset, u64 length,
21576d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21584e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2159bf0d5f50SAlex Elder {
2160bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2161bf0d5f50SAlex Elder 
21627a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2163bf0d5f50SAlex Elder 	if (!img_request)
2164bf0d5f50SAlex Elder 		return NULL;
2165bf0d5f50SAlex Elder 
2166bf0d5f50SAlex Elder 	img_request->rq = NULL;
2167bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2168bf0d5f50SAlex Elder 	img_request->offset = offset;
2169bf0d5f50SAlex Elder 	img_request->length = length;
21700c425248SAlex Elder 	img_request->flags = 0;
217190e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
217290e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
217390e98c52SGuangliang Zhao 		img_request->snapc = snapc;
217490e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21750c425248SAlex Elder 		img_request_write_set(img_request);
21764e752f0aSJosh Durgin 		img_request->snapc = snapc;
21770c425248SAlex Elder 	} else {
2178bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21790c425248SAlex Elder 	}
2180a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2181d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2182bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2183bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2184bf0d5f50SAlex Elder 	img_request->callback = NULL;
2185a5a337d4SAlex Elder 	img_request->result = 0;
2186bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2187bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2188bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2189bf0d5f50SAlex Elder 
219037206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
21916d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
219237206ee5SAlex Elder 
2193bf0d5f50SAlex Elder 	return img_request;
2194bf0d5f50SAlex Elder }
2195bf0d5f50SAlex Elder 
2196bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2197bf0d5f50SAlex Elder {
2198bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2199bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2200bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2201bf0d5f50SAlex Elder 
2202bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2203bf0d5f50SAlex Elder 
220437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
220537206ee5SAlex Elder 
2206bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2207bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
220825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2209bf0d5f50SAlex Elder 
2210a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2211a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2212a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2213a2acd00eSAlex Elder 	}
2214a2acd00eSAlex Elder 
2215bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2216bef95455SJosh Durgin 		img_request_discard_test(img_request))
2217812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2218bf0d5f50SAlex Elder 
22191c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2220bf0d5f50SAlex Elder }
2221bf0d5f50SAlex Elder 
2222e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2223e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2224e93f3152SAlex Elder 					u64 img_offset, u64 length)
2225e93f3152SAlex Elder {
2226e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2227e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2228e93f3152SAlex Elder 
2229e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2230e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2231e93f3152SAlex Elder 
22324e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22336d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2234e93f3152SAlex Elder 	if (!parent_request)
2235e93f3152SAlex Elder 		return NULL;
2236e93f3152SAlex Elder 
2237e93f3152SAlex Elder 	img_request_child_set(parent_request);
2238e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2239e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2240e93f3152SAlex Elder 
2241e93f3152SAlex Elder 	return parent_request;
2242e93f3152SAlex Elder }
2243e93f3152SAlex Elder 
2244e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2245e93f3152SAlex Elder {
2246e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2247e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2248e93f3152SAlex Elder 
2249e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2250e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2251e93f3152SAlex Elder 
2252e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2253e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2254e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2255e93f3152SAlex Elder 
2256e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2257e93f3152SAlex Elder }
2258e93f3152SAlex Elder 
22591217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22601217857fSAlex Elder {
22616365d33aSAlex Elder 	struct rbd_img_request *img_request;
22621217857fSAlex Elder 	unsigned int xferred;
22631217857fSAlex Elder 	int result;
22648b3e1a56SAlex Elder 	bool more;
22651217857fSAlex Elder 
22666365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22676365d33aSAlex Elder 	img_request = obj_request->img_request;
22686365d33aSAlex Elder 
22691217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22701217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22711217857fSAlex Elder 	result = obj_request->result;
22721217857fSAlex Elder 	if (result) {
22731217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22746d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22756d2940c8SGuangliang Zhao 
227690e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
227790e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
227890e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
227990e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
228090e98c52SGuangliang Zhao 		else
228190e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22821217857fSAlex Elder 
22839584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22846d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
22856d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
22869584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
22871217857fSAlex Elder 			result, xferred);
22881217857fSAlex Elder 		if (!img_request->result)
22891217857fSAlex Elder 			img_request->result = result;
2290082a75daSIlya Dryomov 		/*
2291082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2292082a75daSIlya Dryomov 		 * bytes in case of error.
2293082a75daSIlya Dryomov 		 */
2294082a75daSIlya Dryomov 		xferred = obj_request->length;
22951217857fSAlex Elder 	}
22961217857fSAlex Elder 
2297f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2298f1a4739fSAlex Elder 
2299f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2300f1a4739fSAlex Elder 		obj_request->pages = NULL;
2301f1a4739fSAlex Elder 		obj_request->page_count = 0;
2302f1a4739fSAlex Elder 	}
2303f1a4739fSAlex Elder 
23048b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23058b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23068b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23078b3e1a56SAlex Elder 	} else {
23088b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23097ad18afaSChristoph Hellwig 
23107ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23117ad18afaSChristoph Hellwig 		if (!more)
23127ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23138b3e1a56SAlex Elder 	}
23148b3e1a56SAlex Elder 
23158b3e1a56SAlex Elder 	return more;
23161217857fSAlex Elder }
23171217857fSAlex Elder 
23182169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23192169238dSAlex Elder {
23202169238dSAlex Elder 	struct rbd_img_request *img_request;
23212169238dSAlex Elder 	u32 which = obj_request->which;
23222169238dSAlex Elder 	bool more = true;
23232169238dSAlex Elder 
23246365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23252169238dSAlex Elder 	img_request = obj_request->img_request;
23262169238dSAlex Elder 
23272169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23282169238dSAlex Elder 	rbd_assert(img_request != NULL);
23292169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23302169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23312169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23322169238dSAlex Elder 
23332169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23342169238dSAlex Elder 	if (which != img_request->next_completion)
23352169238dSAlex Elder 		goto out;
23362169238dSAlex Elder 
23372169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23382169238dSAlex Elder 		rbd_assert(more);
23392169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23402169238dSAlex Elder 
23412169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23422169238dSAlex Elder 			break;
23431217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23442169238dSAlex Elder 		which++;
23452169238dSAlex Elder 	}
23462169238dSAlex Elder 
23472169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23482169238dSAlex Elder 	img_request->next_completion = which;
23492169238dSAlex Elder out:
23502169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23510f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23522169238dSAlex Elder 
23532169238dSAlex Elder 	if (!more)
23542169238dSAlex Elder 		rbd_img_request_complete(img_request);
23552169238dSAlex Elder }
23562169238dSAlex Elder 
2357f1a4739fSAlex Elder /*
23583b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23593b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23603b434a2aSJosh Durgin  * osd operations already to the object request.
23613b434a2aSJosh Durgin  */
23623b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23633b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23643b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23653b434a2aSJosh Durgin 				unsigned int num_ops)
23663b434a2aSJosh Durgin {
23673b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23683b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23693b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23703b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23713b434a2aSJosh Durgin 	u64 length = obj_request->length;
23723b434a2aSJosh Durgin 	u64 img_end;
23733b434a2aSJosh Durgin 	u16 opcode;
23743b434a2aSJosh Durgin 
23753b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2376d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2377d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2378d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23793b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23803b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23813b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23823b434a2aSJosh Durgin 		} else {
23833b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23843b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
23853b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
23863b434a2aSJosh Durgin 
23873b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
23883b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
23893b434a2aSJosh Durgin 			else
23903b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
23913b434a2aSJosh Durgin 		}
23923b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2393e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2394e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2395e30b7577SIlya Dryomov 		else
23963b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
23973b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
23983b434a2aSJosh Durgin 					object_size, object_size);
23993b434a2aSJosh Durgin 		num_ops++;
24003b434a2aSJosh Durgin 	} else {
24013b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24023b434a2aSJosh Durgin 	}
24033b434a2aSJosh Durgin 
24047e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2405144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24067e868b6eSIlya Dryomov 	else
24077e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24087e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24097e868b6eSIlya Dryomov 
24103b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24113b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24123b434a2aSJosh Durgin 					obj_request->bio_list, length);
24133b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24143b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24153b434a2aSJosh Durgin 					obj_request->pages, length,
24163b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24173b434a2aSJosh Durgin 
24183b434a2aSJosh Durgin 	/* Discards are also writes */
24193b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24203b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24213b434a2aSJosh Durgin 	else
24223b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24233b434a2aSJosh Durgin }
24243b434a2aSJosh Durgin 
24253b434a2aSJosh Durgin /*
2426f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2427f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2428f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2429f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2430f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2431f1a4739fSAlex Elder  * all data described by the image request.
2432f1a4739fSAlex Elder  */
2433f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2434f1a4739fSAlex Elder 					enum obj_request_type type,
2435f1a4739fSAlex Elder 					void *data_desc)
2436bf0d5f50SAlex Elder {
2437bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2438bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2439bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2440a158073cSJingoo Han 	struct bio *bio_list = NULL;
2441f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2442a158073cSJingoo Han 	struct page **pages = NULL;
24436d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24447da22d29SAlex Elder 	u64 img_offset;
2445bf0d5f50SAlex Elder 	u64 resid;
2446bf0d5f50SAlex Elder 
2447f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2448f1a4739fSAlex Elder 		(int)type, data_desc);
244937206ee5SAlex Elder 
24507da22d29SAlex Elder 	img_offset = img_request->offset;
2451bf0d5f50SAlex Elder 	resid = img_request->length;
24524dda41d3SAlex Elder 	rbd_assert(resid > 0);
24533b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2454f1a4739fSAlex Elder 
2455f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2456f1a4739fSAlex Elder 		bio_list = data_desc;
24574f024f37SKent Overstreet 		rbd_assert(img_offset ==
24584f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
245990e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2460f1a4739fSAlex Elder 		pages = data_desc;
2461f1a4739fSAlex Elder 	}
2462f1a4739fSAlex Elder 
2463bf0d5f50SAlex Elder 	while (resid) {
24642fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2465bf0d5f50SAlex Elder 		const char *object_name;
2466bf0d5f50SAlex Elder 		u64 offset;
2467bf0d5f50SAlex Elder 		u64 length;
2468bf0d5f50SAlex Elder 
24697da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2470bf0d5f50SAlex Elder 		if (!object_name)
2471bf0d5f50SAlex Elder 			goto out_unwind;
24727da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
24737da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2474bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2475f1a4739fSAlex Elder 						offset, length, type);
247678c2a44aSAlex Elder 		/* object request has its own copy of the object name */
247778c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2478bf0d5f50SAlex Elder 		if (!obj_request)
2479bf0d5f50SAlex Elder 			goto out_unwind;
248062054da6SIlya Dryomov 
248103507db6SJosh Durgin 		/*
248203507db6SJosh Durgin 		 * set obj_request->img_request before creating the
248303507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
248403507db6SJosh Durgin 		 */
248503507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2486bf0d5f50SAlex Elder 
2487f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2488f1a4739fSAlex Elder 			unsigned int clone_size;
2489f1a4739fSAlex Elder 
2490bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2491bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2492f1a4739fSAlex Elder 			obj_request->bio_list =
2493f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2494f1a4739fSAlex Elder 								&bio_offset,
2495f1a4739fSAlex Elder 								clone_size,
24962224d879SDavid Disseldorp 								GFP_NOIO);
2497bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
249862054da6SIlya Dryomov 				goto out_unwind;
249990e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2500f1a4739fSAlex Elder 			unsigned int page_count;
2501f1a4739fSAlex Elder 
2502f1a4739fSAlex Elder 			obj_request->pages = pages;
2503f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2504f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2505f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2506f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2507f1a4739fSAlex Elder 			pages += page_count;
2508f1a4739fSAlex Elder 		}
2509bf0d5f50SAlex Elder 
25106d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25116d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25122fa12320SAlex Elder 					obj_request);
25132fa12320SAlex Elder 		if (!osd_req)
251462054da6SIlya Dryomov 			goto out_unwind;
25153b434a2aSJosh Durgin 
25162fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25172169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25187da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2519bf0d5f50SAlex Elder 
25203b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25213b434a2aSJosh Durgin 
25223b434a2aSJosh Durgin 		rbd_img_request_get(img_request);
25233b434a2aSJosh Durgin 
25247da22d29SAlex Elder 		img_offset += length;
2525bf0d5f50SAlex Elder 		resid -= length;
2526bf0d5f50SAlex Elder 	}
2527bf0d5f50SAlex Elder 
2528bf0d5f50SAlex Elder 	return 0;
2529bf0d5f50SAlex Elder 
2530bf0d5f50SAlex Elder out_unwind:
2531bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
253242dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2533bf0d5f50SAlex Elder 
2534bf0d5f50SAlex Elder 	return -ENOMEM;
2535bf0d5f50SAlex Elder }
2536bf0d5f50SAlex Elder 
25373d7efd18SAlex Elder static void
25382761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25390eefd470SAlex Elder {
25400eefd470SAlex Elder 	struct rbd_img_request *img_request;
25410eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2542ebda6408SAlex Elder 	struct page **pages;
25430eefd470SAlex Elder 	u32 page_count;
25440eefd470SAlex Elder 
25452761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25462761713dSIlya Dryomov 
2547d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2548d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25490eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25500eefd470SAlex Elder 	img_request = obj_request->img_request;
25510eefd470SAlex Elder 	rbd_assert(img_request);
25520eefd470SAlex Elder 
25530eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25540eefd470SAlex Elder 	rbd_assert(rbd_dev);
25550eefd470SAlex Elder 
2556ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2557ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25580eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2559ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2560ebda6408SAlex Elder 	rbd_assert(page_count);
2561ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2562ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25630eefd470SAlex Elder 
25640eefd470SAlex Elder 	/*
25650eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25660eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25670eefd470SAlex Elder 	 * successful short write, so if the request was successful
25680eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25690eefd470SAlex Elder 	 */
25700eefd470SAlex Elder 	if (!obj_request->result)
25710eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25720eefd470SAlex Elder 
25732761713dSIlya Dryomov 	obj_request_done_set(obj_request);
25740eefd470SAlex Elder }
25750eefd470SAlex Elder 
25760eefd470SAlex Elder static void
25773d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25783d7efd18SAlex Elder {
25793d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25800eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25810eefd470SAlex Elder 	struct ceph_osd_client *osdc;
25820eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25833d7efd18SAlex Elder 	struct page **pages;
2584d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2585ebda6408SAlex Elder 	u32 page_count;
2586bbea1c1aSAlex Elder 	int img_result;
2587ebda6408SAlex Elder 	u64 parent_length;
25883d7efd18SAlex Elder 
25893d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25903d7efd18SAlex Elder 
25913d7efd18SAlex Elder 	/* First get what we need from the image request */
25923d7efd18SAlex Elder 
25933d7efd18SAlex Elder 	pages = img_request->copyup_pages;
25943d7efd18SAlex Elder 	rbd_assert(pages != NULL);
25953d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2596ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2597ebda6408SAlex Elder 	rbd_assert(page_count);
2598ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
25993d7efd18SAlex Elder 
26003d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26013d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2602b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2603bbea1c1aSAlex Elder 	img_result = img_request->result;
2604ebda6408SAlex Elder 	parent_length = img_request->length;
2605ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
26063d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26073d7efd18SAlex Elder 
260891c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
260991c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26103d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26113d7efd18SAlex Elder 
2612bbea1c1aSAlex Elder 	/*
2613bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2614bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2615bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2616bbea1c1aSAlex Elder 	 */
2617bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2618bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2619bbea1c1aSAlex Elder 
2620bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2621bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2622bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2623bbea1c1aSAlex Elder 		if (!img_result)
2624bbea1c1aSAlex Elder 			return;
2625bbea1c1aSAlex Elder 	}
2626bbea1c1aSAlex Elder 
2627bbea1c1aSAlex Elder 	if (img_result)
26280eefd470SAlex Elder 		goto out_err;
26293d7efd18SAlex Elder 
26308785b1d4SAlex Elder 	/*
26318785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26320ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26338785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26348785b1d4SAlex Elder 	 * original request, and release the old one.
26358785b1d4SAlex Elder 	 */
2636bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26370eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26380eefd470SAlex Elder 	if (!osd_req)
26390eefd470SAlex Elder 		goto out_err;
26408785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26410eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26420eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2643ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26443d7efd18SAlex Elder 
26450eefd470SAlex Elder 	/* Initialize the copyup op */
26460eefd470SAlex Elder 
26470eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2648ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26490eefd470SAlex Elder 						false, false);
26500eefd470SAlex Elder 
2651d3246fb0SJosh Durgin 	/* Add the other op(s) */
26520ccd5926SIlya Dryomov 
2653d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2654d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26550eefd470SAlex Elder 
26560eefd470SAlex Elder 	/* All set, send it off. */
26570eefd470SAlex Elder 
26580eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2659bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2660bbea1c1aSAlex Elder 	if (!img_result)
26610eefd470SAlex Elder 		return;
26620eefd470SAlex Elder out_err:
26630eefd470SAlex Elder 	/* Record the error code and complete the request */
26640eefd470SAlex Elder 
2665bbea1c1aSAlex Elder 	orig_request->result = img_result;
26660eefd470SAlex Elder 	orig_request->xferred = 0;
26673d7efd18SAlex Elder 	obj_request_done_set(orig_request);
26683d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
26693d7efd18SAlex Elder }
26703d7efd18SAlex Elder 
26713d7efd18SAlex Elder /*
26723d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26733d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26743d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26753d7efd18SAlex Elder  * object request from the image request does not exist.
26763d7efd18SAlex Elder  *
26773d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26783d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26793d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26803d7efd18SAlex Elder  * the original object request for the copyup operation.
26813d7efd18SAlex Elder  *
26823d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
26833d7efd18SAlex Elder  * object request and mark it done so it gets completed.
26843d7efd18SAlex Elder  */
26853d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
26863d7efd18SAlex Elder {
26873d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
26883d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
26893d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
26903d7efd18SAlex Elder 	u64 img_offset;
26913d7efd18SAlex Elder 	u64 length;
26923d7efd18SAlex Elder 	struct page **pages = NULL;
26933d7efd18SAlex Elder 	u32 page_count;
26943d7efd18SAlex Elder 	int result;
26953d7efd18SAlex Elder 
26963d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2697b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
26983d7efd18SAlex Elder 
26993d7efd18SAlex Elder 	img_request = obj_request->img_request;
27003d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
27013d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
27023d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27033d7efd18SAlex Elder 
27043d7efd18SAlex Elder 	/*
27053d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27063d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27073d7efd18SAlex Elder 	 */
27083d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27093d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27103d7efd18SAlex Elder 
27113d7efd18SAlex Elder 	/*
2712a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2713a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2714a9e8ba2cSAlex Elder 	 * necessary.
2715a9e8ba2cSAlex Elder 	 */
2716a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2717a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2718a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2719a9e8ba2cSAlex Elder 	}
2720a9e8ba2cSAlex Elder 
2721a9e8ba2cSAlex Elder 	/*
27223d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27233d7efd18SAlex Elder 	 * from the parent.
27243d7efd18SAlex Elder 	 */
27253d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27263d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27273d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27283d7efd18SAlex Elder 		result = PTR_ERR(pages);
27293d7efd18SAlex Elder 		pages = NULL;
27303d7efd18SAlex Elder 		goto out_err;
27313d7efd18SAlex Elder 	}
27323d7efd18SAlex Elder 
27333d7efd18SAlex Elder 	result = -ENOMEM;
2734e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2735e93f3152SAlex Elder 						img_offset, length);
27363d7efd18SAlex Elder 	if (!parent_request)
27373d7efd18SAlex Elder 		goto out_err;
27383d7efd18SAlex Elder 
27393d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27403d7efd18SAlex Elder 	if (result)
27413d7efd18SAlex Elder 		goto out_err;
27423d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2743ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27443d7efd18SAlex Elder 
27453d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
27463d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27473d7efd18SAlex Elder 	if (!result)
27483d7efd18SAlex Elder 		return 0;
27493d7efd18SAlex Elder 
27503d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2751ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27523d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27533d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27543d7efd18SAlex Elder out_err:
27553d7efd18SAlex Elder 	if (pages)
27563d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27573d7efd18SAlex Elder 	if (parent_request)
27583d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27593d7efd18SAlex Elder 	obj_request->result = result;
27603d7efd18SAlex Elder 	obj_request->xferred = 0;
27613d7efd18SAlex Elder 	obj_request_done_set(obj_request);
27623d7efd18SAlex Elder 
27633d7efd18SAlex Elder 	return result;
27643d7efd18SAlex Elder }
27653d7efd18SAlex Elder 
2766c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2767c5b5ef6cSAlex Elder {
2768c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2769638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2770c5b5ef6cSAlex Elder 	int result;
2771c5b5ef6cSAlex Elder 
2772c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2773c5b5ef6cSAlex Elder 
2774c5b5ef6cSAlex Elder 	/*
2775c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2776c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2777c5b5ef6cSAlex Elder 	 * we're done with the request.
2778c5b5ef6cSAlex Elder 	 */
2779c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2780c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2781912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2782c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2783c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2784c5b5ef6cSAlex Elder 
2785c5b5ef6cSAlex Elder 	result = obj_request->result;
2786c5b5ef6cSAlex Elder 	obj_request->result = 0;
2787c5b5ef6cSAlex Elder 
2788c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2789c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2790c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2791c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2792c5b5ef6cSAlex Elder 
2793638f5abeSAlex Elder 	/*
2794638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2795638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2796638f5abeSAlex Elder 	 * and re-submit the original write request.
2797638f5abeSAlex Elder 	 */
2798638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2799638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2800638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2801638f5abeSAlex Elder 
2802638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2803638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2804638f5abeSAlex Elder 		if (!result)
2805638f5abeSAlex Elder 			return;
2806638f5abeSAlex Elder 	}
2807c5b5ef6cSAlex Elder 
2808c5b5ef6cSAlex Elder 	/*
2809c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2810c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2811c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2812c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2813c5b5ef6cSAlex Elder 	 */
2814c5b5ef6cSAlex Elder 	if (!result) {
2815c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2816c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2817c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2818c5b5ef6cSAlex Elder 	} else if (result) {
2819c5b5ef6cSAlex Elder 		orig_request->result = result;
28203d7efd18SAlex Elder 		goto out;
2821c5b5ef6cSAlex Elder 	}
2822c5b5ef6cSAlex Elder 
2823c5b5ef6cSAlex Elder 	/*
2824c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2825c5b5ef6cSAlex Elder 	 * whether the target object exists.
2826c5b5ef6cSAlex Elder 	 */
2827b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
28283d7efd18SAlex Elder out:
2829c5b5ef6cSAlex Elder 	if (orig_request->result)
2830c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2831c5b5ef6cSAlex Elder }
2832c5b5ef6cSAlex Elder 
2833c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2834c5b5ef6cSAlex Elder {
2835c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2836c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2837c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2838c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2839c5b5ef6cSAlex Elder 	u32 page_count;
2840c5b5ef6cSAlex Elder 	size_t size;
2841c5b5ef6cSAlex Elder 	int ret;
2842c5b5ef6cSAlex Elder 
2843c5b5ef6cSAlex Elder 	/*
2844c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2845c5b5ef6cSAlex Elder 	 *     le64 length;
2846c5b5ef6cSAlex Elder 	 *     struct {
2847c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2848c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2849c5b5ef6cSAlex Elder 	 *     } mtime;
2850c5b5ef6cSAlex Elder 	 */
2851c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2852c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2853c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2854c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2855c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2856c5b5ef6cSAlex Elder 
2857c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2858c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2859c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2860c5b5ef6cSAlex Elder 	if (!stat_request)
2861c5b5ef6cSAlex Elder 		goto out;
2862c5b5ef6cSAlex Elder 
2863c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2864c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2865c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2866c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2867c5b5ef6cSAlex Elder 
2868c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2869c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
28706d2940c8SGuangliang Zhao 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2871c5b5ef6cSAlex Elder 						   stat_request);
2872c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2873c5b5ef6cSAlex Elder 		goto out;
2874c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2875c5b5ef6cSAlex Elder 
2876144cba14SYan, Zheng 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2877c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2878c5b5ef6cSAlex Elder 					false, false);
28799d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2880c5b5ef6cSAlex Elder 
2881c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2882c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2883c5b5ef6cSAlex Elder out:
2884c5b5ef6cSAlex Elder 	if (ret)
2885c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2886c5b5ef6cSAlex Elder 
2887c5b5ef6cSAlex Elder 	return ret;
2888c5b5ef6cSAlex Elder }
2889c5b5ef6cSAlex Elder 
289070d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2891b454e36dSAlex Elder {
2892b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2893a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2894b454e36dSAlex Elder 
2895b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2896b454e36dSAlex Elder 
2897b454e36dSAlex Elder 	img_request = obj_request->img_request;
2898b454e36dSAlex Elder 	rbd_assert(img_request);
2899a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2900b454e36dSAlex Elder 
290170d045f6SIlya Dryomov 	/* Reads */
29021c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29031c220881SJosh Durgin 	    !img_request_discard_test(img_request))
290470d045f6SIlya Dryomov 		return true;
2905b454e36dSAlex Elder 
290670d045f6SIlya Dryomov 	/* Non-layered writes */
290770d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
290870d045f6SIlya Dryomov 		return true;
290970d045f6SIlya Dryomov 
291070d045f6SIlya Dryomov 	/*
291170d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
291270d045f6SIlya Dryomov 	 * share any data with the parent.
291370d045f6SIlya Dryomov 	 */
291470d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
291570d045f6SIlya Dryomov 		return true;
291670d045f6SIlya Dryomov 
291770d045f6SIlya Dryomov 	/*
2918c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2919c622d226SGuangliang Zhao 	 * parent data there is anyway.
2920c622d226SGuangliang Zhao 	 */
2921c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2922c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2923c622d226SGuangliang Zhao 		return true;
2924c622d226SGuangliang Zhao 
2925c622d226SGuangliang Zhao 	/*
292670d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
292770d045f6SIlya Dryomov 	 * already been copied.
292870d045f6SIlya Dryomov 	 */
292970d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
293070d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
293170d045f6SIlya Dryomov 		return true;
293270d045f6SIlya Dryomov 
293370d045f6SIlya Dryomov 	return false;
293470d045f6SIlya Dryomov }
293570d045f6SIlya Dryomov 
293670d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
293770d045f6SIlya Dryomov {
293870d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2939b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2940b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2941b454e36dSAlex Elder 
2942b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2943b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2944b454e36dSAlex Elder 
2945b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2946b454e36dSAlex Elder 	}
2947b454e36dSAlex Elder 
2948b454e36dSAlex Elder 	/*
29493d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29503d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29513d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29523d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2953b454e36dSAlex Elder 	 */
295470d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29553d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29563d7efd18SAlex Elder 
29573d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2958b454e36dSAlex Elder 
2959b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2960b454e36dSAlex Elder }
2961b454e36dSAlex Elder 
2962bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2963bf0d5f50SAlex Elder {
2964bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
296546faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2966663ae2ccSIlya Dryomov 	int ret = 0;
2967bf0d5f50SAlex Elder 
296837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2969bf0d5f50SAlex Elder 
2970663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2971663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2972b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2973bf0d5f50SAlex Elder 		if (ret)
2974663ae2ccSIlya Dryomov 			goto out_put_ireq;
2975bf0d5f50SAlex Elder 	}
2976bf0d5f50SAlex Elder 
2977663ae2ccSIlya Dryomov out_put_ireq:
2978663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2979663ae2ccSIlya Dryomov 	return ret;
2980bf0d5f50SAlex Elder }
2981bf0d5f50SAlex Elder 
29828b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29838b3e1a56SAlex Elder {
29848b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2985a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2986a9e8ba2cSAlex Elder 	u64 obj_end;
298702c74fbaSAlex Elder 	u64 img_xferred;
298802c74fbaSAlex Elder 	int img_result;
29898b3e1a56SAlex Elder 
29908b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
29918b3e1a56SAlex Elder 
299202c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
299302c74fbaSAlex Elder 
29948b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
299502c74fbaSAlex Elder 	img_xferred = img_request->xferred;
299602c74fbaSAlex Elder 	img_result = img_request->result;
299702c74fbaSAlex Elder 	rbd_img_request_put(img_request);
299802c74fbaSAlex Elder 
299902c74fbaSAlex Elder 	/*
300002c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
300102c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
300202c74fbaSAlex Elder 	 * original request.
300302c74fbaSAlex Elder 	 */
3004a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3005a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
300602c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
300702c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
300802c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
30098b3e1a56SAlex Elder 
301002c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
301102c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
301202c74fbaSAlex Elder 		if (!img_result)
301302c74fbaSAlex Elder 			return;
301402c74fbaSAlex Elder 	}
301502c74fbaSAlex Elder 
301602c74fbaSAlex Elder 	obj_request->result = img_result;
3017a9e8ba2cSAlex Elder 	if (obj_request->result)
3018a9e8ba2cSAlex Elder 		goto out;
3019a9e8ba2cSAlex Elder 
3020a9e8ba2cSAlex Elder 	/*
3021a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3022a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3023a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3024a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3025a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3026a9e8ba2cSAlex Elder 	 */
3027a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3028a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3029a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3030a9e8ba2cSAlex Elder 		u64 xferred = 0;
3031a9e8ba2cSAlex Elder 
3032a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3033a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3034a9e8ba2cSAlex Elder 					obj_request->img_offset;
3035a9e8ba2cSAlex Elder 
303602c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3037a9e8ba2cSAlex Elder 	} else {
303802c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3039a9e8ba2cSAlex Elder 	}
3040a9e8ba2cSAlex Elder out:
30418b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30428b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30438b3e1a56SAlex Elder }
30448b3e1a56SAlex Elder 
30458b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30468b3e1a56SAlex Elder {
30478b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30488b3e1a56SAlex Elder 	int result;
30498b3e1a56SAlex Elder 
30508b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30518b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30528b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30535b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30548b3e1a56SAlex Elder 
30558b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3056e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30578b3e1a56SAlex Elder 						obj_request->img_offset,
3058e93f3152SAlex Elder 						obj_request->length);
30598b3e1a56SAlex Elder 	result = -ENOMEM;
30608b3e1a56SAlex Elder 	if (!img_request)
30618b3e1a56SAlex Elder 		goto out_err;
30628b3e1a56SAlex Elder 
30635b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3064f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3065f1a4739fSAlex Elder 						obj_request->bio_list);
30665b2ab72dSAlex Elder 	else
30675b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30685b2ab72dSAlex Elder 						obj_request->pages);
30698b3e1a56SAlex Elder 	if (result)
30708b3e1a56SAlex Elder 		goto out_err;
30718b3e1a56SAlex Elder 
30728b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30738b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30748b3e1a56SAlex Elder 	if (result)
30758b3e1a56SAlex Elder 		goto out_err;
30768b3e1a56SAlex Elder 
30778b3e1a56SAlex Elder 	return;
30788b3e1a56SAlex Elder out_err:
30798b3e1a56SAlex Elder 	if (img_request)
30808b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30818b3e1a56SAlex Elder 	obj_request->result = result;
30828b3e1a56SAlex Elder 	obj_request->xferred = 0;
30838b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30848b3e1a56SAlex Elder }
30858b3e1a56SAlex Elder 
3086922dab61SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
3087922dab61SIlya Dryomov static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
3088922dab61SIlya Dryomov 
3089922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3090922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3091b8d70035SAlex Elder {
3092922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
30932169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3094b8d70035SAlex Elder 	int ret;
3095b8d70035SAlex Elder 
3096922dab61SIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
3097922dab61SIlya Dryomov 	     cookie, notify_id);
309852bb1f9bSIlya Dryomov 
309952bb1f9bSIlya Dryomov 	/*
310052bb1f9bSIlya Dryomov 	 * Until adequate refresh error handling is in place, there is
310152bb1f9bSIlya Dryomov 	 * not much we can do here, except warn.
310252bb1f9bSIlya Dryomov 	 *
310352bb1f9bSIlya Dryomov 	 * See http://tracker.ceph.com/issues/5040
310452bb1f9bSIlya Dryomov 	 */
3105e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3106e627db08SAlex Elder 	if (ret)
31079584d508SIlya Dryomov 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
3108b8d70035SAlex Elder 
3109922dab61SIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3110922dab61SIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3111922dab61SIlya Dryomov 				   NULL, 0);
311252bb1f9bSIlya Dryomov 	if (ret)
31139584d508SIlya Dryomov 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
3114b8d70035SAlex Elder }
3115b8d70035SAlex Elder 
3116922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3117bb040aa0SIlya Dryomov {
3118922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3119bb040aa0SIlya Dryomov 	int ret;
3120bb040aa0SIlya Dryomov 
3121922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3122bb040aa0SIlya Dryomov 
3123922dab61SIlya Dryomov 	__rbd_dev_header_unwatch_sync(rbd_dev);
3124bb040aa0SIlya Dryomov 
3125922dab61SIlya Dryomov 	ret = rbd_dev_header_watch_sync(rbd_dev);
3126bb040aa0SIlya Dryomov 	if (ret) {
3127922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
3128922dab61SIlya Dryomov 		return;
3129bb040aa0SIlya Dryomov 	}
3130bb040aa0SIlya Dryomov 
3131922dab61SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
3132922dab61SIlya Dryomov 	if (ret)
3133922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
3134bb040aa0SIlya Dryomov }
3135bb040aa0SIlya Dryomov 
3136bb040aa0SIlya Dryomov /*
3137b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
31389969ebc5SAlex Elder  */
3139b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
31409969ebc5SAlex Elder {
31419969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3142922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
31439969ebc5SAlex Elder 
3144922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
31459969ebc5SAlex Elder 
3146922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3147922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3148922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3149922dab61SIlya Dryomov 	if (IS_ERR(handle))
3150922dab61SIlya Dryomov 		return PTR_ERR(handle);
31519969ebc5SAlex Elder 
3152922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
31538eb87565SAlex Elder 	return 0;
31549969ebc5SAlex Elder }
31559969ebc5SAlex Elder 
3156c525f036SIlya Dryomov static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3157fca27065SIlya Dryomov {
3158922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3159922dab61SIlya Dryomov 	int ret;
3160b30a01f2SIlya Dryomov 
3161922dab61SIlya Dryomov 	if (!rbd_dev->watch_handle)
3162922dab61SIlya Dryomov 		return;
3163b30a01f2SIlya Dryomov 
3164922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3165922dab61SIlya Dryomov 	if (ret)
3166922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3167b30a01f2SIlya Dryomov 
3168922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3169c525f036SIlya Dryomov }
3170c525f036SIlya Dryomov 
3171c525f036SIlya Dryomov /*
3172c525f036SIlya Dryomov  * Tear down a watch request, synchronously.
3173c525f036SIlya Dryomov  */
3174c525f036SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3175c525f036SIlya Dryomov {
3176c525f036SIlya Dryomov 	__rbd_dev_header_unwatch_sync(rbd_dev);
3177811c6688SIlya Dryomov 
3178811c6688SIlya Dryomov 	dout("%s flushing notifies\n", __func__);
3179811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3180fca27065SIlya Dryomov }
3181fca27065SIlya Dryomov 
318236be9a76SAlex Elder /*
3183f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3184f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
318536be9a76SAlex Elder  */
318636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
318736be9a76SAlex Elder 			     const char *object_name,
318836be9a76SAlex Elder 			     const char *class_name,
318936be9a76SAlex Elder 			     const char *method_name,
31904157976bSAlex Elder 			     const void *outbound,
319136be9a76SAlex Elder 			     size_t outbound_size,
31924157976bSAlex Elder 			     void *inbound,
3193e2a58ee5SAlex Elder 			     size_t inbound_size)
319436be9a76SAlex Elder {
31952169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
319636be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
319736be9a76SAlex Elder 	struct page **pages;
319836be9a76SAlex Elder 	u32 page_count;
319936be9a76SAlex Elder 	int ret;
320036be9a76SAlex Elder 
320136be9a76SAlex Elder 	/*
32026010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
32036010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
32046010a451SAlex Elder 	 * also supply outbound data--parameters for the object
32056010a451SAlex Elder 	 * method.  Currently if this is present it will be a
32066010a451SAlex Elder 	 * snapshot id.
320736be9a76SAlex Elder 	 */
320836be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
320936be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
321036be9a76SAlex Elder 	if (IS_ERR(pages))
321136be9a76SAlex Elder 		return PTR_ERR(pages);
321236be9a76SAlex Elder 
321336be9a76SAlex Elder 	ret = -ENOMEM;
32146010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
321536be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
321636be9a76SAlex Elder 	if (!obj_request)
321736be9a76SAlex Elder 		goto out;
321836be9a76SAlex Elder 
321936be9a76SAlex Elder 	obj_request->pages = pages;
322036be9a76SAlex Elder 	obj_request->page_count = page_count;
322136be9a76SAlex Elder 
32226d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3223deb236b3SIlya Dryomov 						  obj_request);
322436be9a76SAlex Elder 	if (!obj_request->osd_req)
322536be9a76SAlex Elder 		goto out;
322636be9a76SAlex Elder 
3227c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
322804017e29SAlex Elder 					class_name, method_name);
322904017e29SAlex Elder 	if (outbound_size) {
323004017e29SAlex Elder 		struct ceph_pagelist *pagelist;
323104017e29SAlex Elder 
323204017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
323304017e29SAlex Elder 		if (!pagelist)
323404017e29SAlex Elder 			goto out;
323504017e29SAlex Elder 
323604017e29SAlex Elder 		ceph_pagelist_init(pagelist);
323704017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
323804017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
323904017e29SAlex Elder 						pagelist);
324004017e29SAlex Elder 	}
3241a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3242a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
324344cd188dSAlex Elder 					0, false, false);
32449d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3245430c28c3SAlex Elder 
324636be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
324736be9a76SAlex Elder 	if (ret)
324836be9a76SAlex Elder 		goto out;
324936be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
325036be9a76SAlex Elder 	if (ret)
325136be9a76SAlex Elder 		goto out;
325236be9a76SAlex Elder 
325336be9a76SAlex Elder 	ret = obj_request->result;
325436be9a76SAlex Elder 	if (ret < 0)
325536be9a76SAlex Elder 		goto out;
325657385b51SAlex Elder 
325757385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
325857385b51SAlex Elder 	ret = (int)obj_request->xferred;
3259903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
326036be9a76SAlex Elder out:
326136be9a76SAlex Elder 	if (obj_request)
326236be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
326336be9a76SAlex Elder 	else
326436be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
326536be9a76SAlex Elder 
326636be9a76SAlex Elder 	return ret;
326736be9a76SAlex Elder }
326836be9a76SAlex Elder 
32697ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3270bc1ecc65SIlya Dryomov {
32717ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
32727ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3273bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
32744e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3275bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3276bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
32776d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
32784e752f0aSJosh Durgin 	u64 mapping_size;
3279bc1ecc65SIlya Dryomov 	int result;
3280bc1ecc65SIlya Dryomov 
32817ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
32827ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
32837ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
32847ad18afaSChristoph Hellwig 		result = -EIO;
32857ad18afaSChristoph Hellwig 		goto err;
32867ad18afaSChristoph Hellwig 	}
32877ad18afaSChristoph Hellwig 
3288c2df40dfSMike Christie 	if (req_op(rq) == REQ_OP_DISCARD)
328990e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
3290c2df40dfSMike Christie 	else if (req_op(rq) == REQ_OP_WRITE)
32916d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
32926d2940c8SGuangliang Zhao 	else
32936d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
32946d2940c8SGuangliang Zhao 
3295bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3296bc1ecc65SIlya Dryomov 
3297bc1ecc65SIlya Dryomov 	if (!length) {
3298bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3299bc1ecc65SIlya Dryomov 		result = 0;
3300bc1ecc65SIlya Dryomov 		goto err_rq;
3301bc1ecc65SIlya Dryomov 	}
3302bc1ecc65SIlya Dryomov 
33036d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
3304bc1ecc65SIlya Dryomov 
33056d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
3306bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
3307bc1ecc65SIlya Dryomov 			result = -EROFS;
3308bc1ecc65SIlya Dryomov 			goto err_rq;
3309bc1ecc65SIlya Dryomov 		}
3310bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3311bc1ecc65SIlya Dryomov 	}
3312bc1ecc65SIlya Dryomov 
3313bc1ecc65SIlya Dryomov 	/*
3314bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3315bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3316bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3317bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3318bc1ecc65SIlya Dryomov 	 */
3319bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3320bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3321bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3322bc1ecc65SIlya Dryomov 		result = -ENXIO;
3323bc1ecc65SIlya Dryomov 		goto err_rq;
3324bc1ecc65SIlya Dryomov 	}
3325bc1ecc65SIlya Dryomov 
3326bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3327bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3328bc1ecc65SIlya Dryomov 			 length);
3329bc1ecc65SIlya Dryomov 		result = -EINVAL;
3330bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3331bc1ecc65SIlya Dryomov 	}
3332bc1ecc65SIlya Dryomov 
33337ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
33347ad18afaSChristoph Hellwig 
33354e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
33364e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
33376d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
33384e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
33394e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
33404e752f0aSJosh Durgin 	}
33414e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
33424e752f0aSJosh Durgin 
33434e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3344bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
33454e752f0aSJosh Durgin 			 length, mapping_size);
3346bc1ecc65SIlya Dryomov 		result = -EIO;
3347bc1ecc65SIlya Dryomov 		goto err_rq;
3348bc1ecc65SIlya Dryomov 	}
3349bc1ecc65SIlya Dryomov 
33506d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
33514e752f0aSJosh Durgin 					     snapc);
3352bc1ecc65SIlya Dryomov 	if (!img_request) {
3353bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3354bc1ecc65SIlya Dryomov 		goto err_rq;
3355bc1ecc65SIlya Dryomov 	}
3356bc1ecc65SIlya Dryomov 	img_request->rq = rq;
335770b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
3358bc1ecc65SIlya Dryomov 
335990e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
336090e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
336190e98c52SGuangliang Zhao 					      NULL);
336290e98c52SGuangliang Zhao 	else
336390e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
336490e98c52SGuangliang Zhao 					      rq->bio);
3365bc1ecc65SIlya Dryomov 	if (result)
3366bc1ecc65SIlya Dryomov 		goto err_img_request;
3367bc1ecc65SIlya Dryomov 
3368bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
3369bc1ecc65SIlya Dryomov 	if (result)
3370bc1ecc65SIlya Dryomov 		goto err_img_request;
3371bc1ecc65SIlya Dryomov 
3372bc1ecc65SIlya Dryomov 	return;
3373bc1ecc65SIlya Dryomov 
3374bc1ecc65SIlya Dryomov err_img_request:
3375bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3376bc1ecc65SIlya Dryomov err_rq:
3377bc1ecc65SIlya Dryomov 	if (result)
3378bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
33796d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
33804e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
33817ad18afaSChristoph Hellwig err:
33827ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
3383bc1ecc65SIlya Dryomov }
3384bc1ecc65SIlya Dryomov 
33857ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
33867ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3387bc1ecc65SIlya Dryomov {
33887ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
33897ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3390bc1ecc65SIlya Dryomov 
33917ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
33927ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
3393bf0d5f50SAlex Elder }
3394bf0d5f50SAlex Elder 
3395602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3396602adf40SYehuda Sadeh {
3397602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3398602adf40SYehuda Sadeh 
3399602adf40SYehuda Sadeh 	if (!disk)
3400602adf40SYehuda Sadeh 		return;
3401602adf40SYehuda Sadeh 
3402a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3403a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3404602adf40SYehuda Sadeh 		del_gendisk(disk);
3405602adf40SYehuda Sadeh 		if (disk->queue)
3406602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
34077ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
3408a0cab924SAlex Elder 	}
3409602adf40SYehuda Sadeh 	put_disk(disk);
3410602adf40SYehuda Sadeh }
3411602adf40SYehuda Sadeh 
3412788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3413788e2df3SAlex Elder 				const char *object_name,
34147097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3415788e2df3SAlex Elder 
3416788e2df3SAlex Elder {
34172169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3418788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3419788e2df3SAlex Elder 	struct page **pages = NULL;
3420788e2df3SAlex Elder 	u32 page_count;
34211ceae7efSAlex Elder 	size_t size;
3422788e2df3SAlex Elder 	int ret;
3423788e2df3SAlex Elder 
3424788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3425788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3426788e2df3SAlex Elder 	if (IS_ERR(pages))
3427a8d42056SJan Kara 		return PTR_ERR(pages);
3428788e2df3SAlex Elder 
3429788e2df3SAlex Elder 	ret = -ENOMEM;
3430788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3431788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3432788e2df3SAlex Elder 	if (!obj_request)
3433788e2df3SAlex Elder 		goto out;
3434788e2df3SAlex Elder 
3435788e2df3SAlex Elder 	obj_request->pages = pages;
3436788e2df3SAlex Elder 	obj_request->page_count = page_count;
3437788e2df3SAlex Elder 
34386d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3439deb236b3SIlya Dryomov 						  obj_request);
3440788e2df3SAlex Elder 	if (!obj_request->osd_req)
3441788e2df3SAlex Elder 		goto out;
3442788e2df3SAlex Elder 
3443c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3444c99d2d4aSAlex Elder 					offset, length, 0, 0);
3445406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3446a4ce40a9SAlex Elder 					obj_request->pages,
344744cd188dSAlex Elder 					obj_request->length,
344844cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
344944cd188dSAlex Elder 					false, false);
34509d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3451430c28c3SAlex Elder 
3452788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3453788e2df3SAlex Elder 	if (ret)
3454788e2df3SAlex Elder 		goto out;
3455788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3456788e2df3SAlex Elder 	if (ret)
3457788e2df3SAlex Elder 		goto out;
3458788e2df3SAlex Elder 
3459788e2df3SAlex Elder 	ret = obj_request->result;
3460788e2df3SAlex Elder 	if (ret < 0)
3461788e2df3SAlex Elder 		goto out;
34621ceae7efSAlex Elder 
34631ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
34641ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3465903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
346623ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
346723ed6e13SAlex Elder 	ret = (int)size;
3468788e2df3SAlex Elder out:
3469788e2df3SAlex Elder 	if (obj_request)
3470788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3471788e2df3SAlex Elder 	else
3472788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3473788e2df3SAlex Elder 
3474788e2df3SAlex Elder 	return ret;
3475788e2df3SAlex Elder }
3476788e2df3SAlex Elder 
3477602adf40SYehuda Sadeh /*
3478662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3479662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3480662518b1SAlex Elder  * information about the image.
34814156d998SAlex Elder  */
348299a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
34834156d998SAlex Elder {
34844156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
34854156d998SAlex Elder 	u32 snap_count = 0;
34864156d998SAlex Elder 	u64 names_size = 0;
34874156d998SAlex Elder 	u32 want_count;
34884156d998SAlex Elder 	int ret;
34894156d998SAlex Elder 
34904156d998SAlex Elder 	/*
34914156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
34924156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
34934156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
34944156d998SAlex Elder 	 * the number of snapshots could change by the time we read
34954156d998SAlex Elder 	 * it in, in which case we re-read it.
34964156d998SAlex Elder 	 */
34974156d998SAlex Elder 	do {
34984156d998SAlex Elder 		size_t size;
34994156d998SAlex Elder 
35004156d998SAlex Elder 		kfree(ondisk);
35014156d998SAlex Elder 
35024156d998SAlex Elder 		size = sizeof (*ondisk);
35034156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
35044156d998SAlex Elder 		size += names_size;
35054156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
35064156d998SAlex Elder 		if (!ondisk)
3507662518b1SAlex Elder 			return -ENOMEM;
35084156d998SAlex Elder 
3509c41d13a3SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
35107097f8dfSAlex Elder 				       0, size, ondisk);
35114156d998SAlex Elder 		if (ret < 0)
3512662518b1SAlex Elder 			goto out;
3513c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
35144156d998SAlex Elder 			ret = -ENXIO;
351506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
351606ecc6cbSAlex Elder 				size, ret);
3517662518b1SAlex Elder 			goto out;
35184156d998SAlex Elder 		}
35194156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
35204156d998SAlex Elder 			ret = -ENXIO;
352106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3522662518b1SAlex Elder 			goto out;
35234156d998SAlex Elder 		}
35244156d998SAlex Elder 
35254156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
35264156d998SAlex Elder 		want_count = snap_count;
35274156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
35284156d998SAlex Elder 	} while (snap_count != want_count);
35294156d998SAlex Elder 
3530662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3531662518b1SAlex Elder out:
35324156d998SAlex Elder 	kfree(ondisk);
35334156d998SAlex Elder 
3534dfc5606dSYehuda Sadeh 	return ret;
3535602adf40SYehuda Sadeh }
3536602adf40SYehuda Sadeh 
353715228edeSAlex Elder /*
353815228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
353915228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
354015228edeSAlex Elder  */
354115228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
354215228edeSAlex Elder {
354315228edeSAlex Elder 	u64 snap_id;
354415228edeSAlex Elder 
354515228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
354615228edeSAlex Elder 		return;
354715228edeSAlex Elder 
354815228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
354915228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
355015228edeSAlex Elder 		return;
355115228edeSAlex Elder 
355215228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
355315228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
355415228edeSAlex Elder }
355515228edeSAlex Elder 
35569875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
35579875201eSJosh Durgin {
35589875201eSJosh Durgin 	sector_t size;
35599875201eSJosh Durgin 
35609875201eSJosh Durgin 	/*
3561811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3562811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
3563811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
35649875201eSJosh Durgin 	 */
3565811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3566811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
35679875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
35689875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
35699875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
35709875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
35719875201eSJosh Durgin 	}
35729875201eSJosh Durgin }
35739875201eSJosh Durgin 
3574cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
35751fe5e993SAlex Elder {
3576e627db08SAlex Elder 	u64 mapping_size;
35771fe5e993SAlex Elder 	int ret;
35781fe5e993SAlex Elder 
3579cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
35803b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3581a720ae09SIlya Dryomov 
3582a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
358352bb1f9bSIlya Dryomov 	if (ret)
358473e39e4dSIlya Dryomov 		goto out;
358515228edeSAlex Elder 
3586e8f59b59SIlya Dryomov 	/*
3587e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
3588e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
3589e8f59b59SIlya Dryomov 	 */
3590e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
3591e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
3592e8f59b59SIlya Dryomov 		if (ret)
359373e39e4dSIlya Dryomov 			goto out;
3594e8f59b59SIlya Dryomov 	}
3595e8f59b59SIlya Dryomov 
35965ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
35975ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
35985ff1108cSIlya Dryomov 	} else {
35995ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
360015228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
36015ff1108cSIlya Dryomov 	}
36025ff1108cSIlya Dryomov 
360373e39e4dSIlya Dryomov out:
3604cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
360573e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
36069875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
36071fe5e993SAlex Elder 
360873e39e4dSIlya Dryomov 	return ret;
36091fe5e993SAlex Elder }
36101fe5e993SAlex Elder 
36117ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
36127ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
36137ad18afaSChristoph Hellwig 		unsigned int numa_node)
36147ad18afaSChristoph Hellwig {
36157ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
36167ad18afaSChristoph Hellwig 
36177ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
36187ad18afaSChristoph Hellwig 	return 0;
36197ad18afaSChristoph Hellwig }
36207ad18afaSChristoph Hellwig 
36217ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
36227ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
36237ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
36247ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
36257ad18afaSChristoph Hellwig };
36267ad18afaSChristoph Hellwig 
3627602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3628602adf40SYehuda Sadeh {
3629602adf40SYehuda Sadeh 	struct gendisk *disk;
3630602adf40SYehuda Sadeh 	struct request_queue *q;
3631593a9e7bSAlex Elder 	u64 segment_size;
36327ad18afaSChristoph Hellwig 	int err;
3633602adf40SYehuda Sadeh 
3634602adf40SYehuda Sadeh 	/* create gendisk info */
36357e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
36367e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
36377e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3638602adf40SYehuda Sadeh 	if (!disk)
36391fcdb8aaSAlex Elder 		return -ENOMEM;
3640602adf40SYehuda Sadeh 
3641f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3642de71a297SAlex Elder 		 rbd_dev->dev_id);
3643602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3644dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
36457e513d43SIlya Dryomov 	if (single_major)
36467e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3647602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3648602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3649602adf40SYehuda Sadeh 
36507ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
36517ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
3652b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
36537ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
3654b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
36557ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
36567ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
36577ad18afaSChristoph Hellwig 
36587ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
36597ad18afaSChristoph Hellwig 	if (err)
3660602adf40SYehuda Sadeh 		goto out_disk;
3661029bcbd8SJosh Durgin 
36627ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
36637ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
36647ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
36657ad18afaSChristoph Hellwig 		goto out_tag_set;
36667ad18afaSChristoph Hellwig 	}
36677ad18afaSChristoph Hellwig 
3668d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3669d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
3670593a9e7bSAlex Elder 
3671029bcbd8SJosh Durgin 	/* set io sizes to object size */
3672593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3673593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
36740d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
3675d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
3676593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3677593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3678593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3679029bcbd8SJosh Durgin 
368090e98c52SGuangliang Zhao 	/* enable the discard support */
368190e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
368290e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
368390e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
36842bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
3685b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
368690e98c52SGuangliang Zhao 
3687bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
3688bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
3689bae818eeSRonny Hegewald 
3690602adf40SYehuda Sadeh 	disk->queue = q;
3691602adf40SYehuda Sadeh 
3692602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3693602adf40SYehuda Sadeh 
3694602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3695602adf40SYehuda Sadeh 
3696602adf40SYehuda Sadeh 	return 0;
36977ad18afaSChristoph Hellwig out_tag_set:
36987ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
3699602adf40SYehuda Sadeh out_disk:
3700602adf40SYehuda Sadeh 	put_disk(disk);
37017ad18afaSChristoph Hellwig 	return err;
3702602adf40SYehuda Sadeh }
3703602adf40SYehuda Sadeh 
3704dfc5606dSYehuda Sadeh /*
3705dfc5606dSYehuda Sadeh   sysfs
3706dfc5606dSYehuda Sadeh */
3707602adf40SYehuda Sadeh 
3708593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3709593a9e7bSAlex Elder {
3710593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3711593a9e7bSAlex Elder }
3712593a9e7bSAlex Elder 
3713dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3714dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3715602adf40SYehuda Sadeh {
3716593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3717dfc5606dSYehuda Sadeh 
3718fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3719fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3720602adf40SYehuda Sadeh }
3721602adf40SYehuda Sadeh 
372234b13184SAlex Elder /*
372334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
372434b13184SAlex Elder  * necessarily the base image.
372534b13184SAlex Elder  */
372634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
372734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
372834b13184SAlex Elder {
372934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
373034b13184SAlex Elder 
373134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
373234b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
373334b13184SAlex Elder }
373434b13184SAlex Elder 
3735dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3736dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3737602adf40SYehuda Sadeh {
3738593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3739dfc5606dSYehuda Sadeh 
3740fc71d833SAlex Elder 	if (rbd_dev->major)
3741dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3742fc71d833SAlex Elder 
3743fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3744dd82fff1SIlya Dryomov }
3745fc71d833SAlex Elder 
3746dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3747dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3748dd82fff1SIlya Dryomov {
3749dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3750dd82fff1SIlya Dryomov 
3751dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3752dfc5606dSYehuda Sadeh }
3753dfc5606dSYehuda Sadeh 
3754dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3755dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3756dfc5606dSYehuda Sadeh {
3757593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3758dfc5606dSYehuda Sadeh 
37591dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
3760033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
3761dfc5606dSYehuda Sadeh }
3762dfc5606dSYehuda Sadeh 
3763dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3764dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3765dfc5606dSYehuda Sadeh {
3766593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3767dfc5606dSYehuda Sadeh 
37680d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3769dfc5606dSYehuda Sadeh }
3770dfc5606dSYehuda Sadeh 
37719bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
37729bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
37739bb2f334SAlex Elder {
37749bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
37759bb2f334SAlex Elder 
37760d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
37770d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
37789bb2f334SAlex Elder }
37799bb2f334SAlex Elder 
3780dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3781dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3782dfc5606dSYehuda Sadeh {
3783593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3784dfc5606dSYehuda Sadeh 
3785a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
37860d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3787a92ffdf8SAlex Elder 
3788a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3789dfc5606dSYehuda Sadeh }
3790dfc5606dSYehuda Sadeh 
3791589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3792589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3793589d30e0SAlex Elder {
3794589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3795589d30e0SAlex Elder 
37960d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3797589d30e0SAlex Elder }
3798589d30e0SAlex Elder 
379934b13184SAlex Elder /*
380034b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
380134b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
380234b13184SAlex Elder  */
3803dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3804dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3805dfc5606dSYehuda Sadeh 			     char *buf)
3806dfc5606dSYehuda Sadeh {
3807593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3808dfc5606dSYehuda Sadeh 
38090d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3810dfc5606dSYehuda Sadeh }
3811dfc5606dSYehuda Sadeh 
381286b00e0dSAlex Elder /*
3813ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
3814ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
3815ff96128fSIlya Dryomov  * image)".
381686b00e0dSAlex Elder  */
381786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
381886b00e0dSAlex Elder 			       struct device_attribute *attr,
381986b00e0dSAlex Elder 			       char *buf)
382086b00e0dSAlex Elder {
382186b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3822ff96128fSIlya Dryomov 	ssize_t count = 0;
382386b00e0dSAlex Elder 
3824ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
382586b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
382686b00e0dSAlex Elder 
3827ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3828ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
382986b00e0dSAlex Elder 
3830ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
3831ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
3832ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
3833ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
3834ff96128fSIlya Dryomov 			    "overlap %llu\n",
3835ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
3836ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
3837ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
3838ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
3839ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
3840ff96128fSIlya Dryomov 	}
384186b00e0dSAlex Elder 
384286b00e0dSAlex Elder 	return count;
384386b00e0dSAlex Elder }
384486b00e0dSAlex Elder 
3845dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3846dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3847dfc5606dSYehuda Sadeh 				 const char *buf,
3848dfc5606dSYehuda Sadeh 				 size_t size)
3849dfc5606dSYehuda Sadeh {
3850593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3851b813623aSAlex Elder 	int ret;
3852602adf40SYehuda Sadeh 
3853cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3854e627db08SAlex Elder 	if (ret)
385552bb1f9bSIlya Dryomov 		return ret;
3856b813623aSAlex Elder 
385752bb1f9bSIlya Dryomov 	return size;
3858dfc5606dSYehuda Sadeh }
3859602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All of them are read-only (S_IRUGO,
 * show method only) except "refresh", which is write-only for the
 * owner (S_IWUSR, store method only).
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3872dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the attributes collected in rbd_attr_group. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
3888dfc5606dSYehuda Sadeh 
/* The single (unnamed) attribute group holding all rbd device attributes. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};
3892dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
3897dfc5606dSYehuda Sadeh 
/* Defined further down; declared here so rbd_device_type can refer to it. */
static void rbd_dev_release(struct device *dev);

/*
 * Device type installed on every rbd_device by __rbd_dev_create().
 * It supplies the sysfs attribute groups and the release callback.
 */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};
3905dfc5606dSYehuda Sadeh 
39068b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
39078b8fb99cSAlex Elder {
39088b8fb99cSAlex Elder 	kref_get(&spec->kref);
39098b8fb99cSAlex Elder 
39108b8fb99cSAlex Elder 	return spec;
39118b8fb99cSAlex Elder }
39128b8fb99cSAlex Elder 
39138b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
39148b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
39158b8fb99cSAlex Elder {
39168b8fb99cSAlex Elder 	if (spec)
39178b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
39188b8fb99cSAlex Elder }
39198b8fb99cSAlex Elder 
39208b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
39218b8fb99cSAlex Elder {
39228b8fb99cSAlex Elder 	struct rbd_spec *spec;
39238b8fb99cSAlex Elder 
39248b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
39258b8fb99cSAlex Elder 	if (!spec)
39268b8fb99cSAlex Elder 		return NULL;
392704077599SIlya Dryomov 
392804077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
392904077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
39308b8fb99cSAlex Elder 	kref_init(&spec->kref);
39318b8fb99cSAlex Elder 
39328b8fb99cSAlex Elder 	return spec;
39338b8fb99cSAlex Elder }
39348b8fb99cSAlex Elder 
39358b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
39368b8fb99cSAlex Elder {
39378b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
39388b8fb99cSAlex Elder 
39398b8fb99cSAlex Elder 	kfree(spec->pool_name);
39408b8fb99cSAlex Elder 	kfree(spec->image_id);
39418b8fb99cSAlex Elder 	kfree(spec->image_name);
39428b8fb99cSAlex Elder 	kfree(spec->snap_name);
39438b8fb99cSAlex Elder 	kfree(spec);
39448b8fb99cSAlex Elder }
39458b8fb99cSAlex Elder 
39461643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
3947dd5ac32dSIlya Dryomov {
3948c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
39496b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
3950c41d13a3SIlya Dryomov 
3951dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
3952dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
3953dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
3954dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
39551643dfa4SIlya Dryomov }
39561643dfa4SIlya Dryomov 
39571643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
39581643dfa4SIlya Dryomov {
39591643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
39601643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
39611643dfa4SIlya Dryomov 
39621643dfa4SIlya Dryomov 	if (need_put) {
39631643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
39641643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
39651643dfa4SIlya Dryomov 	}
39661643dfa4SIlya Dryomov 
39671643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
3968dd5ac32dSIlya Dryomov 
3969dd5ac32dSIlya Dryomov 	/*
3970dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
3971dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
3972dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
3973dd5ac32dSIlya Dryomov 	 */
3974dd5ac32dSIlya Dryomov 	if (need_put)
3975dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
3976dd5ac32dSIlya Dryomov }
3977dd5ac32dSIlya Dryomov 
39781643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
39791643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
3980c53d5893SAlex Elder {
3981c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3982c53d5893SAlex Elder 
3983c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3984c53d5893SAlex Elder 	if (!rbd_dev)
3985c53d5893SAlex Elder 		return NULL;
3986c53d5893SAlex Elder 
3987c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
3988c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3989c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3990c53d5893SAlex Elder 
3991c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
3992922dab61SIlya Dryomov 	ceph_oloc_init(&rbd_dev->header_oloc);
3993c41d13a3SIlya Dryomov 
3994dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
3995dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
3996dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
3997dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
3998dd5ac32dSIlya Dryomov 
3999c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4000d147543dSIlya Dryomov 	rbd_dev->spec = spec;
40010903e875SAlex Elder 
40027627151eSYan, Zheng 	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
40037627151eSYan, Zheng 	rbd_dev->layout.stripe_count = 1;
40047627151eSYan, Zheng 	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
40057627151eSYan, Zheng 	rbd_dev->layout.pool_id = spec->pool_id;
400630c156d9SYan, Zheng 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
40070903e875SAlex Elder 
40081643dfa4SIlya Dryomov 	return rbd_dev;
40091643dfa4SIlya Dryomov }
40101643dfa4SIlya Dryomov 
4011dd5ac32dSIlya Dryomov /*
40121643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4013dd5ac32dSIlya Dryomov  */
40141643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
40151643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
40161643dfa4SIlya Dryomov 					 struct rbd_options *opts)
40171643dfa4SIlya Dryomov {
40181643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
40191643dfa4SIlya Dryomov 
40201643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
40211643dfa4SIlya Dryomov 	if (!rbd_dev)
40221643dfa4SIlya Dryomov 		return NULL;
40231643dfa4SIlya Dryomov 
40241643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
40251643dfa4SIlya Dryomov 
40261643dfa4SIlya Dryomov 	/* get an id and fill in device name */
40271643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
40281643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
40291643dfa4SIlya Dryomov 					 GFP_KERNEL);
40301643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
40311643dfa4SIlya Dryomov 		goto fail_rbd_dev;
40321643dfa4SIlya Dryomov 
40331643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
40341643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
40351643dfa4SIlya Dryomov 						   rbd_dev->name);
40361643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
40371643dfa4SIlya Dryomov 		goto fail_dev_id;
40381643dfa4SIlya Dryomov 
40391643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4040dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4041dd5ac32dSIlya Dryomov 
40421643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4043c53d5893SAlex Elder 	return rbd_dev;
40441643dfa4SIlya Dryomov 
40451643dfa4SIlya Dryomov fail_dev_id:
40461643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
40471643dfa4SIlya Dryomov fail_rbd_dev:
40481643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
40491643dfa4SIlya Dryomov 	return NULL;
4050c53d5893SAlex Elder }
4051c53d5893SAlex Elder 
4052c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4053c53d5893SAlex Elder {
4054dd5ac32dSIlya Dryomov 	if (rbd_dev)
4055dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4056c53d5893SAlex Elder }
4057c53d5893SAlex Elder 
4058dfc5606dSYehuda Sadeh /*
40599d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
40609d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
40619d475de5SAlex Elder  * image.
40629d475de5SAlex Elder  */
40639d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
40649d475de5SAlex Elder 				u8 *order, u64 *snap_size)
40659d475de5SAlex Elder {
40669d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
40679d475de5SAlex Elder 	int ret;
40689d475de5SAlex Elder 	struct {
40699d475de5SAlex Elder 		u8 order;
40709d475de5SAlex Elder 		__le64 size;
40719d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
40729d475de5SAlex Elder 
4073c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
40749d475de5SAlex Elder 				"rbd", "get_size",
40754157976bSAlex Elder 				&snapid, sizeof (snapid),
4076e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
407736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
40789d475de5SAlex Elder 	if (ret < 0)
40799d475de5SAlex Elder 		return ret;
408057385b51SAlex Elder 	if (ret < sizeof (size_buf))
408157385b51SAlex Elder 		return -ERANGE;
40829d475de5SAlex Elder 
4083c3545579SJosh Durgin 	if (order) {
40849d475de5SAlex Elder 		*order = size_buf.order;
4085c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4086c3545579SJosh Durgin 	}
40879d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
40889d475de5SAlex Elder 
4089c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4090c3545579SJosh Durgin 		(unsigned long long)snap_id,
40919d475de5SAlex Elder 		(unsigned long long)*snap_size);
40929d475de5SAlex Elder 
40939d475de5SAlex Elder 	return 0;
40949d475de5SAlex Elder }
40959d475de5SAlex Elder 
40969d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
40979d475de5SAlex Elder {
40989d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
40999d475de5SAlex Elder 					&rbd_dev->header.obj_order,
41009d475de5SAlex Elder 					&rbd_dev->header.image_size);
41019d475de5SAlex Elder }
41029d475de5SAlex Elder 
41031e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
41041e130199SAlex Elder {
41051e130199SAlex Elder 	void *reply_buf;
41061e130199SAlex Elder 	int ret;
41071e130199SAlex Elder 	void *p;
41081e130199SAlex Elder 
41091e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
41101e130199SAlex Elder 	if (!reply_buf)
41111e130199SAlex Elder 		return -ENOMEM;
41121e130199SAlex Elder 
4113c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
41144157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4115e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
411636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41171e130199SAlex Elder 	if (ret < 0)
41181e130199SAlex Elder 		goto out;
41191e130199SAlex Elder 
41201e130199SAlex Elder 	p = reply_buf;
41211e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
412257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
412357385b51SAlex Elder 	ret = 0;
41241e130199SAlex Elder 
41251e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
41261e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
41271e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
41281e130199SAlex Elder 	} else {
41291e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
41301e130199SAlex Elder 	}
41311e130199SAlex Elder out:
41321e130199SAlex Elder 	kfree(reply_buf);
41331e130199SAlex Elder 
41341e130199SAlex Elder 	return ret;
41351e130199SAlex Elder }
41361e130199SAlex Elder 
4137b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4138b1b5402aSAlex Elder 		u64 *snap_features)
4139b1b5402aSAlex Elder {
4140b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4141b1b5402aSAlex Elder 	struct {
4142b1b5402aSAlex Elder 		__le64 features;
4143b1b5402aSAlex Elder 		__le64 incompat;
41444157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4145d3767f0fSIlya Dryomov 	u64 unsup;
4146b1b5402aSAlex Elder 	int ret;
4147b1b5402aSAlex Elder 
4148c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
4149b1b5402aSAlex Elder 				"rbd", "get_features",
41504157976bSAlex Elder 				&snapid, sizeof (snapid),
4151e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
415236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4153b1b5402aSAlex Elder 	if (ret < 0)
4154b1b5402aSAlex Elder 		return ret;
415557385b51SAlex Elder 	if (ret < sizeof (features_buf))
415657385b51SAlex Elder 		return -ERANGE;
4157d889140cSAlex Elder 
4158d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4159d3767f0fSIlya Dryomov 	if (unsup) {
4160d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4161d3767f0fSIlya Dryomov 			 unsup);
4162b8f5c6edSAlex Elder 		return -ENXIO;
4163d3767f0fSIlya Dryomov 	}
4164d889140cSAlex Elder 
4165b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4166b1b5402aSAlex Elder 
4167b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4168b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4169b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4170b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4171b1b5402aSAlex Elder 
4172b1b5402aSAlex Elder 	return 0;
4173b1b5402aSAlex Elder }
4174b1b5402aSAlex Elder 
4175b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4176b1b5402aSAlex Elder {
4177b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4178b1b5402aSAlex Elder 						&rbd_dev->header.features);
4179b1b5402aSAlex Elder }
4180b1b5402aSAlex Elder 
418186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
418286b00e0dSAlex Elder {
418386b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
418486b00e0dSAlex Elder 	size_t size;
418586b00e0dSAlex Elder 	void *reply_buf = NULL;
418686b00e0dSAlex Elder 	__le64 snapid;
418786b00e0dSAlex Elder 	void *p;
418886b00e0dSAlex Elder 	void *end;
4189642a2537SAlex Elder 	u64 pool_id;
419086b00e0dSAlex Elder 	char *image_id;
41913b5cf2a2SAlex Elder 	u64 snap_id;
419286b00e0dSAlex Elder 	u64 overlap;
419386b00e0dSAlex Elder 	int ret;
419486b00e0dSAlex Elder 
419586b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
419686b00e0dSAlex Elder 	if (!parent_spec)
419786b00e0dSAlex Elder 		return -ENOMEM;
419886b00e0dSAlex Elder 
419986b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
420086b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
420186b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
420286b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
420386b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
420486b00e0dSAlex Elder 	if (!reply_buf) {
420586b00e0dSAlex Elder 		ret = -ENOMEM;
420686b00e0dSAlex Elder 		goto out_err;
420786b00e0dSAlex Elder 	}
420886b00e0dSAlex Elder 
42094d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
4210c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
421186b00e0dSAlex Elder 				"rbd", "get_parent",
42124157976bSAlex Elder 				&snapid, sizeof (snapid),
4213e2a58ee5SAlex Elder 				reply_buf, size);
421436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
421586b00e0dSAlex Elder 	if (ret < 0)
421686b00e0dSAlex Elder 		goto out_err;
421786b00e0dSAlex Elder 
421886b00e0dSAlex Elder 	p = reply_buf;
421957385b51SAlex Elder 	end = reply_buf + ret;
422057385b51SAlex Elder 	ret = -ERANGE;
4221642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4222392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4223392a9dadSAlex Elder 		/*
4224392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4225392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4226392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4227392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4228392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4229392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4230392a9dadSAlex Elder 		 * parent.
4231392a9dadSAlex Elder 		 */
4232392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4233392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4234392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4235392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4236392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4237392a9dadSAlex Elder 		}
4238392a9dadSAlex Elder 
423986b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4240392a9dadSAlex Elder 	}
424186b00e0dSAlex Elder 
42420903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42430903e875SAlex Elder 
42440903e875SAlex Elder 	ret = -EIO;
4245642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
42469584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4247642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
424857385b51SAlex Elder 		goto out_err;
4249c0cd10dbSAlex Elder 	}
42500903e875SAlex Elder 
4251979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
425286b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
425386b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
425486b00e0dSAlex Elder 		goto out_err;
425586b00e0dSAlex Elder 	}
42563b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
425786b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
425886b00e0dSAlex Elder 
42593b5cf2a2SAlex Elder 	/*
42603b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
42613b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
42623b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
42633b5cf2a2SAlex Elder 	 */
42643b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
42653b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
42663b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
42673b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
426886b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
426986b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
4270fbba11b3SIlya Dryomov 	} else {
4271fbba11b3SIlya Dryomov 		kfree(image_id);
42723b5cf2a2SAlex Elder 	}
42733b5cf2a2SAlex Elder 
42743b5cf2a2SAlex Elder 	/*
4275cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4276cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
42773b5cf2a2SAlex Elder 	 */
42783b5cf2a2SAlex Elder 	if (!overlap) {
42793b5cf2a2SAlex Elder 		if (parent_spec) {
4280cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
4281cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
4282cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
4283cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
428470cf49cfSAlex Elder 		} else {
4285cf32bd9cSIlya Dryomov 			/* initial probe */
4286cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
42873b5cf2a2SAlex Elder 		}
428870cf49cfSAlex Elder 	}
4289cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
4290cf32bd9cSIlya Dryomov 
429186b00e0dSAlex Elder out:
429286b00e0dSAlex Elder 	ret = 0;
429386b00e0dSAlex Elder out_err:
429486b00e0dSAlex Elder 	kfree(reply_buf);
429586b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
429686b00e0dSAlex Elder 
429786b00e0dSAlex Elder 	return ret;
429886b00e0dSAlex Elder }
429986b00e0dSAlex Elder 
4300cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4301cc070d59SAlex Elder {
4302cc070d59SAlex Elder 	struct {
4303cc070d59SAlex Elder 		__le64 stripe_unit;
4304cc070d59SAlex Elder 		__le64 stripe_count;
4305cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4306cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4307cc070d59SAlex Elder 	void *p;
4308cc070d59SAlex Elder 	u64 obj_size;
4309cc070d59SAlex Elder 	u64 stripe_unit;
4310cc070d59SAlex Elder 	u64 stripe_count;
4311cc070d59SAlex Elder 	int ret;
4312cc070d59SAlex Elder 
4313c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
4314cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4315e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4316cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4317cc070d59SAlex Elder 	if (ret < 0)
4318cc070d59SAlex Elder 		return ret;
4319cc070d59SAlex Elder 	if (ret < size)
4320cc070d59SAlex Elder 		return -ERANGE;
4321cc070d59SAlex Elder 
4322cc070d59SAlex Elder 	/*
4323cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4324cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4325cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4326cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4327cc070d59SAlex Elder 	 */
4328cc070d59SAlex Elder 	ret = -EINVAL;
4329cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4330cc070d59SAlex Elder 	p = &striping_info_buf;
4331cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4332cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4333cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4334cc070d59SAlex Elder 				"(got %llu want %llu)",
4335cc070d59SAlex Elder 				stripe_unit, obj_size);
4336cc070d59SAlex Elder 		return -EINVAL;
4337cc070d59SAlex Elder 	}
4338cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4339cc070d59SAlex Elder 	if (stripe_count != 1) {
4340cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4341cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4342cc070d59SAlex Elder 		return -EINVAL;
4343cc070d59SAlex Elder 	}
4344500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4345500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4346cc070d59SAlex Elder 
4347cc070d59SAlex Elder 	return 0;
4348cc070d59SAlex Elder }
4349cc070d59SAlex Elder 
43509e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
43519e15b77dSAlex Elder {
43529e15b77dSAlex Elder 	size_t image_id_size;
43539e15b77dSAlex Elder 	char *image_id;
43549e15b77dSAlex Elder 	void *p;
43559e15b77dSAlex Elder 	void *end;
43569e15b77dSAlex Elder 	size_t size;
43579e15b77dSAlex Elder 	void *reply_buf = NULL;
43589e15b77dSAlex Elder 	size_t len = 0;
43599e15b77dSAlex Elder 	char *image_name = NULL;
43609e15b77dSAlex Elder 	int ret;
43619e15b77dSAlex Elder 
43629e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
43639e15b77dSAlex Elder 
436469e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
436569e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
43669e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
43679e15b77dSAlex Elder 	if (!image_id)
43689e15b77dSAlex Elder 		return NULL;
43699e15b77dSAlex Elder 
43709e15b77dSAlex Elder 	p = image_id;
43714157976bSAlex Elder 	end = image_id + image_id_size;
437269e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
43739e15b77dSAlex Elder 
43749e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
43759e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
43769e15b77dSAlex Elder 	if (!reply_buf)
43779e15b77dSAlex Elder 		goto out;
43789e15b77dSAlex Elder 
437936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
43809e15b77dSAlex Elder 				"rbd", "dir_get_name",
43819e15b77dSAlex Elder 				image_id, image_id_size,
4382e2a58ee5SAlex Elder 				reply_buf, size);
43839e15b77dSAlex Elder 	if (ret < 0)
43849e15b77dSAlex Elder 		goto out;
43859e15b77dSAlex Elder 	p = reply_buf;
4386f40eb349SAlex Elder 	end = reply_buf + ret;
4387f40eb349SAlex Elder 
43889e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
43899e15b77dSAlex Elder 	if (IS_ERR(image_name))
43909e15b77dSAlex Elder 		image_name = NULL;
43919e15b77dSAlex Elder 	else
43929e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
43939e15b77dSAlex Elder out:
43949e15b77dSAlex Elder 	kfree(reply_buf);
43959e15b77dSAlex Elder 	kfree(image_id);
43969e15b77dSAlex Elder 
43979e15b77dSAlex Elder 	return image_name;
43989e15b77dSAlex Elder }
43999e15b77dSAlex Elder 
44002ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44012ad3d716SAlex Elder {
44022ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44032ad3d716SAlex Elder 	const char *snap_name;
44042ad3d716SAlex Elder 	u32 which = 0;
44052ad3d716SAlex Elder 
44062ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
44072ad3d716SAlex Elder 
44082ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
44092ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
44102ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
44112ad3d716SAlex Elder 			return snapc->snaps[which];
44122ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
44132ad3d716SAlex Elder 		which++;
44142ad3d716SAlex Elder 	}
44152ad3d716SAlex Elder 	return CEPH_NOSNAP;
44162ad3d716SAlex Elder }
44172ad3d716SAlex Elder 
44182ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44192ad3d716SAlex Elder {
44202ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44212ad3d716SAlex Elder 	u32 which;
44222ad3d716SAlex Elder 	bool found = false;
44232ad3d716SAlex Elder 	u64 snap_id;
44242ad3d716SAlex Elder 
44252ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
44262ad3d716SAlex Elder 		const char *snap_name;
44272ad3d716SAlex Elder 
44282ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
44292ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4430efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4431efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4432efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4433efadc98aSJosh Durgin 				continue;
4434efadc98aSJosh Durgin 			else
44352ad3d716SAlex Elder 				break;
4436efadc98aSJosh Durgin 		}
44372ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
44382ad3d716SAlex Elder 		kfree(snap_name);
44392ad3d716SAlex Elder 	}
44402ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
44412ad3d716SAlex Elder }
44422ad3d716SAlex Elder 
44432ad3d716SAlex Elder /*
44442ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
44452ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
44462ad3d716SAlex Elder  */
44472ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44482ad3d716SAlex Elder {
44492ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
44502ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
44512ad3d716SAlex Elder 
44522ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
44532ad3d716SAlex Elder }
44542ad3d716SAlex Elder 
44559e15b77dSAlex Elder /*
445604077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
44579e15b77dSAlex Elder  */
445804077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
445904077599SIlya Dryomov {
446004077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
446104077599SIlya Dryomov 
446204077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
446304077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
446404077599SIlya Dryomov 	rbd_assert(spec->snap_name);
446504077599SIlya Dryomov 
446604077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
446704077599SIlya Dryomov 		u64 snap_id;
446804077599SIlya Dryomov 
446904077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
447004077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
447104077599SIlya Dryomov 			return -ENOENT;
447204077599SIlya Dryomov 
447304077599SIlya Dryomov 		spec->snap_id = snap_id;
447404077599SIlya Dryomov 	} else {
447504077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
447604077599SIlya Dryomov 	}
447704077599SIlya Dryomov 
447804077599SIlya Dryomov 	return 0;
447904077599SIlya Dryomov }
448004077599SIlya Dryomov 
448104077599SIlya Dryomov /*
448204077599SIlya Dryomov  * A parent image will have all ids but none of the names.
448304077599SIlya Dryomov  *
448404077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
448504077599SIlya Dryomov  * can't figure out the name for an image id.
448604077599SIlya Dryomov  */
448704077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
44889e15b77dSAlex Elder {
44892e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
44902e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
44912e9f7f1cSAlex Elder 	const char *pool_name;
44922e9f7f1cSAlex Elder 	const char *image_name;
44932e9f7f1cSAlex Elder 	const char *snap_name;
44949e15b77dSAlex Elder 	int ret;
44959e15b77dSAlex Elder 
449604077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
449704077599SIlya Dryomov 	rbd_assert(spec->image_id);
449804077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
44999e15b77dSAlex Elder 
45002e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
45019e15b77dSAlex Elder 
45022e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
45032e9f7f1cSAlex Elder 	if (!pool_name) {
45042e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4505935dc89fSAlex Elder 		return -EIO;
4506935dc89fSAlex Elder 	}
45072e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
45082e9f7f1cSAlex Elder 	if (!pool_name)
45099e15b77dSAlex Elder 		return -ENOMEM;
45109e15b77dSAlex Elder 
45119e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
45129e15b77dSAlex Elder 
45132e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
45142e9f7f1cSAlex Elder 	if (!image_name)
451506ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
45169e15b77dSAlex Elder 
451704077599SIlya Dryomov 	/* Fetch the snapshot name */
45189e15b77dSAlex Elder 
45192e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4520da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4521da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
45229e15b77dSAlex Elder 		goto out_err;
45232e9f7f1cSAlex Elder 	}
45242e9f7f1cSAlex Elder 
45252e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
45262e9f7f1cSAlex Elder 	spec->image_name = image_name;
45272e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
45289e15b77dSAlex Elder 
45299e15b77dSAlex Elder 	return 0;
453004077599SIlya Dryomov 
45319e15b77dSAlex Elder out_err:
45322e9f7f1cSAlex Elder 	kfree(image_name);
45332e9f7f1cSAlex Elder 	kfree(pool_name);
45349e15b77dSAlex Elder 	return ret;
45359e15b77dSAlex Elder }
45369e15b77dSAlex Elder 
4537cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
453835d489f9SAlex Elder {
453935d489f9SAlex Elder 	size_t size;
454035d489f9SAlex Elder 	int ret;
454135d489f9SAlex Elder 	void *reply_buf;
454235d489f9SAlex Elder 	void *p;
454335d489f9SAlex Elder 	void *end;
454435d489f9SAlex Elder 	u64 seq;
454535d489f9SAlex Elder 	u32 snap_count;
454635d489f9SAlex Elder 	struct ceph_snap_context *snapc;
454735d489f9SAlex Elder 	u32 i;
454835d489f9SAlex Elder 
454935d489f9SAlex Elder 	/*
455035d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
455135d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
455235d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
455335d489f9SAlex Elder 	 * prepared to receive.
455435d489f9SAlex Elder 	 */
455535d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
455635d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
455735d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
455835d489f9SAlex Elder 	if (!reply_buf)
455935d489f9SAlex Elder 		return -ENOMEM;
456035d489f9SAlex Elder 
4561c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
45624157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4563e2a58ee5SAlex Elder 				reply_buf, size);
456436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
456535d489f9SAlex Elder 	if (ret < 0)
456635d489f9SAlex Elder 		goto out;
456735d489f9SAlex Elder 
456835d489f9SAlex Elder 	p = reply_buf;
456957385b51SAlex Elder 	end = reply_buf + ret;
457057385b51SAlex Elder 	ret = -ERANGE;
457135d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
457235d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
457335d489f9SAlex Elder 
457435d489f9SAlex Elder 	/*
457535d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
457635d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
457735d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
457835d489f9SAlex Elder 	 * allocate is representable in a size_t.
457935d489f9SAlex Elder 	 */
458035d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
458135d489f9SAlex Elder 				 / sizeof (u64)) {
458235d489f9SAlex Elder 		ret = -EINVAL;
458335d489f9SAlex Elder 		goto out;
458435d489f9SAlex Elder 	}
458535d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
458635d489f9SAlex Elder 		goto out;
4587468521c1SAlex Elder 	ret = 0;
458835d489f9SAlex Elder 
4589812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
459035d489f9SAlex Elder 	if (!snapc) {
459135d489f9SAlex Elder 		ret = -ENOMEM;
459235d489f9SAlex Elder 		goto out;
459335d489f9SAlex Elder 	}
459435d489f9SAlex Elder 	snapc->seq = seq;
459535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
459635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
459735d489f9SAlex Elder 
459849ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
459935d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
460035d489f9SAlex Elder 
460135d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
460235d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
460335d489f9SAlex Elder out:
460435d489f9SAlex Elder 	kfree(reply_buf);
460535d489f9SAlex Elder 
460657385b51SAlex Elder 	return ret;
460735d489f9SAlex Elder }
460835d489f9SAlex Elder 
460954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
461054cac61fSAlex Elder 					u64 snap_id)
4611b8b1e2dbSAlex Elder {
4612b8b1e2dbSAlex Elder 	size_t size;
4613b8b1e2dbSAlex Elder 	void *reply_buf;
461454cac61fSAlex Elder 	__le64 snapid;
4615b8b1e2dbSAlex Elder 	int ret;
4616b8b1e2dbSAlex Elder 	void *p;
4617b8b1e2dbSAlex Elder 	void *end;
4618b8b1e2dbSAlex Elder 	char *snap_name;
4619b8b1e2dbSAlex Elder 
4620b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4621b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4622b8b1e2dbSAlex Elder 	if (!reply_buf)
4623b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4624b8b1e2dbSAlex Elder 
462554cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
4626c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
4627b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
462854cac61fSAlex Elder 				&snapid, sizeof (snapid),
4629e2a58ee5SAlex Elder 				reply_buf, size);
463036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4631f40eb349SAlex Elder 	if (ret < 0) {
4632f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4633b8b1e2dbSAlex Elder 		goto out;
4634f40eb349SAlex Elder 	}
4635b8b1e2dbSAlex Elder 
4636b8b1e2dbSAlex Elder 	p = reply_buf;
4637f40eb349SAlex Elder 	end = reply_buf + ret;
4638e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4639f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4640b8b1e2dbSAlex Elder 		goto out;
4641f40eb349SAlex Elder 
4642b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
464354cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4644b8b1e2dbSAlex Elder out:
4645b8b1e2dbSAlex Elder 	kfree(reply_buf);
4646b8b1e2dbSAlex Elder 
4647f40eb349SAlex Elder 	return snap_name;
4648b8b1e2dbSAlex Elder }
4649b8b1e2dbSAlex Elder 
46502df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4651117973fbSAlex Elder {
46522df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4653117973fbSAlex Elder 	int ret;
4654117973fbSAlex Elder 
46551617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
46561617e40cSJosh Durgin 	if (ret)
4657cfbf6377SAlex Elder 		return ret;
46581617e40cSJosh Durgin 
46592df3fac7SAlex Elder 	if (first_time) {
46602df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
46612df3fac7SAlex Elder 		if (ret)
4662cfbf6377SAlex Elder 			return ret;
46632df3fac7SAlex Elder 	}
46642df3fac7SAlex Elder 
4665cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4666d194cd1dSIlya Dryomov 	if (ret && first_time) {
4667d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
4668d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
4669d194cd1dSIlya Dryomov 	}
4670117973fbSAlex Elder 
4671117973fbSAlex Elder 	return ret;
4672117973fbSAlex Elder }
4673117973fbSAlex Elder 
4674a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
4675a720ae09SIlya Dryomov {
4676a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4677a720ae09SIlya Dryomov 
4678a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
4679a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
4680a720ae09SIlya Dryomov 
4681a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
4682a720ae09SIlya Dryomov }
4683a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points to the
 * first non-space character (or to the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.  The token itself is not
 * consumed.  Note that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, spaces);

	*buf = start;			/* Start of token */

	return strcspn(start, spaces);	/* Token length */
}
4702e28fff26SAlex Elder 
4703e28fff26SAlex Elder /*
4704ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4705ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4706ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4707ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4708ea3352f4SAlex Elder  *
4709ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4710ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4711ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4712ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4713ea3352f4SAlex Elder  *
4714ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4715ea3352f4SAlex Elder  * the end of the found token.
4716ea3352f4SAlex Elder  *
4717ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4718ea3352f4SAlex Elder  */
4719ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4720ea3352f4SAlex Elder {
4721ea3352f4SAlex Elder 	char *dup;
4722ea3352f4SAlex Elder 	size_t len;
4723ea3352f4SAlex Elder 
4724ea3352f4SAlex Elder 	len = next_token(buf);
47254caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4726ea3352f4SAlex Elder 	if (!dup)
4727ea3352f4SAlex Elder 		return NULL;
4728ea3352f4SAlex Elder 	*(dup + len) = '\0';
4729ea3352f4SAlex Elder 	*buf += len;
4730ea3352f4SAlex Elder 
4731ea3352f4SAlex Elder 	if (lenp)
4732ea3352f4SAlex Elder 		*lenp = len;
4733ea3352f4SAlex Elder 
4734ea3352f4SAlex Elder 	return dup;
4735ea3352f4SAlex Elder }
4736ea3352f4SAlex Elder 
4737ea3352f4SAlex Elder /*
4738859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4739859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4740859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4741859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4742d22f76e7SAlex Elder  *
4743859c31dfSAlex Elder  * The information extracted from these options is recorded in
4744859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4745859c31dfSAlex Elder  * structures:
4746859c31dfSAlex Elder  *  ceph_opts
4747859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4748859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4749859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4750859c31dfSAlex Elder  *  rbd_opts
4751859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4752859c31dfSAlex Elder  *	this function; caller must release with kfree().
4753859c31dfSAlex Elder  *  spec
4754859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4755859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4756859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4757859c31dfSAlex Elder  *
4758859c31dfSAlex Elder  * The options passed take this form:
4759859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4760859c31dfSAlex Elder  * where:
4761859c31dfSAlex Elder  *  <mon_addrs>
4762859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4763859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4764859c31dfSAlex Elder  *      by a port number (separated by a colon).
4765859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4766859c31dfSAlex Elder  *  <options>
4767859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4768859c31dfSAlex Elder  *  <pool_name>
4769859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4770859c31dfSAlex Elder  *  <image_name>
4771859c31dfSAlex Elder  *      The name of the image in that pool to map.
4772859c31dfSAlex Elder  *  <snap_id>
4773859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4774859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4775859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4776859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4777a725f65eSAlex Elder  */
4778859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4779dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4780859c31dfSAlex Elder 				struct rbd_options **opts,
4781859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4782a725f65eSAlex Elder {
4783e28fff26SAlex Elder 	size_t len;
4784859c31dfSAlex Elder 	char *options;
47850ddebc0cSAlex Elder 	const char *mon_addrs;
4786ecb4dc22SAlex Elder 	char *snap_name;
47870ddebc0cSAlex Elder 	size_t mon_addrs_size;
4788859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
47894e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4790859c31dfSAlex Elder 	struct ceph_options *copts;
4791dc79b113SAlex Elder 	int ret;
4792e28fff26SAlex Elder 
4793e28fff26SAlex Elder 	/* The first four tokens are required */
4794e28fff26SAlex Elder 
47957ef3214aSAlex Elder 	len = next_token(&buf);
47964fb5d671SAlex Elder 	if (!len) {
47974fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
47984fb5d671SAlex Elder 		return -EINVAL;
47994fb5d671SAlex Elder 	}
48000ddebc0cSAlex Elder 	mon_addrs = buf;
4801f28e565aSAlex Elder 	mon_addrs_size = len + 1;
48027ef3214aSAlex Elder 	buf += len;
4803a725f65eSAlex Elder 
4804dc79b113SAlex Elder 	ret = -EINVAL;
4805f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4806f28e565aSAlex Elder 	if (!options)
4807dc79b113SAlex Elder 		return -ENOMEM;
48084fb5d671SAlex Elder 	if (!*options) {
48094fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
48104fb5d671SAlex Elder 		goto out_err;
48114fb5d671SAlex Elder 	}
4812a725f65eSAlex Elder 
4813859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4814859c31dfSAlex Elder 	if (!spec)
4815f28e565aSAlex Elder 		goto out_mem;
4816859c31dfSAlex Elder 
4817859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4818859c31dfSAlex Elder 	if (!spec->pool_name)
4819859c31dfSAlex Elder 		goto out_mem;
48204fb5d671SAlex Elder 	if (!*spec->pool_name) {
48214fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
48224fb5d671SAlex Elder 		goto out_err;
48234fb5d671SAlex Elder 	}
4824e28fff26SAlex Elder 
482569e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4826859c31dfSAlex Elder 	if (!spec->image_name)
4827f28e565aSAlex Elder 		goto out_mem;
48284fb5d671SAlex Elder 	if (!*spec->image_name) {
48294fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
48304fb5d671SAlex Elder 		goto out_err;
48314fb5d671SAlex Elder 	}
4832e28fff26SAlex Elder 
4833f28e565aSAlex Elder 	/*
4834f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4835f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4836f28e565aSAlex Elder 	 */
48373feeb894SAlex Elder 	len = next_token(&buf);
4838820a5f3eSAlex Elder 	if (!len) {
48393feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
48403feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4841f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4842dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4843f28e565aSAlex Elder 		goto out_err;
4844849b4260SAlex Elder 	}
4845ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4846ecb4dc22SAlex Elder 	if (!snap_name)
4847f28e565aSAlex Elder 		goto out_mem;
4848ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4849ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4850e5c35534SAlex Elder 
48510ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4852e28fff26SAlex Elder 
48534e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
48544e9afebaSAlex Elder 	if (!rbd_opts)
48554e9afebaSAlex Elder 		goto out_mem;
48564e9afebaSAlex Elder 
48574e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4858b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
4859d22f76e7SAlex Elder 
4860859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
48610ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
48624e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4863859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4864859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4865dc79b113SAlex Elder 		goto out_err;
4866dc79b113SAlex Elder 	}
4867859c31dfSAlex Elder 	kfree(options);
4868859c31dfSAlex Elder 
4869859c31dfSAlex Elder 	*ceph_opts = copts;
48704e9afebaSAlex Elder 	*opts = rbd_opts;
4871859c31dfSAlex Elder 	*rbd_spec = spec;
48720ddebc0cSAlex Elder 
4873dc79b113SAlex Elder 	return 0;
4874f28e565aSAlex Elder out_mem:
4875dc79b113SAlex Elder 	ret = -ENOMEM;
4876d22f76e7SAlex Elder out_err:
4877859c31dfSAlex Elder 	kfree(rbd_opts);
4878859c31dfSAlex Elder 	rbd_spec_put(spec);
4879f28e565aSAlex Elder 	kfree(options);
4880d22f76e7SAlex Elder 
4881dc79b113SAlex Elder 	return ret;
4882a725f65eSAlex Elder }
4883a725f65eSAlex Elder 
4884589d30e0SAlex Elder /*
488530ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
488630ba1f02SIlya Dryomov  */
488730ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
488830ba1f02SIlya Dryomov {
4889a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
489030ba1f02SIlya Dryomov 	u64 newest_epoch;
489130ba1f02SIlya Dryomov 	int tries = 0;
489230ba1f02SIlya Dryomov 	int ret;
489330ba1f02SIlya Dryomov 
489430ba1f02SIlya Dryomov again:
489530ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
489630ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
4897d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
489830ba1f02SIlya Dryomov 					    &newest_epoch);
489930ba1f02SIlya Dryomov 		if (ret < 0)
490030ba1f02SIlya Dryomov 			return ret;
490130ba1f02SIlya Dryomov 
490230ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
49037cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
490430ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
4905a319bf56SIlya Dryomov 						     newest_epoch,
4906a319bf56SIlya Dryomov 						     opts->mount_timeout);
490730ba1f02SIlya Dryomov 			goto again;
490830ba1f02SIlya Dryomov 		} else {
490930ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
491030ba1f02SIlya Dryomov 			return -ENOENT;
491130ba1f02SIlya Dryomov 		}
491230ba1f02SIlya Dryomov 	}
491330ba1f02SIlya Dryomov 
491430ba1f02SIlya Dryomov 	return ret;
491530ba1f02SIlya Dryomov }
491630ba1f02SIlya Dryomov 
491730ba1f02SIlya Dryomov /*
4918589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4919589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4920589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4921589d30e0SAlex Elder  *
4922589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4923589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4924589d30e0SAlex Elder  * with the supplied name.
4925589d30e0SAlex Elder  *
4926589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4927589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4928589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4929589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4930589d30e0SAlex Elder  */
4931589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4932589d30e0SAlex Elder {
4933589d30e0SAlex Elder 	int ret;
4934589d30e0SAlex Elder 	size_t size;
4935589d30e0SAlex Elder 	char *object_name;
4936589d30e0SAlex Elder 	void *response;
4937c0fba368SAlex Elder 	char *image_id;
49382f82ee54SAlex Elder 
4939589d30e0SAlex Elder 	/*
49402c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
49412c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4942c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4943c0fba368SAlex Elder 	 * do still need to set the image format though.
49442c0d0a10SAlex Elder 	 */
4945c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4946c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4947c0fba368SAlex Elder 
49482c0d0a10SAlex Elder 		return 0;
4949c0fba368SAlex Elder 	}
49502c0d0a10SAlex Elder 
49512c0d0a10SAlex Elder 	/*
4952589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4953589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4954589d30e0SAlex Elder 	 */
495569e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4956589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4957589d30e0SAlex Elder 	if (!object_name)
4958589d30e0SAlex Elder 		return -ENOMEM;
49590d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4960589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4961589d30e0SAlex Elder 
4962589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4963589d30e0SAlex Elder 
4964589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4965589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4966589d30e0SAlex Elder 	if (!response) {
4967589d30e0SAlex Elder 		ret = -ENOMEM;
4968589d30e0SAlex Elder 		goto out;
4969589d30e0SAlex Elder 	}
4970589d30e0SAlex Elder 
4971c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4972c0fba368SAlex Elder 
497336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
49744157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4975e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
497636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4977c0fba368SAlex Elder 	if (ret == -ENOENT) {
4978c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4979c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4980c0fba368SAlex Elder 		if (!ret)
4981c0fba368SAlex Elder 			rbd_dev->image_format = 1;
49827dd440c9SIlya Dryomov 	} else if (ret >= 0) {
4983c0fba368SAlex Elder 		void *p = response;
4984589d30e0SAlex Elder 
4985c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4986979ed480SAlex Elder 						NULL, GFP_NOIO);
4987461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
4988c0fba368SAlex Elder 		if (!ret)
4989c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4990c0fba368SAlex Elder 	}
4991c0fba368SAlex Elder 
4992c0fba368SAlex Elder 	if (!ret) {
4993c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4994c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4995589d30e0SAlex Elder 	}
4996589d30e0SAlex Elder out:
4997589d30e0SAlex Elder 	kfree(response);
4998589d30e0SAlex Elder 	kfree(object_name);
4999589d30e0SAlex Elder 
5000589d30e0SAlex Elder 	return ret;
5001589d30e0SAlex Elder }
5002589d30e0SAlex Elder 
50033abef3b3SAlex Elder /*
50043abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
50053abef3b3SAlex Elder  * call.
50063abef3b3SAlex Elder  */
50076fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
50086fd48b3bSAlex Elder {
50096fd48b3bSAlex Elder 	struct rbd_image_header	*header;
50106fd48b3bSAlex Elder 
5011a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
50126fd48b3bSAlex Elder 
50136fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
50146fd48b3bSAlex Elder 
50156fd48b3bSAlex Elder 	header = &rbd_dev->header;
5016812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
50176fd48b3bSAlex Elder 	kfree(header->snap_sizes);
50186fd48b3bSAlex Elder 	kfree(header->snap_names);
50196fd48b3bSAlex Elder 	kfree(header->object_prefix);
50206fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
50216fd48b3bSAlex Elder }
50226fd48b3bSAlex Elder 
50232df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5024a30b71b9SAlex Elder {
5025a30b71b9SAlex Elder 	int ret;
5026a30b71b9SAlex Elder 
50271e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
502857385b51SAlex Elder 	if (ret)
50291e130199SAlex Elder 		goto out_err;
5030b1b5402aSAlex Elder 
50312df3fac7SAlex Elder 	/*
50322df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
50332df3fac7SAlex Elder 	 * features are assumed to never change.
50342df3fac7SAlex Elder 	 */
5035b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
503657385b51SAlex Elder 	if (ret)
5037b1b5402aSAlex Elder 		goto out_err;
503835d489f9SAlex Elder 
5039cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5040cc070d59SAlex Elder 
5041cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5042cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5043cc070d59SAlex Elder 		if (ret < 0)
5044cc070d59SAlex Elder 			goto out_err;
5045cc070d59SAlex Elder 	}
50462df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5047a30b71b9SAlex Elder 
504835152979SAlex Elder 	return 0;
50499d475de5SAlex Elder out_err:
5050642a2537SAlex Elder 	rbd_dev->header.features = 0;
50511e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
50521e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
50539d475de5SAlex Elder 
50549d475de5SAlex Elder 	return ret;
5055a30b71b9SAlex Elder }
5056a30b71b9SAlex Elder 
50576d69bb53SIlya Dryomov /*
50586d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
50596d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
50606d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
50616d69bb53SIlya Dryomov  */
50626d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
506383a06263SAlex Elder {
50642f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5065124afba2SAlex Elder 	int ret;
5066124afba2SAlex Elder 
5067124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5068124afba2SAlex Elder 		return 0;
5069124afba2SAlex Elder 
50706d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
50716d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
50726d69bb53SIlya Dryomov 		ret = -EINVAL;
50736d69bb53SIlya Dryomov 		goto out_err;
50746d69bb53SIlya Dryomov 	}
50756d69bb53SIlya Dryomov 
50761643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
50771f2c6651SIlya Dryomov 	if (!parent) {
5078124afba2SAlex Elder 		ret = -ENOMEM;
5079124afba2SAlex Elder 		goto out_err;
50801f2c6651SIlya Dryomov 	}
50811f2c6651SIlya Dryomov 
50821f2c6651SIlya Dryomov 	/*
50831f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
50841f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
50851f2c6651SIlya Dryomov 	 */
50861f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
50871f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5088124afba2SAlex Elder 
50896d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5090124afba2SAlex Elder 	if (ret < 0)
5091124afba2SAlex Elder 		goto out_err;
50921f2c6651SIlya Dryomov 
5093124afba2SAlex Elder 	rbd_dev->parent = parent;
5094a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5095124afba2SAlex Elder 	return 0;
5096124afba2SAlex Elder 
50971f2c6651SIlya Dryomov out_err:
50981f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
50991f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5100124afba2SAlex Elder 	return ret;
5101124afba2SAlex Elder }
5102124afba2SAlex Elder 
5103811c6688SIlya Dryomov /*
5104811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5105811c6688SIlya Dryomov  * upon return.
5106811c6688SIlya Dryomov  */
5107200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5108124afba2SAlex Elder {
510983a06263SAlex Elder 	int ret;
511083a06263SAlex Elder 
51119b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
511283a06263SAlex Elder 
51139b60e70bSIlya Dryomov 	if (!single_major) {
511483a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
511583a06263SAlex Elder 		if (ret < 0)
51161643dfa4SIlya Dryomov 			goto err_out_unlock;
51179b60e70bSIlya Dryomov 
511883a06263SAlex Elder 		rbd_dev->major = ret;
5119dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
51209b60e70bSIlya Dryomov 	} else {
51219b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
51229b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
51239b60e70bSIlya Dryomov 	}
512483a06263SAlex Elder 
512583a06263SAlex Elder 	/* Set up the blkdev mapping. */
512683a06263SAlex Elder 
512783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
512883a06263SAlex Elder 	if (ret)
512983a06263SAlex Elder 		goto err_out_blkdev;
513083a06263SAlex Elder 
5131f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
513283a06263SAlex Elder 	if (ret)
513383a06263SAlex Elder 		goto err_out_disk;
5134bc1ecc65SIlya Dryomov 
5135f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
513622001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5137f35a4deeSAlex Elder 
5138dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5139dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
5140f35a4deeSAlex Elder 	if (ret)
5141f5ee37bdSIlya Dryomov 		goto err_out_mapping;
514283a06263SAlex Elder 
514383a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
514483a06263SAlex Elder 
5145129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5146811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
514783a06263SAlex Elder 
51481643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
51491643dfa4SIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
51501643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
51511643dfa4SIlya Dryomov 
5152811c6688SIlya Dryomov 	add_disk(rbd_dev->disk);
515383a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
515483a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
515583a06263SAlex Elder 
515683a06263SAlex Elder 	return ret;
51572f82ee54SAlex Elder 
5158f35a4deeSAlex Elder err_out_mapping:
5159f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
516083a06263SAlex Elder err_out_disk:
516183a06263SAlex Elder 	rbd_free_disk(rbd_dev);
516283a06263SAlex Elder err_out_blkdev:
51639b60e70bSIlya Dryomov 	if (!single_major)
516483a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5165811c6688SIlya Dryomov err_out_unlock:
5166811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
516783a06263SAlex Elder 	return ret;
516883a06263SAlex Elder }
516983a06263SAlex Elder 
5170332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5171332bb12dSAlex Elder {
5172332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5173c41d13a3SIlya Dryomov 	int ret;
5174332bb12dSAlex Elder 
5175332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5176332bb12dSAlex Elder 
5177332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5178332bb12dSAlex Elder 
51797627151eSYan, Zheng 	rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
5180332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5181c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5182332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5183332bb12dSAlex Elder 	else
5184c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5185332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5186c41d13a3SIlya Dryomov 
5187c41d13a3SIlya Dryomov 	return ret;
5188332bb12dSAlex Elder }
5189332bb12dSAlex Elder 
5190200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5191200a6a8bSAlex Elder {
51926fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
51936fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
51946fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
51956fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
51966fd48b3bSAlex Elder 
5197200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5198200a6a8bSAlex Elder }
5199200a6a8bSAlex Elder 
5200a30b71b9SAlex Elder /*
5201a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
52021f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
52031f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
52041f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5205a30b71b9SAlex Elder  */
52066d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5207a30b71b9SAlex Elder {
5208a30b71b9SAlex Elder 	int ret;
5209a30b71b9SAlex Elder 
5210a30b71b9SAlex Elder 	/*
52113abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
52123abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
52133abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
52143abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5215a30b71b9SAlex Elder 	 */
5216a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5217a30b71b9SAlex Elder 	if (ret)
5218c0fba368SAlex Elder 		return ret;
5219c0fba368SAlex Elder 
5220332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5221332bb12dSAlex Elder 	if (ret)
5222332bb12dSAlex Elder 		goto err_out_format;
5223332bb12dSAlex Elder 
52246d69bb53SIlya Dryomov 	if (!depth) {
5225fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
52261fe48023SIlya Dryomov 		if (ret) {
52271fe48023SIlya Dryomov 			if (ret == -ENOENT)
52281fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
52291fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
52301fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5231c41d13a3SIlya Dryomov 			goto err_out_format;
52321f3ef788SAlex Elder 		}
52331fe48023SIlya Dryomov 	}
5234b644de2bSAlex Elder 
5235a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
52365655c4d9SAlex Elder 	if (ret)
5237b644de2bSAlex Elder 		goto err_out_watch;
5238a30b71b9SAlex Elder 
523904077599SIlya Dryomov 	/*
524004077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
524104077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
524204077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
524304077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
524404077599SIlya Dryomov 	 */
52456d69bb53SIlya Dryomov 	if (!depth)
524604077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
524704077599SIlya Dryomov 	else
524804077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
52491fe48023SIlya Dryomov 	if (ret) {
52501fe48023SIlya Dryomov 		if (ret == -ENOENT)
52511fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
52521fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
52531fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
52541fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
525533dca39fSAlex Elder 		goto err_out_probe;
52561fe48023SIlya Dryomov 	}
52579bb81c9bSAlex Elder 
5258e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5259e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5260e8f59b59SIlya Dryomov 		if (ret)
5261e8f59b59SIlya Dryomov 			goto err_out_probe;
5262e8f59b59SIlya Dryomov 
5263e8f59b59SIlya Dryomov 		/*
5264e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
5265e8f59b59SIlya Dryomov 		 * mapped and has a parent.
5266e8f59b59SIlya Dryomov 		 */
52676d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
5268e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
5269e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
5270e8f59b59SIlya Dryomov 	}
5271e8f59b59SIlya Dryomov 
52726d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
527330d60ba2SAlex Elder 	if (ret)
527430d60ba2SAlex Elder 		goto err_out_probe;
527583a06263SAlex Elder 
527630d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
5277c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
527830d60ba2SAlex Elder 	return 0;
5279e8f59b59SIlya Dryomov 
52806fd48b3bSAlex Elder err_out_probe:
52816fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5282b644de2bSAlex Elder err_out_watch:
52836d69bb53SIlya Dryomov 	if (!depth)
5284fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5285332bb12dSAlex Elder err_out_format:
5286332bb12dSAlex Elder 	rbd_dev->image_format = 0;
52875655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
52885655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
52895655c4d9SAlex Elder 	return ret;
529083a06263SAlex Elder }
529183a06263SAlex Elder 
52929b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
529359c2be1eSYehuda Sadeh 			  const char *buf,
529459c2be1eSYehuda Sadeh 			  size_t count)
5295602adf40SYehuda Sadeh {
5296cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5297dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
52984e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5299859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
53009d3997fdSAlex Elder 	struct rbd_client *rbdc;
530151344a38SAlex Elder 	bool read_only;
5302b51c83c2SIlya Dryomov 	int rc;
5303602adf40SYehuda Sadeh 
5304602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5305602adf40SYehuda Sadeh 		return -ENODEV;
5306602adf40SYehuda Sadeh 
5307a725f65eSAlex Elder 	/* parse add command */
5308859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5309dc79b113SAlex Elder 	if (rc < 0)
5310dd5ac32dSIlya Dryomov 		goto out;
5311a725f65eSAlex Elder 
53129d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
53139d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
53149d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
53150ddebc0cSAlex Elder 		goto err_out_args;
53169d3997fdSAlex Elder 	}
5317602adf40SYehuda Sadeh 
5318602adf40SYehuda Sadeh 	/* pick the pool */
531930ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
53201fe48023SIlya Dryomov 	if (rc < 0) {
53211fe48023SIlya Dryomov 		if (rc == -ENOENT)
53221fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
5323602adf40SYehuda Sadeh 		goto err_out_client;
53241fe48023SIlya Dryomov 	}
5325859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5326859c31dfSAlex Elder 
5327d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
5328b51c83c2SIlya Dryomov 	if (!rbd_dev) {
5329b51c83c2SIlya Dryomov 		rc = -ENOMEM;
5330bd4ba655SAlex Elder 		goto err_out_client;
5331b51c83c2SIlya Dryomov 	}
5332c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5333c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5334d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
5335602adf40SYehuda Sadeh 
5336811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
53376d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
5338a30b71b9SAlex Elder 	if (rc < 0)
5339c53d5893SAlex Elder 		goto err_out_rbd_dev;
534005fd6f6fSAlex Elder 
53417ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
53427ce4eef7SAlex Elder 
5343d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
53447ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
53457ce4eef7SAlex Elder 		read_only = true;
53467ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
53477ce4eef7SAlex Elder 
5348b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
53493abef3b3SAlex Elder 	if (rc) {
5350e37180c0SIlya Dryomov 		/*
5351e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5352e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5353e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5354e37180c0SIlya Dryomov 		 */
5355e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
53563abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
5357dd5ac32dSIlya Dryomov 		goto out;
53583abef3b3SAlex Elder 	}
53593abef3b3SAlex Elder 
5360dd5ac32dSIlya Dryomov 	rc = count;
5361dd5ac32dSIlya Dryomov out:
5362dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
5363dd5ac32dSIlya Dryomov 	return rc;
5364b536f69aSAlex Elder 
5365c53d5893SAlex Elder err_out_rbd_dev:
5366811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
5367c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5368bd4ba655SAlex Elder err_out_client:
53699d3997fdSAlex Elder 	rbd_put_client(rbdc);
53700ddebc0cSAlex Elder err_out_args:
5371859c31dfSAlex Elder 	rbd_spec_put(spec);
5372d147543dSIlya Dryomov 	kfree(rbd_opts);
5373dd5ac32dSIlya Dryomov 	goto out;
5374602adf40SYehuda Sadeh }
5375602adf40SYehuda Sadeh 
53769b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
53779b60e70bSIlya Dryomov 		       const char *buf,
53789b60e70bSIlya Dryomov 		       size_t count)
53799b60e70bSIlya Dryomov {
53809b60e70bSIlya Dryomov 	if (single_major)
53819b60e70bSIlya Dryomov 		return -EINVAL;
53829b60e70bSIlya Dryomov 
53839b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
53849b60e70bSIlya Dryomov }
53859b60e70bSIlya Dryomov 
/*
 * Single-major variant of the add handler: forwards unconditionally to
 * do_rbd_add() and returns its result.
 */
static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}
53929b60e70bSIlya Dryomov 
5393dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5394602adf40SYehuda Sadeh {
5395602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
53961643dfa4SIlya Dryomov 
53971643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
53981643dfa4SIlya Dryomov 	list_del_init(&rbd_dev->node);
53991643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
54001643dfa4SIlya Dryomov 
5401200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5402dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
54036d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
54049b60e70bSIlya Dryomov 	if (!single_major)
5405602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5406602adf40SYehuda Sadeh }
5407602adf40SYehuda Sadeh 
540805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
540905a46afdSAlex Elder {
5410ad945fc1SAlex Elder 	while (rbd_dev->parent) {
541105a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
541205a46afdSAlex Elder 		struct rbd_device *second = first->parent;
541305a46afdSAlex Elder 		struct rbd_device *third;
541405a46afdSAlex Elder 
541505a46afdSAlex Elder 		/*
541605a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
541705a46afdSAlex Elder 		 * remove it.
541805a46afdSAlex Elder 		 */
541905a46afdSAlex Elder 		while (second && (third = second->parent)) {
542005a46afdSAlex Elder 			first = second;
542105a46afdSAlex Elder 			second = third;
542205a46afdSAlex Elder 		}
5423ad945fc1SAlex Elder 		rbd_assert(second);
54248ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5425ad945fc1SAlex Elder 		first->parent = NULL;
5426ad945fc1SAlex Elder 		first->parent_overlap = 0;
5427ad945fc1SAlex Elder 
5428ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
542905a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
543005a46afdSAlex Elder 		first->parent_spec = NULL;
543105a46afdSAlex Elder 	}
543205a46afdSAlex Elder }
543305a46afdSAlex Elder 
54349b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5435602adf40SYehuda Sadeh 			     const char *buf,
5436602adf40SYehuda Sadeh 			     size_t count)
5437602adf40SYehuda Sadeh {
5438602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5439751cc0e3SAlex Elder 	struct list_head *tmp;
5440751cc0e3SAlex Elder 	int dev_id;
5441602adf40SYehuda Sadeh 	unsigned long ul;
544282a442d2SAlex Elder 	bool already = false;
54430d8189e1SAlex Elder 	int ret;
5444602adf40SYehuda Sadeh 
5445bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
54460d8189e1SAlex Elder 	if (ret)
54470d8189e1SAlex Elder 		return ret;
5448602adf40SYehuda Sadeh 
5449602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5450751cc0e3SAlex Elder 	dev_id = (int)ul;
5451751cc0e3SAlex Elder 	if (dev_id != ul)
5452602adf40SYehuda Sadeh 		return -EINVAL;
5453602adf40SYehuda Sadeh 
5454602adf40SYehuda Sadeh 	ret = -ENOENT;
5455751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5456751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5457751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5458751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5459751cc0e3SAlex Elder 			ret = 0;
5460751cc0e3SAlex Elder 			break;
5461602adf40SYehuda Sadeh 		}
5462751cc0e3SAlex Elder 	}
5463751cc0e3SAlex Elder 	if (!ret) {
5464a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5465b82d167bSAlex Elder 		if (rbd_dev->open_count)
546642382b70SAlex Elder 			ret = -EBUSY;
5467b82d167bSAlex Elder 		else
546882a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
546982a442d2SAlex Elder 							&rbd_dev->flags);
5470a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5471751cc0e3SAlex Elder 	}
5472751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
547382a442d2SAlex Elder 	if (ret < 0 || already)
54741ba0f1e7SAlex Elder 		return ret;
5475751cc0e3SAlex Elder 
5476fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
5477fca27065SIlya Dryomov 
54789875201eSJosh Durgin 	/*
54799875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
54809875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
54819875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
54829875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
54839875201eSJosh Durgin 	 */
5484dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
54858ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
5486aafb230eSAlex Elder 
54871ba0f1e7SAlex Elder 	return count;
5488602adf40SYehuda Sadeh }
5489602adf40SYehuda Sadeh 
54909b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
54919b60e70bSIlya Dryomov 			  const char *buf,
54929b60e70bSIlya Dryomov 			  size_t count)
54939b60e70bSIlya Dryomov {
54949b60e70bSIlya Dryomov 	if (single_major)
54959b60e70bSIlya Dryomov 		return -EINVAL;
54969b60e70bSIlya Dryomov 
54979b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
54989b60e70bSIlya Dryomov }
54999b60e70bSIlya Dryomov 
/*
 * Single-major variant of the remove handler: forwards unconditionally
 * to do_rbd_remove() and returns its result.
 */
static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}
55069b60e70bSIlya Dryomov 
5507602adf40SYehuda Sadeh /*
5508602adf40SYehuda Sadeh  * create control files in sysfs
5509dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5510602adf40SYehuda Sadeh  */
5511602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5512602adf40SYehuda Sadeh {
5513dfc5606dSYehuda Sadeh 	int ret;
5514602adf40SYehuda Sadeh 
5515fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5516dfc5606dSYehuda Sadeh 	if (ret < 0)
5517dfc5606dSYehuda Sadeh 		return ret;
5518602adf40SYehuda Sadeh 
5519fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5520fed4c143SAlex Elder 	if (ret < 0)
5521fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5522602adf40SYehuda Sadeh 
5523602adf40SYehuda Sadeh 	return ret;
5524602adf40SYehuda Sadeh }
5525602adf40SYehuda Sadeh 
/*
 * Undo rbd_sysfs_init(): unregister the rbd bus type first, then the
 * rbd root device (the reverse of the registration order).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
5531602adf40SYehuda Sadeh 
55321c2a9dfeSAlex Elder static int rbd_slab_init(void)
55331c2a9dfeSAlex Elder {
55341c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
553503d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
5536868311b1SAlex Elder 	if (!rbd_img_request_cache)
5537868311b1SAlex Elder 		return -ENOMEM;
5538868311b1SAlex Elder 
5539868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
554003d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
554178c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
554278c2a44aSAlex Elder 		goto out_err;
554378c2a44aSAlex Elder 
554478c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
554578c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
55462d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
554778c2a44aSAlex Elder 	if (rbd_segment_name_cache)
55481c2a9dfeSAlex Elder 		return 0;
554978c2a44aSAlex Elder out_err:
555078c2a44aSAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
555178c2a44aSAlex Elder 	rbd_obj_request_cache = NULL;
55521c2a9dfeSAlex Elder 
5553868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5554868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5555868311b1SAlex Elder 
55561c2a9dfeSAlex Elder 	return -ENOMEM;
55571c2a9dfeSAlex Elder }
55581c2a9dfeSAlex Elder 
/*
 * Destroy the slab caches created by rbd_slab_init(), in the reverse
 * order of their creation.  Each cache is asserted to exist and its
 * pointer is reset to NULL afterwards.
 */
static void rbd_slab_exit(void)
{
	rbd_assert(rbd_segment_name_cache);
	kmem_cache_destroy(rbd_segment_name_cache);
	rbd_segment_name_cache = NULL;

	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}
55731c2a9dfeSAlex Elder 
5574cc344fa1SAlex Elder static int __init rbd_init(void)
5575602adf40SYehuda Sadeh {
5576602adf40SYehuda Sadeh 	int rc;
5577602adf40SYehuda Sadeh 
55781e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
55791e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
55801e32d34cSAlex Elder 		return -EINVAL;
55811e32d34cSAlex Elder 	}
5582e1b4d96dSIlya Dryomov 
55831c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5584602adf40SYehuda Sadeh 	if (rc)
5585602adf40SYehuda Sadeh 		return rc;
5586e1b4d96dSIlya Dryomov 
5587f5ee37bdSIlya Dryomov 	/*
5588f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
5589f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
5590f5ee37bdSIlya Dryomov 	 */
5591f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
5592f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
5593f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
5594f5ee37bdSIlya Dryomov 		goto err_out_slab;
5595f5ee37bdSIlya Dryomov 	}
5596f5ee37bdSIlya Dryomov 
55979b60e70bSIlya Dryomov 	if (single_major) {
55989b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
55999b60e70bSIlya Dryomov 		if (rbd_major < 0) {
56009b60e70bSIlya Dryomov 			rc = rbd_major;
5601f5ee37bdSIlya Dryomov 			goto err_out_wq;
56029b60e70bSIlya Dryomov 		}
56039b60e70bSIlya Dryomov 	}
56049b60e70bSIlya Dryomov 
56051c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
56061c2a9dfeSAlex Elder 	if (rc)
56079b60e70bSIlya Dryomov 		goto err_out_blkdev;
56081c2a9dfeSAlex Elder 
56099b60e70bSIlya Dryomov 	if (single_major)
56109b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
56119b60e70bSIlya Dryomov 	else
5612e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
56139b60e70bSIlya Dryomov 
5614e1b4d96dSIlya Dryomov 	return 0;
5615e1b4d96dSIlya Dryomov 
56169b60e70bSIlya Dryomov err_out_blkdev:
56179b60e70bSIlya Dryomov 	if (single_major)
56189b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5619f5ee37bdSIlya Dryomov err_out_wq:
5620f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
5621e1b4d96dSIlya Dryomov err_out_slab:
5622e1b4d96dSIlya Dryomov 	rbd_slab_exit();
56231c2a9dfeSAlex Elder 	return rc;
5624602adf40SYehuda Sadeh }
5625602adf40SYehuda Sadeh 
/*
 * Module exit point: destroy the device id IDA, then undo the steps of
 * rbd_init() in reverse order (sysfs, shared block major when
 * single_major is set, workqueue, slab caches).
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}
5635602adf40SYehuda Sadeh 
/* Hook rbd_init()/rbd_exit() up as the module's load/unload handlers. */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata. */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
5647