xref: /openbmc/linux/drivers/block/rbd.c (revision 2894e1d7)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
417ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
42602adf40SYehuda Sadeh #include <linux/fs.h>
43602adf40SYehuda Sadeh #include <linux/blkdev.h>
441c2a9dfeSAlex Elder #include <linux/slab.h>
45f8a22fc2SIlya Dryomov #include <linux/idr.h>
46bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
47602adf40SYehuda Sadeh 
48602adf40SYehuda Sadeh #include "rbd_types.h"
49602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG compiles rbd_assert() in as a real runtime check. */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Several layers in Linux (blk,
 * bio, genhd) interpret that unit, but the default is universally
 * 512 bytes; these names merely read better than the bare numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
60593a9e7bSAlex Elder 
61a2acd00eSAlex Elder /*
62a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
63a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
64a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
65a2acd00eSAlex Elder  * -EINVAL without updating it.
66a2acd00eSAlex Elder  */
67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
68a2acd00eSAlex Elder {
69a2acd00eSAlex Elder 	unsigned int counter;
70a2acd00eSAlex Elder 
71a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
72a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
73a2acd00eSAlex Elder 		return (int)counter;
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	atomic_dec(v);
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder 	return -EINVAL;
78a2acd00eSAlex Elder }
79a2acd00eSAlex Elder 
80a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
82a2acd00eSAlex Elder {
83a2acd00eSAlex Elder 	int counter;
84a2acd00eSAlex Elder 
85a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
86a2acd00eSAlex Elder 	if (counter >= 0)
87a2acd00eSAlex Elder 		return counter;
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	atomic_inc(v);
90a2acd00eSAlex Elder 
91a2acd00eSAlex Elder 	return -EINVAL;
92a2acd00eSAlex Elder }
93a2acd00eSAlex Elder 
#define RBD_DRV_NAME			"rbd"

/* Minor-number layout */
#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

/* Snapshot naming and limits */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* 510 snapshots is the most for which a snap context fits in 4KB */
#define RBD_MAX_SNAP_COUNT		510

#define RBD_SNAP_HEAD_NAME		"-"

/* Marks an invalid index into the snapshot array */
#define BAD_SNAP_INDEX			U32_MAX

/* Sized so that one page can hold an image name sent by an OSD */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64

/* Image feature bits */
#define RBD_FEATURE_LAYERING		(1<<0)
#define RBD_FEATURE_STRIPINGV2		(1<<1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* The subset of features this client implementation supports */
#define RBD_FEATURES_SUPPORTED		(RBD_FEATURES_ALL)

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer id.  MAX_INT_FORMAT_WIDTH is used in making sure
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN			32
#define MAX_INT_FORMAT_WIDTH		((5 * sizeof (int)) / 2 + 1)
134602adf40SYehuda Sadeh 
135602adf40SYehuda Sadeh /*
136602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
137602adf40SYehuda Sadeh  */
138602adf40SYehuda Sadeh struct rbd_image_header {
139f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
140849b4260SAlex Elder 	char *object_prefix;
141602adf40SYehuda Sadeh 	__u8 obj_order;
142602adf40SYehuda Sadeh 	__u8 crypt_type;
143602adf40SYehuda Sadeh 	__u8 comp_type;
144f35a4deeSAlex Elder 	u64 stripe_unit;
145f35a4deeSAlex Elder 	u64 stripe_count;
146f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
147602adf40SYehuda Sadeh 
148f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
149f84344f3SAlex Elder 	u64 image_size;
150f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
151f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
152f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15359c2be1eSYehuda Sadeh };
15459c2be1eSYehuda Sadeh 
1550d7dbfceSAlex Elder /*
1560d7dbfceSAlex Elder  * An rbd image specification.
1570d7dbfceSAlex Elder  *
1580d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
159c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
160c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
161c66c6e0cSAlex Elder  *
162c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
163c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
164c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
165c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
166c66c6e0cSAlex Elder  *
167c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
168c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
169c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
170c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
171c66c6e0cSAlex Elder  * is shared between the parent and child).
172c66c6e0cSAlex Elder  *
173c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
174c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
175c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
176c66c6e0cSAlex Elder  *
177c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
178c66c6e0cSAlex Elder  * could be a null pointer).
1790d7dbfceSAlex Elder  */
1800d7dbfceSAlex Elder struct rbd_spec {
1810d7dbfceSAlex Elder 	u64		pool_id;
182ecb4dc22SAlex Elder 	const char	*pool_name;
1830d7dbfceSAlex Elder 
184ecb4dc22SAlex Elder 	const char	*image_id;
185ecb4dc22SAlex Elder 	const char	*image_name;
1860d7dbfceSAlex Elder 
1870d7dbfceSAlex Elder 	u64		snap_id;
188ecb4dc22SAlex Elder 	const char	*snap_name;
1890d7dbfceSAlex Elder 
1900d7dbfceSAlex Elder 	struct kref	kref;
1910d7dbfceSAlex Elder };
1920d7dbfceSAlex Elder 
193602adf40SYehuda Sadeh /*
194f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
195602adf40SYehuda Sadeh  */
196602adf40SYehuda Sadeh struct rbd_client {
197602adf40SYehuda Sadeh 	struct ceph_client	*client;
198602adf40SYehuda Sadeh 	struct kref		kref;
199602adf40SYehuda Sadeh 	struct list_head	node;
200602adf40SYehuda Sadeh };
201602adf40SYehuda Sadeh 
/* Completion callback for an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Completion callback for an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data buffer carried by an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};

enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};

/* Bit numbers used in rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* usage: 0 = standalone, 1 = image data */
	OBJ_REQ_KNOWN,		/* is EXISTS valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* target exists: 0 = no, 1 = yes */
};
226926f9b3fSAlex Elder 
227bf0d5f50SAlex Elder struct rbd_obj_request {
228bf0d5f50SAlex Elder 	const char		*object_name;
229bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
230bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
231926f9b3fSAlex Elder 	unsigned long		flags;
232bf0d5f50SAlex Elder 
233c5b5ef6cSAlex Elder 	/*
234c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
235c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
236c5b5ef6cSAlex Elder 	 *
237c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
238c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
239c5b5ef6cSAlex Elder 	 *
240c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
241c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
242c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
243c5b5ef6cSAlex Elder 	 *
244c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
245c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
246c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
247c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
248c5b5ef6cSAlex Elder 	 */
249c5b5ef6cSAlex Elder 	union {
250c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
251c5b5ef6cSAlex Elder 		struct {
252bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
253c5b5ef6cSAlex Elder 			u64			img_offset;
254c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
255c5b5ef6cSAlex Elder 			struct list_head	links;
256c5b5ef6cSAlex Elder 		};
257c5b5ef6cSAlex Elder 	};
258bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
259bf0d5f50SAlex Elder 
260bf0d5f50SAlex Elder 	enum obj_request_type	type;
261788e2df3SAlex Elder 	union {
262bf0d5f50SAlex Elder 		struct bio	*bio_list;
263788e2df3SAlex Elder 		struct {
264788e2df3SAlex Elder 			struct page	**pages;
265788e2df3SAlex Elder 			u32		page_count;
266788e2df3SAlex Elder 		};
267788e2df3SAlex Elder 	};
2680eefd470SAlex Elder 	struct page		**copyup_pages;
269ebda6408SAlex Elder 	u32			copyup_page_count;
270bf0d5f50SAlex Elder 
271bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
272bf0d5f50SAlex Elder 
273bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2741b83bef2SSage Weil 	int			result;
275bf0d5f50SAlex Elder 
276bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
277788e2df3SAlex Elder 	struct completion	completion;
278bf0d5f50SAlex Elder 
279bf0d5f50SAlex Elder 	struct kref		kref;
280bf0d5f50SAlex Elder };
281bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
	IMG_REQ_DISCARD,	/* 0 = normal, 1 = discard request */
};
2880c425248SAlex Elder 
289bf0d5f50SAlex Elder struct rbd_img_request {
290bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
291bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
292bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2930c425248SAlex Elder 	unsigned long		flags;
294bf0d5f50SAlex Elder 	union {
295bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2969849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2979849e986SAlex Elder 	};
2989849e986SAlex Elder 	union {
2999849e986SAlex Elder 		struct request		*rq;		/* block request */
3009849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
301bf0d5f50SAlex Elder 	};
3023d7efd18SAlex Elder 	struct page		**copyup_pages;
303ebda6408SAlex Elder 	u32			copyup_page_count;
304bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
305bf0d5f50SAlex Elder 	u32			next_completion;
306bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
30755f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
308a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
309bf0d5f50SAlex Elder 
310bf0d5f50SAlex Elder 	u32			obj_request_count;
311bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
312bf0d5f50SAlex Elder 
313bf0d5f50SAlex Elder 	struct kref		kref;
314bf0d5f50SAlex Elder };
315bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the _safe variant walks the list in
 * reverse order.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
322bf0d5f50SAlex Elder 
323f84344f3SAlex Elder struct rbd_mapping {
32499c1f08fSAlex Elder 	u64                     size;
32534b13184SAlex Elder 	u64                     features;
326f84344f3SAlex Elder 	bool			read_only;
327f84344f3SAlex Elder };
328f84344f3SAlex Elder 
329602adf40SYehuda Sadeh /*
330602adf40SYehuda Sadeh  * a single device
331602adf40SYehuda Sadeh  */
332602adf40SYehuda Sadeh struct rbd_device {
333de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
334602adf40SYehuda Sadeh 
335602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
336dd82fff1SIlya Dryomov 	int			minor;
337602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
338602adf40SYehuda Sadeh 
339a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
340602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
341602adf40SYehuda Sadeh 
342602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
343602adf40SYehuda Sadeh 
344b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
345602adf40SYehuda Sadeh 
346602adf40SYehuda Sadeh 	struct rbd_image_header	header;
347b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3480d7dbfceSAlex Elder 	struct rbd_spec		*spec;
349602adf40SYehuda Sadeh 
3500d7dbfceSAlex Elder 	char			*header_name;
351971f839aSAlex Elder 
3520903e875SAlex Elder 	struct ceph_file_layout	layout;
3530903e875SAlex Elder 
35459c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
355975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
35659c2be1eSYehuda Sadeh 
35786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
35886b00e0dSAlex Elder 	u64			parent_overlap;
359a2acd00eSAlex Elder 	atomic_t		parent_ref;
3602f82ee54SAlex Elder 	struct rbd_device	*parent;
36186b00e0dSAlex Elder 
3627ad18afaSChristoph Hellwig 	/* Block layer tags. */
3637ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
3647ad18afaSChristoph Hellwig 
365c666601aSJosh Durgin 	/* protects updating the header */
366c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
367f84344f3SAlex Elder 
368f84344f3SAlex Elder 	struct rbd_mapping	mapping;
369602adf40SYehuda Sadeh 
370602adf40SYehuda Sadeh 	struct list_head	node;
371dfc5606dSYehuda Sadeh 
372dfc5606dSYehuda Sadeh 	/* sysfs related */
373dfc5606dSYehuda Sadeh 	struct device		dev;
374b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
375dfc5606dSYehuda Sadeh };
376dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomic access is needed,
 * rbd_dev->lock provides it.
 *
 * At present that applies only to the "removing" flag, which is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* the mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* the mapping is in the middle of removal */
};
3886d292906SAlex Elder 
389cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
390e124a82fSAlex Elder 
391602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
392e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
393e124a82fSAlex Elder 
394602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
395432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
396602adf40SYehuda Sadeh 
39778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
39878c2a44aSAlex Elder 
3991c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
400868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
40178c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
4021c2a9dfeSAlex Elder 
4039b60e70bSIlya Dryomov static int rbd_major;
404f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
405f8a22fc2SIlya Dryomov 
406f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
407f5ee37bdSIlya Dryomov 
4089b60e70bSIlya Dryomov /*
4099b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4109b60e70bSIlya Dryomov  * userspace rbd utility.
4119b60e70bSIlya Dryomov  */
4129b60e70bSIlya Dryomov static bool single_major = false;
4139b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4149b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4159b60e70bSIlya Dryomov 
4163d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4173d7efd18SAlex Elder 
418200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
419dfc5606dSYehuda Sadeh 
420f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
421f0f8cef5SAlex Elder 		       size_t count);
422f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
423f0f8cef5SAlex Elder 			  size_t count);
4249b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4259b60e70bSIlya Dryomov 				    size_t count);
4269b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4279b60e70bSIlya Dryomov 				       size_t count);
4281f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
429a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
430f0f8cef5SAlex Elder 
4319b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4329b60e70bSIlya Dryomov {
4337e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4349b60e70bSIlya Dryomov }
4359b60e70bSIlya Dryomov 
4369b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4379b60e70bSIlya Dryomov {
4387e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4399b60e70bSIlya Dryomov }
4409b60e70bSIlya Dryomov 
441b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
442b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
4439b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
4449b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
445b15a21ddSGreg Kroah-Hartman 
446b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
447b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
448b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4499b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4509b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
451b15a21ddSGreg Kroah-Hartman 	NULL,
452f0f8cef5SAlex Elder };
45392c76dc0SIlya Dryomov 
45492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
45592c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
45692c76dc0SIlya Dryomov {
4579b60e70bSIlya Dryomov 	if (!single_major &&
4589b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4599b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4609b60e70bSIlya Dryomov 		return 0;
4619b60e70bSIlya Dryomov 
46292c76dc0SIlya Dryomov 	return attr->mode;
46392c76dc0SIlya Dryomov }
46492c76dc0SIlya Dryomov 
46592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
46692c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
46792c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
46892c76dc0SIlya Dryomov };
46992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
470f0f8cef5SAlex Elder 
471f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
472f0f8cef5SAlex Elder 	.name		= "rbd",
473b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
474f0f8cef5SAlex Elder };
475f0f8cef5SAlex Elder 
476f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
477f0f8cef5SAlex Elder {
478f0f8cef5SAlex Elder }
479f0f8cef5SAlex Elder 
480f0f8cef5SAlex Elder static struct device rbd_root_dev = {
481f0f8cef5SAlex Elder 	.init_name =    "rbd",
482f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
483f0f8cef5SAlex Elder };
484f0f8cef5SAlex Elder 
48506ecc6cbSAlex Elder static __printf(2, 3)
48606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
48706ecc6cbSAlex Elder {
48806ecc6cbSAlex Elder 	struct va_format vaf;
48906ecc6cbSAlex Elder 	va_list args;
49006ecc6cbSAlex Elder 
49106ecc6cbSAlex Elder 	va_start(args, fmt);
49206ecc6cbSAlex Elder 	vaf.fmt = fmt;
49306ecc6cbSAlex Elder 	vaf.va = &args;
49406ecc6cbSAlex Elder 
49506ecc6cbSAlex Elder 	if (!rbd_dev)
49606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
49706ecc6cbSAlex Elder 	else if (rbd_dev->disk)
49806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
49906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
50006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
50106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
50206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
50306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
50406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
50506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
50606ecc6cbSAlex Elder 	else	/* punt */
50706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
50806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
50906ecc6cbSAlex Elder 	va_end(args);
51006ecc6cbSAlex Elder }
51106ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): with RBD_DEBUG defined, log the failed expression
 * together with the enclosing function and line, then BUG().  Without
 * RBD_DEBUG it expands to a no-op and expr is not evaluated.
 *
 * The checking variant is wrapped in do { } while (0) so that it acts
 * as a single statement: it needs its trailing semicolon and cannot
 * capture a following "else".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
524dfc5606dSYehuda Sadeh 
525b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
52605a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
52705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5288b3e1a56SAlex Elder 
529cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5302df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
531a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
532e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
53354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
53454cac61fSAlex Elder 					u64 snap_id);
5352ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5362ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5372ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5382ad3d716SAlex Elder 		u64 *snap_features);
5392ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
54059c2be1eSYehuda Sadeh 
541602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
542602adf40SYehuda Sadeh {
543f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
544b82d167bSAlex Elder 	bool removing = false;
545602adf40SYehuda Sadeh 
546f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
547602adf40SYehuda Sadeh 		return -EROFS;
548602adf40SYehuda Sadeh 
549a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
550b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
551b82d167bSAlex Elder 		removing = true;
552b82d167bSAlex Elder 	else
553b82d167bSAlex Elder 		rbd_dev->open_count++;
554a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
555b82d167bSAlex Elder 	if (removing)
556b82d167bSAlex Elder 		return -ENOENT;
557b82d167bSAlex Elder 
558c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
559340c7a2bSAlex Elder 
560602adf40SYehuda Sadeh 	return 0;
561602adf40SYehuda Sadeh }
562602adf40SYehuda Sadeh 
563db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
564dfc5606dSYehuda Sadeh {
565dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
566b82d167bSAlex Elder 	unsigned long open_count_before;
567b82d167bSAlex Elder 
568a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
569b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
570a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
571b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
572dfc5606dSYehuda Sadeh 
573c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
574dfc5606dSYehuda Sadeh }
575dfc5606dSYehuda Sadeh 
576131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
577131fd9f6SGuangliang Zhao {
57877f33c03SJosh Durgin 	int ret = 0;
579131fd9f6SGuangliang Zhao 	int val;
580131fd9f6SGuangliang Zhao 	bool ro;
58177f33c03SJosh Durgin 	bool ro_changed = false;
582131fd9f6SGuangliang Zhao 
58377f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
584131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
585131fd9f6SGuangliang Zhao 		return -EFAULT;
586131fd9f6SGuangliang Zhao 
587131fd9f6SGuangliang Zhao 	ro = val ? true : false;
588131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
589131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
590131fd9f6SGuangliang Zhao 		return -EROFS;
591131fd9f6SGuangliang Zhao 
59277f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
59377f33c03SJosh Durgin 	/* prevent others open this device */
59477f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
59577f33c03SJosh Durgin 		ret = -EBUSY;
59677f33c03SJosh Durgin 		goto out;
597131fd9f6SGuangliang Zhao 	}
598131fd9f6SGuangliang Zhao 
59977f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
60077f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
60177f33c03SJosh Durgin 		ro_changed = true;
60277f33c03SJosh Durgin 	}
60377f33c03SJosh Durgin 
60477f33c03SJosh Durgin out:
60577f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
60677f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
60777f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
60877f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
60977f33c03SJosh Durgin 
61077f33c03SJosh Durgin 	return ret;
611131fd9f6SGuangliang Zhao }
612131fd9f6SGuangliang Zhao 
613131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
614131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
615131fd9f6SGuangliang Zhao {
616131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
617131fd9f6SGuangliang Zhao 	int ret = 0;
618131fd9f6SGuangliang Zhao 
619131fd9f6SGuangliang Zhao 	switch (cmd) {
620131fd9f6SGuangliang Zhao 	case BLKROSET:
621131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
622131fd9f6SGuangliang Zhao 		break;
623131fd9f6SGuangliang Zhao 	default:
624131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
625131fd9f6SGuangliang Zhao 	}
626131fd9f6SGuangliang Zhao 
627131fd9f6SGuangliang Zhao 	return ret;
628131fd9f6SGuangliang Zhao }
629131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl handler (only built when CONFIG_COMPAT is set).  It
 * passes its arguments through to rbd_ioctl() unchanged.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
637131fd9f6SGuangliang Zhao 
638602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
639602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
640602adf40SYehuda Sadeh 	.open			= rbd_open,
641dfc5606dSYehuda Sadeh 	.release		= rbd_release,
642131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
643131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
644131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
645131fd9f6SGuangliang Zhao #endif
646602adf40SYehuda Sadeh };
647602adf40SYehuda Sadeh 
648602adf40SYehuda Sadeh /*
6497262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
650cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
651602adf40SYehuda Sadeh  */
652f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
653602adf40SYehuda Sadeh {
654602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
655602adf40SYehuda Sadeh 	int ret = -ENOMEM;
656602adf40SYehuda Sadeh 
65737206ee5SAlex Elder 	dout("%s:\n", __func__);
658602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
659602adf40SYehuda Sadeh 	if (!rbdc)
660602adf40SYehuda Sadeh 		goto out_opt;
661602adf40SYehuda Sadeh 
662602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
663602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
664602adf40SYehuda Sadeh 
66543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
666602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
66708f75463SAlex Elder 		goto out_rbdc;
66843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
671602adf40SYehuda Sadeh 	if (ret < 0)
67208f75463SAlex Elder 		goto out_client;
673602adf40SYehuda Sadeh 
674432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
675602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
676432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
677602adf40SYehuda Sadeh 
67837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
679bc534d86SAlex Elder 
680602adf40SYehuda Sadeh 	return rbdc;
68108f75463SAlex Elder out_client:
682602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
68308f75463SAlex Elder out_rbdc:
684602adf40SYehuda Sadeh 	kfree(rbdc);
685602adf40SYehuda Sadeh out_opt:
68643ae4701SAlex Elder 	if (ceph_opts)
68743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
68837206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
68937206ee5SAlex Elder 
69028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
691602adf40SYehuda Sadeh }
692602adf40SYehuda Sadeh 
6932f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6942f82ee54SAlex Elder {
6952f82ee54SAlex Elder 	kref_get(&rbdc->kref);
6962f82ee54SAlex Elder 
6972f82ee54SAlex Elder 	return rbdc;
6982f82ee54SAlex Elder }
6992f82ee54SAlex Elder 
700602adf40SYehuda Sadeh /*
7011f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7021f7ba331SAlex Elder  * found, bump its reference count.
703602adf40SYehuda Sadeh  */
7041f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
705602adf40SYehuda Sadeh {
706602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7071f7ba331SAlex Elder 	bool found = false;
708602adf40SYehuda Sadeh 
70943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
710602adf40SYehuda Sadeh 		return NULL;
711602adf40SYehuda Sadeh 
7121f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7131f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7141f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7152f82ee54SAlex Elder 			__rbd_get_client(client_node);
7162f82ee54SAlex Elder 
7171f7ba331SAlex Elder 			found = true;
7181f7ba331SAlex Elder 			break;
7191f7ba331SAlex Elder 		}
7201f7ba331SAlex Elder 	}
7211f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7221f7ba331SAlex Elder 
7231f7ba331SAlex Elder 	return found ? client_node : NULL;
724602adf40SYehuda Sadeh }
725602adf40SYehuda Sadeh 
/*
 * Mount option tokens.
 *
 * The Opt_last_* enumerators are range markers, not options:
 * parse_rbd_opts_token() compares a token against them to tell
 * which kind of argument (int, string, Boolean) the option takes.
 * Enumerator order therefore matters.
 */
enum {
	/* (no int options are defined) */
	Opt_last_int,		/* int args above */
	/* (no string options are defined) */
	Opt_last_string,	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,		/* Boolean args above */
};
73959c2be1eSYehuda Sadeh 
74043ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
74159c2be1eSYehuda Sadeh 	/* int args above */
74259c2be1eSYehuda Sadeh 	/* string args above */
743be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
744cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
745cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
746cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
747cc0538b6SAlex Elder 	/* Boolean args above */
74859c2be1eSYehuda Sadeh 	{-1, NULL}
74959c2be1eSYehuda Sadeh };
75059c2be1eSYehuda Sadeh 
75198571b5aSAlex Elder struct rbd_options {
75298571b5aSAlex Elder 	bool	read_only;
75398571b5aSAlex Elder };
75498571b5aSAlex Elder 
75598571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
75698571b5aSAlex Elder 
75759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
75859c2be1eSYehuda Sadeh {
75943ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
76059c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
76159c2be1eSYehuda Sadeh 	int token, intval, ret;
76259c2be1eSYehuda Sadeh 
76343ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
76459c2be1eSYehuda Sadeh 	if (token < 0)
76559c2be1eSYehuda Sadeh 		return -EINVAL;
76659c2be1eSYehuda Sadeh 
76759c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
76859c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
76959c2be1eSYehuda Sadeh 		if (ret < 0) {
77059c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
77159c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
77259c2be1eSYehuda Sadeh 			return ret;
77359c2be1eSYehuda Sadeh 		}
77459c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
77559c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
77659c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
77759c2be1eSYehuda Sadeh 		     argstr[0].from);
778cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
779cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
78059c2be1eSYehuda Sadeh 	} else {
78159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
78259c2be1eSYehuda Sadeh 	}
78359c2be1eSYehuda Sadeh 
78459c2be1eSYehuda Sadeh 	switch (token) {
785cc0538b6SAlex Elder 	case Opt_read_only:
786cc0538b6SAlex Elder 		rbd_opts->read_only = true;
787cc0538b6SAlex Elder 		break;
788cc0538b6SAlex Elder 	case Opt_read_write:
789cc0538b6SAlex Elder 		rbd_opts->read_only = false;
790cc0538b6SAlex Elder 		break;
79159c2be1eSYehuda Sadeh 	default:
792aafb230eSAlex Elder 		rbd_assert(false);
793aafb230eSAlex Elder 		break;
79459c2be1eSYehuda Sadeh 	}
79559c2be1eSYehuda Sadeh 	return 0;
79659c2be1eSYehuda Sadeh }
79759c2be1eSYehuda Sadeh 
7986d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
7996d2940c8SGuangliang Zhao {
8006d2940c8SGuangliang Zhao 	switch (op_type) {
8016d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8026d2940c8SGuangliang Zhao 		return "read";
8036d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8046d2940c8SGuangliang Zhao 		return "write";
80590e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
80690e98c52SGuangliang Zhao 		return "discard";
8076d2940c8SGuangliang Zhao 	default:
8086d2940c8SGuangliang Zhao 		return "???";
8096d2940c8SGuangliang Zhao 	}
8106d2940c8SGuangliang Zhao }
8116d2940c8SGuangliang Zhao 
81259c2be1eSYehuda Sadeh /*
813602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8147262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8157262cfcaSAlex Elder  * function.
816602adf40SYehuda Sadeh  */
8179d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
818602adf40SYehuda Sadeh {
819f8c38929SAlex Elder 	struct rbd_client *rbdc;
82059c2be1eSYehuda Sadeh 
821cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8221f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8239d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
82443ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8259d3997fdSAlex Elder 	else
826f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
827cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
828d720bcb0SAlex Elder 
8299d3997fdSAlex Elder 	return rbdc;
830602adf40SYehuda Sadeh }
831602adf40SYehuda Sadeh 
832602adf40SYehuda Sadeh /*
833602adf40SYehuda Sadeh  * Destroy ceph client
834d23a4b3fSAlex Elder  *
835432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
836602adf40SYehuda Sadeh  */
837602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
838602adf40SYehuda Sadeh {
839602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
840602adf40SYehuda Sadeh 
84137206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
842cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
843602adf40SYehuda Sadeh 	list_del(&rbdc->node);
844cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
845602adf40SYehuda Sadeh 
846602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
847602adf40SYehuda Sadeh 	kfree(rbdc);
848602adf40SYehuda Sadeh }
849602adf40SYehuda Sadeh 
850602adf40SYehuda Sadeh /*
851602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
852602adf40SYehuda Sadeh  * it.
853602adf40SYehuda Sadeh  */
8549d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
855602adf40SYehuda Sadeh {
856c53d5893SAlex Elder 	if (rbdc)
8579d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
858602adf40SYehuda Sadeh }
859602adf40SYehuda Sadeh 
860a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
861a30b71b9SAlex Elder {
862a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
863a30b71b9SAlex Elder }
864a30b71b9SAlex Elder 
8658e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8668e94af8eSAlex Elder {
867103a150fSAlex Elder 	size_t size;
868103a150fSAlex Elder 	u32 snap_count;
869103a150fSAlex Elder 
870103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
871103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
872103a150fSAlex Elder 		return false;
873103a150fSAlex Elder 
874db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
875db2388b6SAlex Elder 
876db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
877db2388b6SAlex Elder 		return false;
878db2388b6SAlex Elder 
879db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
880db2388b6SAlex Elder 
881db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
882db2388b6SAlex Elder 		return false;
883db2388b6SAlex Elder 
884103a150fSAlex Elder 	/*
885103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
886103a150fSAlex Elder 	 * that limits the number of snapshots.
887103a150fSAlex Elder 	 */
888103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
889103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
890103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
891103a150fSAlex Elder 		return false;
892103a150fSAlex Elder 
893103a150fSAlex Elder 	/*
894103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
895103a150fSAlex Elder 	 * header must also be representable in a size_t.
896103a150fSAlex Elder 	 */
897103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
898103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
899103a150fSAlex Elder 		return false;
900103a150fSAlex Elder 
901103a150fSAlex Elder 	return true;
9028e94af8eSAlex Elder }
9038e94af8eSAlex Elder 
904602adf40SYehuda Sadeh /*
905bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
906bb23e37aSAlex Elder  * on-disk header.
907602adf40SYehuda Sadeh  */
908662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9094156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
910602adf40SYehuda Sadeh {
911662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
912bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
913bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
914bb23e37aSAlex Elder 	char *object_prefix = NULL;
915bb23e37aSAlex Elder 	char *snap_names = NULL;
916bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
917ccece235SAlex Elder 	u32 snap_count;
918d2bb24e5SAlex Elder 	size_t size;
919bb23e37aSAlex Elder 	int ret = -ENOMEM;
920621901d6SAlex Elder 	u32 i;
921602adf40SYehuda Sadeh 
922bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
923103a150fSAlex Elder 
924bb23e37aSAlex Elder 	if (first_time) {
925bb23e37aSAlex Elder 		size_t len;
926bb23e37aSAlex Elder 
927bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
928bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
929bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
930bb23e37aSAlex Elder 		if (!object_prefix)
931602adf40SYehuda Sadeh 			return -ENOMEM;
932bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
933bb23e37aSAlex Elder 		object_prefix[len] = '\0';
934bb23e37aSAlex Elder 	}
93500f1f36fSAlex Elder 
936bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
937d2bb24e5SAlex Elder 
938602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
939bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
940bb23e37aSAlex Elder 	if (!snapc)
941bb23e37aSAlex Elder 		goto out_err;
942bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
943602adf40SYehuda Sadeh 	if (snap_count) {
944bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
945f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
946f785cc1dSAlex Elder 
947bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
948621901d6SAlex Elder 
949f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
950bb23e37aSAlex Elder 			goto out_2big;
951bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
952bb23e37aSAlex Elder 		if (!snap_names)
953602adf40SYehuda Sadeh 			goto out_err;
954bb23e37aSAlex Elder 
955bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
956bb23e37aSAlex Elder 
957bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
958bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
959bb23e37aSAlex Elder 		if (!snap_sizes)
960bb23e37aSAlex Elder 			goto out_err;
961bb23e37aSAlex Elder 
962f785cc1dSAlex Elder 		/*
963bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
964bb23e37aSAlex Elder 		 * and size.
965bb23e37aSAlex Elder 		 *
96699a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
967bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
968f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
969f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
970f785cc1dSAlex Elder 		 */
971bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
972bb23e37aSAlex Elder 		snaps = ondisk->snaps;
973bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
974bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
975bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
976bb23e37aSAlex Elder 		}
977602adf40SYehuda Sadeh 	}
978849b4260SAlex Elder 
979bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
980bb23e37aSAlex Elder 
981bb23e37aSAlex Elder 	if (first_time) {
982bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
983602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
984602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
985602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
986bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
987bb23e37aSAlex Elder 		header->stripe_unit = 0;
988bb23e37aSAlex Elder 		header->stripe_count = 0;
989bb23e37aSAlex Elder 		header->features = 0;
990662518b1SAlex Elder 	} else {
991662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
992662518b1SAlex Elder 		kfree(header->snap_names);
993662518b1SAlex Elder 		kfree(header->snap_sizes);
994bb23e37aSAlex Elder 	}
9956a52325fSAlex Elder 
996bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
997621901d6SAlex Elder 
998f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
999bb23e37aSAlex Elder 	header->snapc = snapc;
1000bb23e37aSAlex Elder 	header->snap_names = snap_names;
1001bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1002468521c1SAlex Elder 
1003602adf40SYehuda Sadeh 	return 0;
1004bb23e37aSAlex Elder out_2big:
1005bb23e37aSAlex Elder 	ret = -EIO;
10066a52325fSAlex Elder out_err:
1007bb23e37aSAlex Elder 	kfree(snap_sizes);
1008bb23e37aSAlex Elder 	kfree(snap_names);
1009bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1010bb23e37aSAlex Elder 	kfree(object_prefix);
1011ccece235SAlex Elder 
1012bb23e37aSAlex Elder 	return ret;
1013602adf40SYehuda Sadeh }
1014602adf40SYehuda Sadeh 
10159682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10169682fc6dSAlex Elder {
10179682fc6dSAlex Elder 	const char *snap_name;
10189682fc6dSAlex Elder 
10199682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10209682fc6dSAlex Elder 
10219682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10229682fc6dSAlex Elder 
10239682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10249682fc6dSAlex Elder 	while (which--)
10259682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10269682fc6dSAlex Elder 
10279682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10289682fc6dSAlex Elder }
10299682fc6dSAlex Elder 
103030d1cff8SAlex Elder /*
103130d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
103230d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
103330d1cff8SAlex Elder  */
103430d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
103530d1cff8SAlex Elder {
103630d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
103730d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
103830d1cff8SAlex Elder 
103930d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
104030d1cff8SAlex Elder 		return 1;
104130d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
104230d1cff8SAlex Elder }
104330d1cff8SAlex Elder 
104430d1cff8SAlex Elder /*
104530d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
104630d1cff8SAlex Elder  * present.
104730d1cff8SAlex Elder  *
104830d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
104930d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
105030d1cff8SAlex Elder  *
105130d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
105230d1cff8SAlex Elder  * reverse order, highest snapshot id first.
105330d1cff8SAlex Elder  */
10549682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10559682fc6dSAlex Elder {
10569682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
105730d1cff8SAlex Elder 	u64 *found;
10589682fc6dSAlex Elder 
105930d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
106030d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10619682fc6dSAlex Elder 
106230d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10639682fc6dSAlex Elder }
10649682fc6dSAlex Elder 
10652ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10662ad3d716SAlex Elder 					u64 snap_id)
106754cac61fSAlex Elder {
106854cac61fSAlex Elder 	u32 which;
1069da6a6b63SJosh Durgin 	const char *snap_name;
107054cac61fSAlex Elder 
107154cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
107254cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1073da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
107454cac61fSAlex Elder 
1075da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1076da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
107754cac61fSAlex Elder }
107854cac61fSAlex Elder 
10799e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10809e15b77dSAlex Elder {
10819e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10829e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10839e15b77dSAlex Elder 
108454cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
108554cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
108654cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10879e15b77dSAlex Elder 
108854cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10899e15b77dSAlex Elder }
10909e15b77dSAlex Elder 
10912ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10922ad3d716SAlex Elder 				u64 *snap_size)
1093602adf40SYehuda Sadeh {
10942ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10952ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10962ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
10972ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10982ad3d716SAlex Elder 		u32 which;
109900f1f36fSAlex Elder 
11002ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11012ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11022ad3d716SAlex Elder 			return -ENOENT;
110300f1f36fSAlex Elder 
11042ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11052ad3d716SAlex Elder 	} else {
11062ad3d716SAlex Elder 		u64 size = 0;
11072ad3d716SAlex Elder 		int ret;
11082ad3d716SAlex Elder 
11092ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11102ad3d716SAlex Elder 		if (ret)
11112ad3d716SAlex Elder 			return ret;
11122ad3d716SAlex Elder 
11132ad3d716SAlex Elder 		*snap_size = size;
11142ad3d716SAlex Elder 	}
11152ad3d716SAlex Elder 	return 0;
11162ad3d716SAlex Elder }
11172ad3d716SAlex Elder 
11182ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11192ad3d716SAlex Elder 			u64 *snap_features)
11202ad3d716SAlex Elder {
11212ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11222ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11232ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11242ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11252ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11262ad3d716SAlex Elder 	} else {
11272ad3d716SAlex Elder 		u64 features = 0;
11282ad3d716SAlex Elder 		int ret;
11292ad3d716SAlex Elder 
11302ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11312ad3d716SAlex Elder 		if (ret)
11322ad3d716SAlex Elder 			return ret;
11332ad3d716SAlex Elder 
11342ad3d716SAlex Elder 		*snap_features = features;
11352ad3d716SAlex Elder 	}
11362ad3d716SAlex Elder 	return 0;
113700f1f36fSAlex Elder }
1138602adf40SYehuda Sadeh 
1139d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1140602adf40SYehuda Sadeh {
11418f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11422ad3d716SAlex Elder 	u64 size = 0;
11432ad3d716SAlex Elder 	u64 features = 0;
11442ad3d716SAlex Elder 	int ret;
11458b0241f8SAlex Elder 
11462ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11472ad3d716SAlex Elder 	if (ret)
11482ad3d716SAlex Elder 		return ret;
11492ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11502ad3d716SAlex Elder 	if (ret)
11512ad3d716SAlex Elder 		return ret;
11522ad3d716SAlex Elder 
11532ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11542ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11552ad3d716SAlex Elder 
11568b0241f8SAlex Elder 	return 0;
1157602adf40SYehuda Sadeh }
1158602adf40SYehuda Sadeh 
1159d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1160d1cf5788SAlex Elder {
1161d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1162d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1163200a6a8bSAlex Elder }
1164200a6a8bSAlex Elder 
11657d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
11667d5079aaSHimangi Saraogi {
11677d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
11687d5079aaSHimangi Saraogi 
11697d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
11707d5079aaSHimangi Saraogi }
11717d5079aaSHimangi Saraogi 
117298571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1173602adf40SYehuda Sadeh {
117465ccfe21SAlex Elder 	char *name;
117565ccfe21SAlex Elder 	u64 segment;
117665ccfe21SAlex Elder 	int ret;
11773a96d5cdSJosh Durgin 	char *name_format;
1178602adf40SYehuda Sadeh 
117978c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
118065ccfe21SAlex Elder 	if (!name)
118165ccfe21SAlex Elder 		return NULL;
118265ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11833a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11843a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11853a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11862d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
118765ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11882d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
118965ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
119065ccfe21SAlex Elder 			segment, ret);
11917d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
119265ccfe21SAlex Elder 		name = NULL;
119365ccfe21SAlex Elder 	}
1194602adf40SYehuda Sadeh 
119565ccfe21SAlex Elder 	return name;
119665ccfe21SAlex Elder }
1197602adf40SYehuda Sadeh 
119865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
119965ccfe21SAlex Elder {
120065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1201602adf40SYehuda Sadeh 
120265ccfe21SAlex Elder 	return offset & (segment_size - 1);
120365ccfe21SAlex Elder }
120465ccfe21SAlex Elder 
120565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
120665ccfe21SAlex Elder 				u64 offset, u64 length)
120765ccfe21SAlex Elder {
120865ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
120965ccfe21SAlex Elder 
121065ccfe21SAlex Elder 	offset &= segment_size - 1;
121165ccfe21SAlex Elder 
1212aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
121365ccfe21SAlex Elder 	if (offset + length > segment_size)
121465ccfe21SAlex Elder 		length = segment_size - offset;
121565ccfe21SAlex Elder 
121665ccfe21SAlex Elder 	return length;
1217602adf40SYehuda Sadeh }
1218602adf40SYehuda Sadeh 
1219602adf40SYehuda Sadeh /*
1220029bcbd8SJosh Durgin  * returns the size of an object in the image
1221029bcbd8SJosh Durgin  */
1222029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1223029bcbd8SJosh Durgin {
1224029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1225029bcbd8SJosh Durgin }
1226029bcbd8SJosh Durgin 
1227029bcbd8SJosh Durgin /*
1228602adf40SYehuda Sadeh  * bio helpers
1229602adf40SYehuda Sadeh  */
1230602adf40SYehuda Sadeh 
1231602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1232602adf40SYehuda Sadeh {
1233602adf40SYehuda Sadeh 	struct bio *tmp;
1234602adf40SYehuda Sadeh 
1235602adf40SYehuda Sadeh 	while (chain) {
1236602adf40SYehuda Sadeh 		tmp = chain;
1237602adf40SYehuda Sadeh 		chain = chain->bi_next;
1238602adf40SYehuda Sadeh 		bio_put(tmp);
1239602adf40SYehuda Sadeh 	}
1240602adf40SYehuda Sadeh }
1241602adf40SYehuda Sadeh 
1242602adf40SYehuda Sadeh /*
1243602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1244602adf40SYehuda Sadeh  */
1245602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1246602adf40SYehuda Sadeh {
12477988613bSKent Overstreet 	struct bio_vec bv;
12487988613bSKent Overstreet 	struct bvec_iter iter;
1249602adf40SYehuda Sadeh 	unsigned long flags;
1250602adf40SYehuda Sadeh 	void *buf;
1251602adf40SYehuda Sadeh 	int pos = 0;
1252602adf40SYehuda Sadeh 
1253602adf40SYehuda Sadeh 	while (chain) {
12547988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12557988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1256602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12577988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1258602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12597988613bSKent Overstreet 				       bv.bv_len - remainder);
12607988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
126185b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1262602adf40SYehuda Sadeh 			}
12637988613bSKent Overstreet 			pos += bv.bv_len;
1264602adf40SYehuda Sadeh 		}
1265602adf40SYehuda Sadeh 
1266602adf40SYehuda Sadeh 		chain = chain->bi_next;
1267602adf40SYehuda Sadeh 	}
1268602adf40SYehuda Sadeh }
1269602adf40SYehuda Sadeh 
1270602adf40SYehuda Sadeh /*
1271b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1272b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1273b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1274b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1275b9434c5bSAlex Elder  */
1276b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1277b9434c5bSAlex Elder {
1278b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1279b9434c5bSAlex Elder 
1280b9434c5bSAlex Elder 	rbd_assert(end > offset);
1281b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1282b9434c5bSAlex Elder 	while (offset < end) {
1283b9434c5bSAlex Elder 		size_t page_offset;
1284b9434c5bSAlex Elder 		size_t length;
1285b9434c5bSAlex Elder 		unsigned long flags;
1286b9434c5bSAlex Elder 		void *kaddr;
1287b9434c5bSAlex Elder 
1288491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1289491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1290b9434c5bSAlex Elder 		local_irq_save(flags);
1291b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1292b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1293e2156054SAlex Elder 		flush_dcache_page(*page);
1294b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1295b9434c5bSAlex Elder 		local_irq_restore(flags);
1296b9434c5bSAlex Elder 
1297b9434c5bSAlex Elder 		offset += length;
1298b9434c5bSAlex Elder 		page++;
1299b9434c5bSAlex Elder 	}
1300b9434c5bSAlex Elder }
1301b9434c5bSAlex Elder 
1302b9434c5bSAlex Elder /*
1303f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1304f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1305602adf40SYehuda Sadeh  */
1306f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1307f7760dadSAlex Elder 					unsigned int offset,
1308f7760dadSAlex Elder 					unsigned int len,
1309f7760dadSAlex Elder 					gfp_t gfpmask)
1310602adf40SYehuda Sadeh {
1311f7760dadSAlex Elder 	struct bio *bio;
1312602adf40SYehuda Sadeh 
13135341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1314f7760dadSAlex Elder 	if (!bio)
1315f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1316f7760dadSAlex Elder 
13175341a627SKent Overstreet 	bio_advance(bio, offset);
13184f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1319602adf40SYehuda Sadeh 
1320f7760dadSAlex Elder 	return bio;
1321602adf40SYehuda Sadeh }
1322602adf40SYehuda Sadeh 
1323f7760dadSAlex Elder /*
1324f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1325f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1326f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1327f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1328f7760dadSAlex Elder  *
1329f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1330f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1331f7760dadSAlex Elder  * the start of data to be cloned is located.
1332f7760dadSAlex Elder  *
1333f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1334f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1335f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1336f7760dadSAlex Elder  */
1337f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1338f7760dadSAlex Elder 					unsigned int *offset,
1339f7760dadSAlex Elder 					unsigned int len,
1340f7760dadSAlex Elder 					gfp_t gfpmask)
1341f7760dadSAlex Elder {
1342f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1343f7760dadSAlex Elder 	unsigned int off = *offset;
1344f7760dadSAlex Elder 	struct bio *chain = NULL;
1345f7760dadSAlex Elder 	struct bio **end;
1346602adf40SYehuda Sadeh 
1347f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1348602adf40SYehuda Sadeh 
13494f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1350f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1351602adf40SYehuda Sadeh 
1352f7760dadSAlex Elder 	end = &chain;
1353f7760dadSAlex Elder 	while (len) {
1354f7760dadSAlex Elder 		unsigned int bi_size;
1355f7760dadSAlex Elder 		struct bio *bio;
1356f7760dadSAlex Elder 
1357f5400b7aSAlex Elder 		if (!bi) {
1358f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1359f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1360f5400b7aSAlex Elder 		}
13614f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1362f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1363f7760dadSAlex Elder 		if (!bio)
1364f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1365f7760dadSAlex Elder 
1366f7760dadSAlex Elder 		*end = bio;
1367f7760dadSAlex Elder 		end = &bio->bi_next;
1368f7760dadSAlex Elder 
1369f7760dadSAlex Elder 		off += bi_size;
13704f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1371f7760dadSAlex Elder 			bi = bi->bi_next;
1372f7760dadSAlex Elder 			off = 0;
1373f7760dadSAlex Elder 		}
1374f7760dadSAlex Elder 		len -= bi_size;
1375f7760dadSAlex Elder 	}
1376f7760dadSAlex Elder 	*bio_src = bi;
1377f7760dadSAlex Elder 	*offset = off;
1378f7760dadSAlex Elder 
1379f7760dadSAlex Elder 	return chain;
1380f7760dadSAlex Elder out_err:
1381f7760dadSAlex Elder 	bio_chain_put(chain);
1382f7760dadSAlex Elder 
1383602adf40SYehuda Sadeh 	return NULL;
1384602adf40SYehuda Sadeh }
1385602adf40SYehuda Sadeh 
1386926f9b3fSAlex Elder /*
1387926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1388926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1389926f9b3fSAlex Elder  * again.
1390926f9b3fSAlex Elder  */
13916365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13926365d33aSAlex Elder {
13936365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13946365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13956365d33aSAlex Elder 
139657acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13979584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
13986365d33aSAlex Elder 			obj_request);
13996365d33aSAlex Elder 	}
14006365d33aSAlex Elder }
14016365d33aSAlex Elder 
14026365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14036365d33aSAlex Elder {
14046365d33aSAlex Elder 	smp_mb();
14056365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14066365d33aSAlex Elder }
14076365d33aSAlex Elder 
140857acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
140957acbaa7SAlex Elder {
141057acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
141157acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
141257acbaa7SAlex Elder 
141357acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
141457acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14159584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
141657acbaa7SAlex Elder 			obj_request);
141757acbaa7SAlex Elder 	}
141857acbaa7SAlex Elder }
141957acbaa7SAlex Elder 
142057acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
142157acbaa7SAlex Elder {
142257acbaa7SAlex Elder 	smp_mb();
142357acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
142457acbaa7SAlex Elder }
142557acbaa7SAlex Elder 
14265679c59fSAlex Elder /*
14275679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14285679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14295679c59fSAlex Elder  *
14305679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14315679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14325679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14335679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14345679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14355679c59fSAlex Elder  */
14365679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14375679c59fSAlex Elder 				bool exists)
14385679c59fSAlex Elder {
14395679c59fSAlex Elder 	if (exists)
14405679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14415679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14425679c59fSAlex Elder 	smp_mb();
14435679c59fSAlex Elder }
14445679c59fSAlex Elder 
14455679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14465679c59fSAlex Elder {
14475679c59fSAlex Elder 	smp_mb();
14485679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14495679c59fSAlex Elder }
14505679c59fSAlex Elder 
14515679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14525679c59fSAlex Elder {
14535679c59fSAlex Elder 	smp_mb();
14545679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14555679c59fSAlex Elder }
14565679c59fSAlex Elder 
14579638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
14589638556aSIlya Dryomov {
14599638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
14609638556aSIlya Dryomov 
14619638556aSIlya Dryomov 	return obj_request->img_offset <
14629638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
14639638556aSIlya Dryomov }
14649638556aSIlya Dryomov 
1465bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1466bf0d5f50SAlex Elder {
146737206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
146837206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1469bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1470bf0d5f50SAlex Elder }
1471bf0d5f50SAlex Elder 
1472bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1473bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1474bf0d5f50SAlex Elder {
1475bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
147637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
147737206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1478bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1479bf0d5f50SAlex Elder }
1480bf0d5f50SAlex Elder 
14810f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14820f2d5be7SAlex Elder {
14830f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14840f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14850f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14860f2d5be7SAlex Elder }
14870f2d5be7SAlex Elder 
1488e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1489e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1490bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1491bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1492bf0d5f50SAlex Elder {
1493bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
149437206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
149537206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1496e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1497e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1498e93f3152SAlex Elder 	else
1499bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1500bf0d5f50SAlex Elder }
1501bf0d5f50SAlex Elder 
1502bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1503bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1504bf0d5f50SAlex Elder {
150525dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
150625dcf954SAlex Elder 
1507b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1508bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
150925dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15106365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15116365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1512bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
151325dcf954SAlex Elder 	img_request->obj_request_count++;
151425dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
151537206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
151637206ee5SAlex Elder 		obj_request->which);
1517bf0d5f50SAlex Elder }
1518bf0d5f50SAlex Elder 
1519bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1520bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1521bf0d5f50SAlex Elder {
1522bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
152325dcf954SAlex Elder 
152437206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
152537206ee5SAlex Elder 		obj_request->which);
1526bf0d5f50SAlex Elder 	list_del(&obj_request->links);
152725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
152825dcf954SAlex Elder 	img_request->obj_request_count--;
152925dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
153025dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15316365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1532bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1533bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
153425dcf954SAlex Elder 	obj_request->callback = NULL;
1535bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1536bf0d5f50SAlex Elder }
1537bf0d5f50SAlex Elder 
1538bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1539bf0d5f50SAlex Elder {
1540bf0d5f50SAlex Elder 	switch (type) {
15419969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1542bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1543788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1544bf0d5f50SAlex Elder 		return true;
1545bf0d5f50SAlex Elder 	default:
1546bf0d5f50SAlex Elder 		return false;
1547bf0d5f50SAlex Elder 	}
1548bf0d5f50SAlex Elder }
1549bf0d5f50SAlex Elder 
1550bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1551bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1552bf0d5f50SAlex Elder {
155371c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1554bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1555bf0d5f50SAlex Elder }
1556bf0d5f50SAlex Elder 
155771c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
155871c20a06SIlya Dryomov {
155971c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
156071c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
156171c20a06SIlya Dryomov }
156271c20a06SIlya Dryomov 
156371c20a06SIlya Dryomov /*
156471c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
156571c20a06SIlya Dryomov  * underlying osd request.
15662894e1d7SIlya Dryomov  *
15672894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
156871c20a06SIlya Dryomov  */
15692894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
15702894e1d7SIlya Dryomov 				  unsigned long timeout)
157171c20a06SIlya Dryomov {
15722894e1d7SIlya Dryomov 	long ret;
157371c20a06SIlya Dryomov 
157471c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
15752894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
15762894e1d7SIlya Dryomov 					&obj_request->completion,
15772894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
15782894e1d7SIlya Dryomov 	if (ret <= 0) {
15792894e1d7SIlya Dryomov 		if (ret == 0)
15802894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
158171c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
15822894e1d7SIlya Dryomov 	} else {
15832894e1d7SIlya Dryomov 		ret = 0;
15842894e1d7SIlya Dryomov 	}
15852894e1d7SIlya Dryomov 
15862894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
158771c20a06SIlya Dryomov 	return ret;
158871c20a06SIlya Dryomov }
158971c20a06SIlya Dryomov 
/* Wait for an object request to complete, with no time limit. */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	/* __rbd_obj_request_wait() documents 0 as "wait forever" */
	const unsigned long wait_forever = 0;

	return __rbd_obj_request_wait(obj_request, wait_forever);
}
15942894e1d7SIlya Dryomov 
/*
 * Wait for an object request to complete, giving up after @timeout
 * jiffies.  See __rbd_obj_request_wait() for the return values.
 */
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
					unsigned long timeout)
{
	int ret = __rbd_obj_request_wait(obj_request, timeout);

	return ret;
}
160071c20a06SIlya Dryomov 
1601bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1602bf0d5f50SAlex Elder {
160355f27e09SAlex Elder 
160437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
160555f27e09SAlex Elder 
160655f27e09SAlex Elder 	/*
160755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
160855f27e09SAlex Elder 	 * count for the image request.  We could instead use
160955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
161055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
161155f27e09SAlex Elder 	 */
161255f27e09SAlex Elder 	if (!img_request->result) {
161355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
161455f27e09SAlex Elder 		u64 xferred = 0;
161555f27e09SAlex Elder 
161655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
161755f27e09SAlex Elder 			xferred += obj_request->xferred;
161855f27e09SAlex Elder 		img_request->xferred = xferred;
161955f27e09SAlex Elder 	}
162055f27e09SAlex Elder 
1621bf0d5f50SAlex Elder 	if (img_request->callback)
1622bf0d5f50SAlex Elder 		img_request->callback(img_request);
1623bf0d5f50SAlex Elder 	else
1624bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1625bf0d5f50SAlex Elder }
1626bf0d5f50SAlex Elder 
16270c425248SAlex Elder /*
16280c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16290c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16300c425248SAlex Elder  * and currently never change thereafter.
16310c425248SAlex Elder  */
16320c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16330c425248SAlex Elder {
16340c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16350c425248SAlex Elder 	smp_mb();
16360c425248SAlex Elder }
16370c425248SAlex Elder 
16380c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16390c425248SAlex Elder {
16400c425248SAlex Elder 	smp_mb();
16410c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16420c425248SAlex Elder }
16430c425248SAlex Elder 
164490e98c52SGuangliang Zhao /*
164590e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
164690e98c52SGuangliang Zhao  */
164790e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
164890e98c52SGuangliang Zhao {
164990e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
165090e98c52SGuangliang Zhao 	smp_mb();
165190e98c52SGuangliang Zhao }
165290e98c52SGuangliang Zhao 
165390e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
165490e98c52SGuangliang Zhao {
165590e98c52SGuangliang Zhao 	smp_mb();
165690e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
165790e98c52SGuangliang Zhao }
165890e98c52SGuangliang Zhao 
16599849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16609849e986SAlex Elder {
16619849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16629849e986SAlex Elder 	smp_mb();
16639849e986SAlex Elder }
16649849e986SAlex Elder 
1665e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1666e93f3152SAlex Elder {
1667e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1668e93f3152SAlex Elder 	smp_mb();
1669e93f3152SAlex Elder }
1670e93f3152SAlex Elder 
16719849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16729849e986SAlex Elder {
16739849e986SAlex Elder 	smp_mb();
16749849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16759849e986SAlex Elder }
16769849e986SAlex Elder 
1677d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1678d0b2e944SAlex Elder {
1679d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1680d0b2e944SAlex Elder 	smp_mb();
1681d0b2e944SAlex Elder }
1682d0b2e944SAlex Elder 
1683a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1684a2acd00eSAlex Elder {
1685a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1686a2acd00eSAlex Elder 	smp_mb();
1687a2acd00eSAlex Elder }
1688a2acd00eSAlex Elder 
1689d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1690d0b2e944SAlex Elder {
1691d0b2e944SAlex Elder 	smp_mb();
1692d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1693d0b2e944SAlex Elder }
1694d0b2e944SAlex Elder 
16953b434a2aSJosh Durgin static enum obj_operation_type
16963b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
16973b434a2aSJosh Durgin {
16983b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
16993b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17003b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17013b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17023b434a2aSJosh Durgin 	else
17033b434a2aSJosh Durgin 		return OBJ_OP_READ;
17043b434a2aSJosh Durgin }
17053b434a2aSJosh Durgin 
17066e2a4505SAlex Elder static void
17076e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17086e2a4505SAlex Elder {
1709b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1710b9434c5bSAlex Elder 	u64 length = obj_request->length;
1711b9434c5bSAlex Elder 
17126e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17136e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1714b9434c5bSAlex Elder 		xferred, length);
17156e2a4505SAlex Elder 	/*
171617c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
171717c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
171817c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
171917c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
172017c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
172117c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17226e2a4505SAlex Elder 	 */
1723b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17246e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1725b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17266e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1727b9434c5bSAlex Elder 		else
1728b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17296e2a4505SAlex Elder 		obj_request->result = 0;
1730b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1731b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1732b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1733b9434c5bSAlex Elder 		else
1734b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17356e2a4505SAlex Elder 	}
173617c1cc1dSJosh Durgin 	obj_request->xferred = length;
17376e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17386e2a4505SAlex Elder }
17396e2a4505SAlex Elder 
1740bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1741bf0d5f50SAlex Elder {
174237206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
174337206ee5SAlex Elder 		obj_request->callback);
1744bf0d5f50SAlex Elder 	if (obj_request->callback)
1745bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1746788e2df3SAlex Elder 	else
1747788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1748bf0d5f50SAlex Elder }
1749bf0d5f50SAlex Elder 
/*
 * Callback for osd ops that need no post-processing: the object
 * request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
175539bf2c5dSAlex Elder 
1756c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1757bf0d5f50SAlex Elder {
175857acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1759a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
176057acbaa7SAlex Elder 	bool layered = false;
176157acbaa7SAlex Elder 
176257acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
176357acbaa7SAlex Elder 		img_request = obj_request->img_request;
176457acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1765a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
176657acbaa7SAlex Elder 	}
17678b3e1a56SAlex Elder 
17688b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17698b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17708b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1771a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1772a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17738b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17748b3e1a56SAlex Elder 	else if (img_request)
17756e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17766e2a4505SAlex Elder 	else
177707741308SAlex Elder 		obj_request_done_set(obj_request);
1778bf0d5f50SAlex Elder }
1779bf0d5f50SAlex Elder 
1780c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1781bf0d5f50SAlex Elder {
17821b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
17831b83bef2SSage Weil 		obj_request->result, obj_request->length);
17841b83bef2SSage Weil 	/*
17858b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
17868b3e1a56SAlex Elder 	 * it to our originally-requested length.
17871b83bef2SSage Weil 	 */
17881b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
178907741308SAlex Elder 	obj_request_done_set(obj_request);
1790bf0d5f50SAlex Elder }
1791bf0d5f50SAlex Elder 
179290e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
179390e98c52SGuangliang Zhao {
179490e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
179590e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
179690e98c52SGuangliang Zhao 	/*
179790e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
179890e98c52SGuangliang Zhao 	 * it to our originally-requested length.
179990e98c52SGuangliang Zhao 	 */
180090e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1801d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1802d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1803d0265de7SJosh Durgin 		obj_request->result = 0;
180490e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
180590e98c52SGuangliang Zhao }
180690e98c52SGuangliang Zhao 
/*
 * Handle completion of an osd stat.  A plain stat call needs no
 * post-processing here, so the request is only marked done.  More
 * is done when the stat is part of a write sequence for a layered
 * image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1816fbfab539SAlex Elder 
1817bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1818bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1819bf0d5f50SAlex Elder {
1820bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1821bf0d5f50SAlex Elder 	u16 opcode;
1822bf0d5f50SAlex Elder 
182337206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1824bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
182557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
182657acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
182757acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
182857acbaa7SAlex Elder 	} else {
182957acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
183057acbaa7SAlex Elder 	}
1831bf0d5f50SAlex Elder 
18321b83bef2SSage Weil 	if (osd_req->r_result < 0)
18331b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1834bf0d5f50SAlex Elder 
18357cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1836bf0d5f50SAlex Elder 
1837c47f9371SAlex Elder 	/*
1838c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18397ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18407ad18afaSChristoph Hellwig 	 * length field.
1841c47f9371SAlex Elder 	 */
18421b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1843c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18440ccd5926SIlya Dryomov 
184579528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1846bf0d5f50SAlex Elder 	switch (opcode) {
1847bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1848c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1849bf0d5f50SAlex Elder 		break;
18500ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
18510ccd5926SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
18520ccd5926SIlya Dryomov 		/* fall through */
1853bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1854c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1855bf0d5f50SAlex Elder 		break;
1856fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1857c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1858fbfab539SAlex Elder 		break;
185990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
186090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
186190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
186290e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
186390e98c52SGuangliang Zhao 		break;
186436be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1865b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
18669969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1867c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
18689969ebc5SAlex Elder 		break;
1869bf0d5f50SAlex Elder 	default:
18709584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1871bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1872bf0d5f50SAlex Elder 		break;
1873bf0d5f50SAlex Elder 	}
1874bf0d5f50SAlex Elder 
187507741308SAlex Elder 	if (obj_request_done_test(obj_request))
1876bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1877bf0d5f50SAlex Elder }
1878bf0d5f50SAlex Elder 
18799d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1880430c28c3SAlex Elder {
1881430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
18828c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
18839d4df01fSAlex Elder 	u64 snap_id;
1884430c28c3SAlex Elder 
18858c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1886430c28c3SAlex Elder 
18879d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
18888c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
18899d4df01fSAlex Elder 			NULL, snap_id, NULL);
18909d4df01fSAlex Elder }
18919d4df01fSAlex Elder 
18929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
18939d4df01fSAlex Elder {
18949d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
18959d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
18969d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
18979d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
18989d4df01fSAlex Elder 
18999d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
19009d4df01fSAlex Elder 
19019d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
19029d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
19039d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1904430c28c3SAlex Elder }
1905430c28c3SAlex Elder 
19060ccd5926SIlya Dryomov /*
19070ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19080ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19090ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19100ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19110ccd5926SIlya Dryomov  */
1912bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1913bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19146d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1915deb236b3SIlya Dryomov 					unsigned int num_ops,
1916430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1917bf0d5f50SAlex Elder {
1918bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1919bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1920bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1921bf0d5f50SAlex Elder 
192290e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
192390e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19246365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
192590e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19266d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
192790e98c52SGuangliang Zhao 		} else {
192890e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
192990e98c52SGuangliang Zhao 		}
1930bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1931bf0d5f50SAlex Elder 	}
1932bf0d5f50SAlex Elder 
19336d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1934deb236b3SIlya Dryomov 
1935deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1936bf0d5f50SAlex Elder 
1937bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1938deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1939deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1940bf0d5f50SAlex Elder 	if (!osd_req)
1941bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1942bf0d5f50SAlex Elder 
194390e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1944bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1945430c28c3SAlex Elder 	else
1946bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1947bf0d5f50SAlex Elder 
1948bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1949bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1950bf0d5f50SAlex Elder 
19513c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
19523c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1953bf0d5f50SAlex Elder 
1954bf0d5f50SAlex Elder 	return osd_req;
1955bf0d5f50SAlex Elder }
1956bf0d5f50SAlex Elder 
19570eefd470SAlex Elder /*
1958d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1959d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1960d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1961d3246fb0SJosh Durgin  * or zero op.
19620eefd470SAlex Elder  */
19630eefd470SAlex Elder static struct ceph_osd_request *
19640eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
19650eefd470SAlex Elder {
19660eefd470SAlex Elder 	struct rbd_img_request *img_request;
19670eefd470SAlex Elder 	struct ceph_snap_context *snapc;
19680eefd470SAlex Elder 	struct rbd_device *rbd_dev;
19690eefd470SAlex Elder 	struct ceph_osd_client *osdc;
19700eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
1971d3246fb0SJosh Durgin 	int num_osd_ops = 3;
19720eefd470SAlex Elder 
19730eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19740eefd470SAlex Elder 	img_request = obj_request->img_request;
19750eefd470SAlex Elder 	rbd_assert(img_request);
1976d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
1977d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
19780eefd470SAlex Elder 
1979d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
1980d3246fb0SJosh Durgin 		num_osd_ops = 2;
1981d3246fb0SJosh Durgin 
1982d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
19830eefd470SAlex Elder 
19840eefd470SAlex Elder 	snapc = img_request->snapc;
19850eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
19860eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1987d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
1988d3246fb0SJosh Durgin 						false, GFP_ATOMIC);
19890eefd470SAlex Elder 	if (!osd_req)
19900eefd470SAlex Elder 		return NULL;	/* ENOMEM */
19910eefd470SAlex Elder 
19920eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
19930eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
19940eefd470SAlex Elder 	osd_req->r_priv = obj_request;
19950eefd470SAlex Elder 
19963c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
19973c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
19980eefd470SAlex Elder 
19990eefd470SAlex Elder 	return osd_req;
20000eefd470SAlex Elder }
20010eefd470SAlex Elder 
20020eefd470SAlex Elder 
/* Drop a reference on an osd request created for an object request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2007bf0d5f50SAlex Elder 
2008bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2009bf0d5f50SAlex Elder 
2010bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2011bf0d5f50SAlex Elder 						u64 offset, u64 length,
2012bf0d5f50SAlex Elder 						enum obj_request_type type)
2013bf0d5f50SAlex Elder {
2014bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2015bf0d5f50SAlex Elder 	size_t size;
2016bf0d5f50SAlex Elder 	char *name;
2017bf0d5f50SAlex Elder 
2018bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2019bf0d5f50SAlex Elder 
2020bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
2021f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
2022f907ad55SAlex Elder 	if (!name)
2023bf0d5f50SAlex Elder 		return NULL;
2024bf0d5f50SAlex Elder 
2025868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
2026f907ad55SAlex Elder 	if (!obj_request) {
2027f907ad55SAlex Elder 		kfree(name);
2028f907ad55SAlex Elder 		return NULL;
2029f907ad55SAlex Elder 	}
2030f907ad55SAlex Elder 
2031bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2032bf0d5f50SAlex Elder 	obj_request->offset = offset;
2033bf0d5f50SAlex Elder 	obj_request->length = length;
2034926f9b3fSAlex Elder 	obj_request->flags = 0;
2035bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2036bf0d5f50SAlex Elder 	obj_request->type = type;
2037bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2038788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2039bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2040bf0d5f50SAlex Elder 
204137206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
204237206ee5SAlex Elder 		offset, length, (int)type, obj_request);
204337206ee5SAlex Elder 
2044bf0d5f50SAlex Elder 	return obj_request;
2045bf0d5f50SAlex Elder }
2046bf0d5f50SAlex Elder 
2047bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2048bf0d5f50SAlex Elder {
2049bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2050bf0d5f50SAlex Elder 
2051bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2052bf0d5f50SAlex Elder 
205337206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
205437206ee5SAlex Elder 
2055bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2056bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2057bf0d5f50SAlex Elder 
2058bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2059bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2060bf0d5f50SAlex Elder 
2061bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2062bf0d5f50SAlex Elder 	switch (obj_request->type) {
20639969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20649969ebc5SAlex Elder 		break;		/* Nothing to do */
2065bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2066bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2067bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2068bf0d5f50SAlex Elder 		break;
2069788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
2070788e2df3SAlex Elder 		if (obj_request->pages)
2071788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2072788e2df3SAlex Elder 						obj_request->page_count);
2073788e2df3SAlex Elder 		break;
2074bf0d5f50SAlex Elder 	}
2075bf0d5f50SAlex Elder 
2076f907ad55SAlex Elder 	kfree(obj_request->object_name);
2077868311b1SAlex Elder 	obj_request->object_name = NULL;
2078868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2079bf0d5f50SAlex Elder }
2080bf0d5f50SAlex Elder 
2081fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2082fb65d228SAlex Elder 
2083fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2084fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2085fb65d228SAlex Elder {
2086fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2087fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2088fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2089fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2090fb65d228SAlex Elder }
2091fb65d228SAlex Elder 
2092bf0d5f50SAlex Elder /*
2093a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2094a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2095a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2096a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2097a2acd00eSAlex Elder  */
2098a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2099a2acd00eSAlex Elder {
2100a2acd00eSAlex Elder 	int counter;
2101a2acd00eSAlex Elder 
2102a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2103a2acd00eSAlex Elder 		return;
2104a2acd00eSAlex Elder 
2105a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2106a2acd00eSAlex Elder 	if (counter > 0)
2107a2acd00eSAlex Elder 		return;
2108a2acd00eSAlex Elder 
2109a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2110a2acd00eSAlex Elder 
2111a2acd00eSAlex Elder 	if (!counter)
2112a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2113a2acd00eSAlex Elder 	else
21149584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2115a2acd00eSAlex Elder }
2116a2acd00eSAlex Elder 
2117a2acd00eSAlex Elder /*
2118a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2119a2acd00eSAlex Elder  * parent.
2120a2acd00eSAlex Elder  *
2121a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2122a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2123a2acd00eSAlex Elder  * false otherwise.
2124a2acd00eSAlex Elder  */
2125a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2126a2acd00eSAlex Elder {
2127ae43e9d0SIlya Dryomov 	int counter = 0;
2128a2acd00eSAlex Elder 
2129a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2130a2acd00eSAlex Elder 		return false;
2131a2acd00eSAlex Elder 
2132ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2133ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2134a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2135ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2136a2acd00eSAlex Elder 
2137a2acd00eSAlex Elder 	if (counter < 0)
21389584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2139a2acd00eSAlex Elder 
2140ae43e9d0SIlya Dryomov 	return counter > 0;
2141a2acd00eSAlex Elder }
2142a2acd00eSAlex Elder 
2143bf0d5f50SAlex Elder /*
2144bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2145bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2146bf0d5f50SAlex Elder  * (if there is one).
2147bf0d5f50SAlex Elder  */
2148cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2149cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2150bf0d5f50SAlex Elder 					u64 offset, u64 length,
21516d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21524e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2153bf0d5f50SAlex Elder {
2154bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2155bf0d5f50SAlex Elder 
21567a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2157bf0d5f50SAlex Elder 	if (!img_request)
2158bf0d5f50SAlex Elder 		return NULL;
2159bf0d5f50SAlex Elder 
2160bf0d5f50SAlex Elder 	img_request->rq = NULL;
2161bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2162bf0d5f50SAlex Elder 	img_request->offset = offset;
2163bf0d5f50SAlex Elder 	img_request->length = length;
21640c425248SAlex Elder 	img_request->flags = 0;
216590e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
216690e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
216790e98c52SGuangliang Zhao 		img_request->snapc = snapc;
216890e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21690c425248SAlex Elder 		img_request_write_set(img_request);
21704e752f0aSJosh Durgin 		img_request->snapc = snapc;
21710c425248SAlex Elder 	} else {
2172bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21730c425248SAlex Elder 	}
2174a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2175d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2176bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2177bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2178bf0d5f50SAlex Elder 	img_request->callback = NULL;
2179a5a337d4SAlex Elder 	img_request->result = 0;
2180bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2181bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2182bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2183bf0d5f50SAlex Elder 
218437206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
21856d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
218637206ee5SAlex Elder 
2187bf0d5f50SAlex Elder 	return img_request;
2188bf0d5f50SAlex Elder }
2189bf0d5f50SAlex Elder 
2190bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2191bf0d5f50SAlex Elder {
2192bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2193bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2194bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2195bf0d5f50SAlex Elder 
2196bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2197bf0d5f50SAlex Elder 
219837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
219937206ee5SAlex Elder 
2200bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2201bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
220225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2203bf0d5f50SAlex Elder 
2204a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2205a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2206a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2207a2acd00eSAlex Elder 	}
2208a2acd00eSAlex Elder 
2209bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2210bef95455SJosh Durgin 		img_request_discard_test(img_request))
2211812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2212bf0d5f50SAlex Elder 
22131c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2214bf0d5f50SAlex Elder }
2215bf0d5f50SAlex Elder 
2216e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2217e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2218e93f3152SAlex Elder 					u64 img_offset, u64 length)
2219e93f3152SAlex Elder {
2220e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2221e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2222e93f3152SAlex Elder 
2223e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2224e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2225e93f3152SAlex Elder 
22264e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22276d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2228e93f3152SAlex Elder 	if (!parent_request)
2229e93f3152SAlex Elder 		return NULL;
2230e93f3152SAlex Elder 
2231e93f3152SAlex Elder 	img_request_child_set(parent_request);
2232e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2233e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2234e93f3152SAlex Elder 
2235e93f3152SAlex Elder 	return parent_request;
2236e93f3152SAlex Elder }
2237e93f3152SAlex Elder 
2238e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2239e93f3152SAlex Elder {
2240e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2241e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2242e93f3152SAlex Elder 
2243e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2244e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2245e93f3152SAlex Elder 
2246e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2247e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2248e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2249e93f3152SAlex Elder 
2250e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2251e93f3152SAlex Elder }
2252e93f3152SAlex Elder 
22531217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22541217857fSAlex Elder {
22556365d33aSAlex Elder 	struct rbd_img_request *img_request;
22561217857fSAlex Elder 	unsigned int xferred;
22571217857fSAlex Elder 	int result;
22588b3e1a56SAlex Elder 	bool more;
22591217857fSAlex Elder 
22606365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22616365d33aSAlex Elder 	img_request = obj_request->img_request;
22626365d33aSAlex Elder 
22631217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22641217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22651217857fSAlex Elder 	result = obj_request->result;
22661217857fSAlex Elder 	if (result) {
22671217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22686d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22696d2940c8SGuangliang Zhao 
227090e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
227190e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
227290e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
227390e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
227490e98c52SGuangliang Zhao 		else
227590e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22761217857fSAlex Elder 
22779584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22786d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
22796d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
22809584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
22811217857fSAlex Elder 			result, xferred);
22821217857fSAlex Elder 		if (!img_request->result)
22831217857fSAlex Elder 			img_request->result = result;
2284082a75daSIlya Dryomov 		/*
2285082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2286082a75daSIlya Dryomov 		 * bytes in case of error.
2287082a75daSIlya Dryomov 		 */
2288082a75daSIlya Dryomov 		xferred = obj_request->length;
22891217857fSAlex Elder 	}
22901217857fSAlex Elder 
2291f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2292f1a4739fSAlex Elder 
2293f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2294f1a4739fSAlex Elder 		obj_request->pages = NULL;
2295f1a4739fSAlex Elder 		obj_request->page_count = 0;
2296f1a4739fSAlex Elder 	}
2297f1a4739fSAlex Elder 
22988b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
22998b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23008b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23018b3e1a56SAlex Elder 	} else {
23028b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23037ad18afaSChristoph Hellwig 
23047ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23057ad18afaSChristoph Hellwig 		if (!more)
23067ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23078b3e1a56SAlex Elder 	}
23088b3e1a56SAlex Elder 
23098b3e1a56SAlex Elder 	return more;
23101217857fSAlex Elder }
23111217857fSAlex Elder 
23122169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23132169238dSAlex Elder {
23142169238dSAlex Elder 	struct rbd_img_request *img_request;
23152169238dSAlex Elder 	u32 which = obj_request->which;
23162169238dSAlex Elder 	bool more = true;
23172169238dSAlex Elder 
23186365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23192169238dSAlex Elder 	img_request = obj_request->img_request;
23202169238dSAlex Elder 
23212169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23222169238dSAlex Elder 	rbd_assert(img_request != NULL);
23232169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23242169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23252169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23262169238dSAlex Elder 
23272169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23282169238dSAlex Elder 	if (which != img_request->next_completion)
23292169238dSAlex Elder 		goto out;
23302169238dSAlex Elder 
23312169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23322169238dSAlex Elder 		rbd_assert(more);
23332169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23342169238dSAlex Elder 
23352169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23362169238dSAlex Elder 			break;
23371217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23382169238dSAlex Elder 		which++;
23392169238dSAlex Elder 	}
23402169238dSAlex Elder 
23412169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23422169238dSAlex Elder 	img_request->next_completion = which;
23432169238dSAlex Elder out:
23442169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23450f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23462169238dSAlex Elder 
23472169238dSAlex Elder 	if (!more)
23482169238dSAlex Elder 		rbd_img_request_complete(img_request);
23492169238dSAlex Elder }
23502169238dSAlex Elder 
2351f1a4739fSAlex Elder /*
23523b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23533b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23543b434a2aSJosh Durgin  * osd operations already to the object request.
23553b434a2aSJosh Durgin  */
23563b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23573b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23583b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23593b434a2aSJosh Durgin 				unsigned int num_ops)
23603b434a2aSJosh Durgin {
23613b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23623b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23633b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23643b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23653b434a2aSJosh Durgin 	u64 length = obj_request->length;
23663b434a2aSJosh Durgin 	u64 img_end;
23673b434a2aSJosh Durgin 	u16 opcode;
23683b434a2aSJosh Durgin 
23693b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2370d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2371d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2372d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23733b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23743b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23753b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23763b434a2aSJosh Durgin 		} else {
23773b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23783b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
23793b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
23803b434a2aSJosh Durgin 
23813b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
23823b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
23833b434a2aSJosh Durgin 			else
23843b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
23853b434a2aSJosh Durgin 		}
23863b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
23873b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_WRITE;
23883b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
23893b434a2aSJosh Durgin 					object_size, object_size);
23903b434a2aSJosh Durgin 		num_ops++;
23913b434a2aSJosh Durgin 	} else {
23923b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
23933b434a2aSJosh Durgin 	}
23943b434a2aSJosh Durgin 
23957e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2396144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
23977e868b6eSIlya Dryomov 	else
23987e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
23997e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24007e868b6eSIlya Dryomov 
24013b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24023b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24033b434a2aSJosh Durgin 					obj_request->bio_list, length);
24043b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24053b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24063b434a2aSJosh Durgin 					obj_request->pages, length,
24073b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24083b434a2aSJosh Durgin 
24093b434a2aSJosh Durgin 	/* Discards are also writes */
24103b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24113b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24123b434a2aSJosh Durgin 	else
24133b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24143b434a2aSJosh Durgin }
24153b434a2aSJosh Durgin 
24163b434a2aSJosh Durgin /*
2417f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2418f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2419f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2420f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2421f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2422f1a4739fSAlex Elder  * all data described by the image request.
2423f1a4739fSAlex Elder  */
2424f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2425f1a4739fSAlex Elder 					enum obj_request_type type,
2426f1a4739fSAlex Elder 					void *data_desc)
2427bf0d5f50SAlex Elder {
2428bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2429bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2430bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2431a158073cSJingoo Han 	struct bio *bio_list = NULL;
2432f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2433a158073cSJingoo Han 	struct page **pages = NULL;
24346d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24357da22d29SAlex Elder 	u64 img_offset;
2436bf0d5f50SAlex Elder 	u64 resid;
2437bf0d5f50SAlex Elder 
2438f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2439f1a4739fSAlex Elder 		(int)type, data_desc);
244037206ee5SAlex Elder 
24417da22d29SAlex Elder 	img_offset = img_request->offset;
2442bf0d5f50SAlex Elder 	resid = img_request->length;
24434dda41d3SAlex Elder 	rbd_assert(resid > 0);
24443b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2445f1a4739fSAlex Elder 
2446f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2447f1a4739fSAlex Elder 		bio_list = data_desc;
24484f024f37SKent Overstreet 		rbd_assert(img_offset ==
24494f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
245090e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2451f1a4739fSAlex Elder 		pages = data_desc;
2452f1a4739fSAlex Elder 	}
2453f1a4739fSAlex Elder 
2454bf0d5f50SAlex Elder 	while (resid) {
24552fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2456bf0d5f50SAlex Elder 		const char *object_name;
2457bf0d5f50SAlex Elder 		u64 offset;
2458bf0d5f50SAlex Elder 		u64 length;
2459bf0d5f50SAlex Elder 
24607da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2461bf0d5f50SAlex Elder 		if (!object_name)
2462bf0d5f50SAlex Elder 			goto out_unwind;
24637da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
24647da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2465bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2466f1a4739fSAlex Elder 						offset, length, type);
246778c2a44aSAlex Elder 		/* object request has its own copy of the object name */
246878c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2469bf0d5f50SAlex Elder 		if (!obj_request)
2470bf0d5f50SAlex Elder 			goto out_unwind;
247162054da6SIlya Dryomov 
247203507db6SJosh Durgin 		/*
247303507db6SJosh Durgin 		 * set obj_request->img_request before creating the
247403507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
247503507db6SJosh Durgin 		 */
247603507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2477bf0d5f50SAlex Elder 
2478f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2479f1a4739fSAlex Elder 			unsigned int clone_size;
2480f1a4739fSAlex Elder 
2481bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2482bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2483f1a4739fSAlex Elder 			obj_request->bio_list =
2484f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2485f1a4739fSAlex Elder 								&bio_offset,
2486f1a4739fSAlex Elder 								clone_size,
2487bf0d5f50SAlex Elder 								GFP_ATOMIC);
2488bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
248962054da6SIlya Dryomov 				goto out_unwind;
249090e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2491f1a4739fSAlex Elder 			unsigned int page_count;
2492f1a4739fSAlex Elder 
2493f1a4739fSAlex Elder 			obj_request->pages = pages;
2494f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2495f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2496f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2497f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2498f1a4739fSAlex Elder 			pages += page_count;
2499f1a4739fSAlex Elder 		}
2500bf0d5f50SAlex Elder 
25016d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25026d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25032fa12320SAlex Elder 					obj_request);
25042fa12320SAlex Elder 		if (!osd_req)
250562054da6SIlya Dryomov 			goto out_unwind;
25063b434a2aSJosh Durgin 
25072fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25082169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25097da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2510bf0d5f50SAlex Elder 
25113b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25123b434a2aSJosh Durgin 
25133b434a2aSJosh Durgin 		rbd_img_request_get(img_request);
25143b434a2aSJosh Durgin 
25157da22d29SAlex Elder 		img_offset += length;
2516bf0d5f50SAlex Elder 		resid -= length;
2517bf0d5f50SAlex Elder 	}
2518bf0d5f50SAlex Elder 
2519bf0d5f50SAlex Elder 	return 0;
2520bf0d5f50SAlex Elder 
2521bf0d5f50SAlex Elder out_unwind:
2522bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
252342dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2524bf0d5f50SAlex Elder 
2525bf0d5f50SAlex Elder 	return -ENOMEM;
2526bf0d5f50SAlex Elder }
2527bf0d5f50SAlex Elder 
25283d7efd18SAlex Elder static void
25290eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
25300eefd470SAlex Elder {
25310eefd470SAlex Elder 	struct rbd_img_request *img_request;
25320eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2533ebda6408SAlex Elder 	struct page **pages;
25340eefd470SAlex Elder 	u32 page_count;
25350eefd470SAlex Elder 
2536d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2537d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25380eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25390eefd470SAlex Elder 	img_request = obj_request->img_request;
25400eefd470SAlex Elder 	rbd_assert(img_request);
25410eefd470SAlex Elder 
25420eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25430eefd470SAlex Elder 	rbd_assert(rbd_dev);
25440eefd470SAlex Elder 
2545ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2546ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25470eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2548ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2549ebda6408SAlex Elder 	rbd_assert(page_count);
2550ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2551ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25520eefd470SAlex Elder 
25530eefd470SAlex Elder 	/*
25540eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25550eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25560eefd470SAlex Elder 	 * successful short write, so if the request was successful
25570eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25580eefd470SAlex Elder 	 */
25590eefd470SAlex Elder 	if (!obj_request->result)
25600eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25610eefd470SAlex Elder 
25620eefd470SAlex Elder 	/* Finish up with the normal image object callback */
25630eefd470SAlex Elder 
25640eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
25650eefd470SAlex Elder }
25660eefd470SAlex Elder 
25670eefd470SAlex Elder static void
25683d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25693d7efd18SAlex Elder {
25703d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25710eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25720eefd470SAlex Elder 	struct ceph_osd_client *osdc;
25730eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25743d7efd18SAlex Elder 	struct page **pages;
2575d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2576ebda6408SAlex Elder 	u32 page_count;
2577bbea1c1aSAlex Elder 	int img_result;
2578ebda6408SAlex Elder 	u64 parent_length;
25793d7efd18SAlex Elder 
25803d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25813d7efd18SAlex Elder 
25823d7efd18SAlex Elder 	/* First get what we need from the image request */
25833d7efd18SAlex Elder 
25843d7efd18SAlex Elder 	pages = img_request->copyup_pages;
25853d7efd18SAlex Elder 	rbd_assert(pages != NULL);
25863d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2587ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2588ebda6408SAlex Elder 	rbd_assert(page_count);
2589ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
25903d7efd18SAlex Elder 
25913d7efd18SAlex Elder 	orig_request = img_request->obj_request;
25923d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2593b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2594bbea1c1aSAlex Elder 	img_result = img_request->result;
2595ebda6408SAlex Elder 	parent_length = img_request->length;
2596ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
25973d7efd18SAlex Elder 	rbd_img_request_put(img_request);
25983d7efd18SAlex Elder 
259991c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
260091c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26013d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26023d7efd18SAlex Elder 
2603bbea1c1aSAlex Elder 	/*
2604bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2605bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2606bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2607bbea1c1aSAlex Elder 	 */
2608bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2609bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2610bbea1c1aSAlex Elder 
2611bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2612bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2613bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2614bbea1c1aSAlex Elder 		if (!img_result)
2615bbea1c1aSAlex Elder 			return;
2616bbea1c1aSAlex Elder 	}
2617bbea1c1aSAlex Elder 
2618bbea1c1aSAlex Elder 	if (img_result)
26190eefd470SAlex Elder 		goto out_err;
26203d7efd18SAlex Elder 
26218785b1d4SAlex Elder 	/*
26228785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26230ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26248785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26258785b1d4SAlex Elder 	 * original request, and release the old one.
26268785b1d4SAlex Elder 	 */
2627bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26280eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26290eefd470SAlex Elder 	if (!osd_req)
26300eefd470SAlex Elder 		goto out_err;
26318785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26320eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26330eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2634ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26353d7efd18SAlex Elder 
26360eefd470SAlex Elder 	/* Initialize the copyup op */
26370eefd470SAlex Elder 
26380eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2639ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26400eefd470SAlex Elder 						false, false);
26410eefd470SAlex Elder 
2642d3246fb0SJosh Durgin 	/* Add the other op(s) */
26430ccd5926SIlya Dryomov 
2644d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2645d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26460eefd470SAlex Elder 
26470eefd470SAlex Elder 	/* All set, send it off. */
26480eefd470SAlex Elder 
26490eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
26500eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2651bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2652bbea1c1aSAlex Elder 	if (!img_result)
26530eefd470SAlex Elder 		return;
26540eefd470SAlex Elder out_err:
26550eefd470SAlex Elder 	/* Record the error code and complete the request */
26560eefd470SAlex Elder 
2657bbea1c1aSAlex Elder 	orig_request->result = img_result;
26580eefd470SAlex Elder 	orig_request->xferred = 0;
26593d7efd18SAlex Elder 	obj_request_done_set(orig_request);
26603d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
26613d7efd18SAlex Elder }
26623d7efd18SAlex Elder 
26633d7efd18SAlex Elder /*
26643d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26653d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26663d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26673d7efd18SAlex Elder  * object request from the image request does not exist.
26683d7efd18SAlex Elder  *
26693d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26703d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26713d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26723d7efd18SAlex Elder  * the original object request for the copyup operation.
26733d7efd18SAlex Elder  *
26743d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
26753d7efd18SAlex Elder  * object request and mark it done so it gets completed.
26763d7efd18SAlex Elder  */
26773d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
26783d7efd18SAlex Elder {
26793d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
26803d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
26813d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
26823d7efd18SAlex Elder 	u64 img_offset;
26833d7efd18SAlex Elder 	u64 length;
26843d7efd18SAlex Elder 	struct page **pages = NULL;
26853d7efd18SAlex Elder 	u32 page_count;
26863d7efd18SAlex Elder 	int result;
26873d7efd18SAlex Elder 
26883d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2689b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
26903d7efd18SAlex Elder 
26913d7efd18SAlex Elder 	img_request = obj_request->img_request;
26923d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
26933d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
26943d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
26953d7efd18SAlex Elder 
26963d7efd18SAlex Elder 	/*
26973d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
26983d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
26993d7efd18SAlex Elder 	 */
27003d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27013d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27023d7efd18SAlex Elder 
27033d7efd18SAlex Elder 	/*
2704a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2705a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2706a9e8ba2cSAlex Elder 	 * necessary.
2707a9e8ba2cSAlex Elder 	 */
2708a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2709a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2710a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2711a9e8ba2cSAlex Elder 	}
2712a9e8ba2cSAlex Elder 
2713a9e8ba2cSAlex Elder 	/*
27143d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27153d7efd18SAlex Elder 	 * from the parent.
27163d7efd18SAlex Elder 	 */
27173d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27183d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27193d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27203d7efd18SAlex Elder 		result = PTR_ERR(pages);
27213d7efd18SAlex Elder 		pages = NULL;
27223d7efd18SAlex Elder 		goto out_err;
27233d7efd18SAlex Elder 	}
27243d7efd18SAlex Elder 
27253d7efd18SAlex Elder 	result = -ENOMEM;
2726e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2727e93f3152SAlex Elder 						img_offset, length);
27283d7efd18SAlex Elder 	if (!parent_request)
27293d7efd18SAlex Elder 		goto out_err;
27303d7efd18SAlex Elder 
27313d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27323d7efd18SAlex Elder 	if (result)
27333d7efd18SAlex Elder 		goto out_err;
27343d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2735ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27363d7efd18SAlex Elder 
27373d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
27383d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27393d7efd18SAlex Elder 	if (!result)
27403d7efd18SAlex Elder 		return 0;
27413d7efd18SAlex Elder 
27423d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2743ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27443d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27453d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27463d7efd18SAlex Elder out_err:
27473d7efd18SAlex Elder 	if (pages)
27483d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27493d7efd18SAlex Elder 	if (parent_request)
27503d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27513d7efd18SAlex Elder 	obj_request->result = result;
27523d7efd18SAlex Elder 	obj_request->xferred = 0;
27533d7efd18SAlex Elder 	obj_request_done_set(obj_request);
27543d7efd18SAlex Elder 
27553d7efd18SAlex Elder 	return result;
27563d7efd18SAlex Elder }
27573d7efd18SAlex Elder 
2758c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2759c5b5ef6cSAlex Elder {
2760c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2761638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2762c5b5ef6cSAlex Elder 	int result;
2763c5b5ef6cSAlex Elder 
2764c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2765c5b5ef6cSAlex Elder 
2766c5b5ef6cSAlex Elder 	/*
2767c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2768c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2769c5b5ef6cSAlex Elder 	 * we're done with the request.
2770c5b5ef6cSAlex Elder 	 */
2771c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2772c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2773912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2774c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2775c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2776c5b5ef6cSAlex Elder 
2777c5b5ef6cSAlex Elder 	result = obj_request->result;
2778c5b5ef6cSAlex Elder 	obj_request->result = 0;
2779c5b5ef6cSAlex Elder 
2780c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2781c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2782c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2783c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2784c5b5ef6cSAlex Elder 
2785638f5abeSAlex Elder 	/*
2786638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2787638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2788638f5abeSAlex Elder 	 * and re-submit the original write request.
2789638f5abeSAlex Elder 	 */
2790638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2791638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2792638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2793638f5abeSAlex Elder 
2794638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2795638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2796638f5abeSAlex Elder 		if (!result)
2797638f5abeSAlex Elder 			return;
2798638f5abeSAlex Elder 	}
2799c5b5ef6cSAlex Elder 
2800c5b5ef6cSAlex Elder 	/*
2801c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2802c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2803c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2804c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2805c5b5ef6cSAlex Elder 	 */
2806c5b5ef6cSAlex Elder 	if (!result) {
2807c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2808c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2809c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2810c5b5ef6cSAlex Elder 	} else if (result) {
2811c5b5ef6cSAlex Elder 		orig_request->result = result;
28123d7efd18SAlex Elder 		goto out;
2813c5b5ef6cSAlex Elder 	}
2814c5b5ef6cSAlex Elder 
2815c5b5ef6cSAlex Elder 	/*
2816c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2817c5b5ef6cSAlex Elder 	 * whether the target object exists.
2818c5b5ef6cSAlex Elder 	 */
2819b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
28203d7efd18SAlex Elder out:
2821c5b5ef6cSAlex Elder 	if (orig_request->result)
2822c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2823c5b5ef6cSAlex Elder }
2824c5b5ef6cSAlex Elder 
2825c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2826c5b5ef6cSAlex Elder {
2827c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2828c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2829c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2830c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2831c5b5ef6cSAlex Elder 	u32 page_count;
2832c5b5ef6cSAlex Elder 	size_t size;
2833c5b5ef6cSAlex Elder 	int ret;
2834c5b5ef6cSAlex Elder 
2835c5b5ef6cSAlex Elder 	/*
2836c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2837c5b5ef6cSAlex Elder 	 *     le64 length;
2838c5b5ef6cSAlex Elder 	 *     struct {
2839c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2840c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2841c5b5ef6cSAlex Elder 	 *     } mtime;
2842c5b5ef6cSAlex Elder 	 */
2843c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2844c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2845c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2846c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2847c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2848c5b5ef6cSAlex Elder 
2849c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2850c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2851c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2852c5b5ef6cSAlex Elder 	if (!stat_request)
2853c5b5ef6cSAlex Elder 		goto out;
2854c5b5ef6cSAlex Elder 
2855c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2856c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2857c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2858c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2859c5b5ef6cSAlex Elder 
2860c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2861c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
28626d2940c8SGuangliang Zhao 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2863c5b5ef6cSAlex Elder 						   stat_request);
2864c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2865c5b5ef6cSAlex Elder 		goto out;
2866c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2867c5b5ef6cSAlex Elder 
2868144cba14SYan, Zheng 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2869c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2870c5b5ef6cSAlex Elder 					false, false);
28719d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2872c5b5ef6cSAlex Elder 
2873c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2874c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2875c5b5ef6cSAlex Elder out:
2876c5b5ef6cSAlex Elder 	if (ret)
2877c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2878c5b5ef6cSAlex Elder 
2879c5b5ef6cSAlex Elder 	return ret;
2880c5b5ef6cSAlex Elder }
2881c5b5ef6cSAlex Elder 
288270d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2883b454e36dSAlex Elder {
2884b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2885a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2886b454e36dSAlex Elder 
2887b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2888b454e36dSAlex Elder 
2889b454e36dSAlex Elder 	img_request = obj_request->img_request;
2890b454e36dSAlex Elder 	rbd_assert(img_request);
2891a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2892b454e36dSAlex Elder 
289370d045f6SIlya Dryomov 	/* Reads */
28941c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
28951c220881SJosh Durgin 	    !img_request_discard_test(img_request))
289670d045f6SIlya Dryomov 		return true;
2897b454e36dSAlex Elder 
289870d045f6SIlya Dryomov 	/* Non-layered writes */
289970d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
290070d045f6SIlya Dryomov 		return true;
290170d045f6SIlya Dryomov 
290270d045f6SIlya Dryomov 	/*
290370d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
290470d045f6SIlya Dryomov 	 * share any data with the parent.
290570d045f6SIlya Dryomov 	 */
290670d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
290770d045f6SIlya Dryomov 		return true;
290870d045f6SIlya Dryomov 
290970d045f6SIlya Dryomov 	/*
2910c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2911c622d226SGuangliang Zhao 	 * parent data there is anyway.
2912c622d226SGuangliang Zhao 	 */
2913c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2914c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2915c622d226SGuangliang Zhao 		return true;
2916c622d226SGuangliang Zhao 
2917c622d226SGuangliang Zhao 	/*
291870d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
291970d045f6SIlya Dryomov 	 * already been copied.
292070d045f6SIlya Dryomov 	 */
292170d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
292270d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
292370d045f6SIlya Dryomov 		return true;
292470d045f6SIlya Dryomov 
292570d045f6SIlya Dryomov 	return false;
292670d045f6SIlya Dryomov }
292770d045f6SIlya Dryomov 
292870d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
292970d045f6SIlya Dryomov {
293070d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2931b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2932b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2933b454e36dSAlex Elder 
2934b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2935b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2936b454e36dSAlex Elder 
2937b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2938b454e36dSAlex Elder 	}
2939b454e36dSAlex Elder 
2940b454e36dSAlex Elder 	/*
29413d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29423d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29433d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29443d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2945b454e36dSAlex Elder 	 */
294670d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29473d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29483d7efd18SAlex Elder 
29493d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2950b454e36dSAlex Elder 
2951b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2952b454e36dSAlex Elder }
2953b454e36dSAlex Elder 
2954bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2955bf0d5f50SAlex Elder {
2956bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
295746faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2958bf0d5f50SAlex Elder 
295937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
296046faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2961bf0d5f50SAlex Elder 		int ret;
2962bf0d5f50SAlex Elder 
2963b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2964bf0d5f50SAlex Elder 		if (ret)
2965bf0d5f50SAlex Elder 			return ret;
2966bf0d5f50SAlex Elder 	}
2967bf0d5f50SAlex Elder 
2968bf0d5f50SAlex Elder 	return 0;
2969bf0d5f50SAlex Elder }
2970bf0d5f50SAlex Elder 
29718b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29728b3e1a56SAlex Elder {
29738b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2974a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2975a9e8ba2cSAlex Elder 	u64 obj_end;
297602c74fbaSAlex Elder 	u64 img_xferred;
297702c74fbaSAlex Elder 	int img_result;
29788b3e1a56SAlex Elder 
29798b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
29808b3e1a56SAlex Elder 
298102c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
298202c74fbaSAlex Elder 
29838b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
298402c74fbaSAlex Elder 	img_xferred = img_request->xferred;
298502c74fbaSAlex Elder 	img_result = img_request->result;
298602c74fbaSAlex Elder 	rbd_img_request_put(img_request);
298702c74fbaSAlex Elder 
298802c74fbaSAlex Elder 	/*
298902c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
299002c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
299102c74fbaSAlex Elder 	 * original request.
299202c74fbaSAlex Elder 	 */
2993a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2994a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
299502c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
299602c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
299702c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
29988b3e1a56SAlex Elder 
299902c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
300002c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
300102c74fbaSAlex Elder 		if (!img_result)
300202c74fbaSAlex Elder 			return;
300302c74fbaSAlex Elder 	}
300402c74fbaSAlex Elder 
300502c74fbaSAlex Elder 	obj_request->result = img_result;
3006a9e8ba2cSAlex Elder 	if (obj_request->result)
3007a9e8ba2cSAlex Elder 		goto out;
3008a9e8ba2cSAlex Elder 
3009a9e8ba2cSAlex Elder 	/*
3010a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3011a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3012a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3013a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3014a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3015a9e8ba2cSAlex Elder 	 */
3016a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3017a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3018a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3019a9e8ba2cSAlex Elder 		u64 xferred = 0;
3020a9e8ba2cSAlex Elder 
3021a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3022a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3023a9e8ba2cSAlex Elder 					obj_request->img_offset;
3024a9e8ba2cSAlex Elder 
302502c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3026a9e8ba2cSAlex Elder 	} else {
302702c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3028a9e8ba2cSAlex Elder 	}
3029a9e8ba2cSAlex Elder out:
30308b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30318b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30328b3e1a56SAlex Elder }
30338b3e1a56SAlex Elder 
30348b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30358b3e1a56SAlex Elder {
30368b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30378b3e1a56SAlex Elder 	int result;
30388b3e1a56SAlex Elder 
30398b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30408b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30418b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30425b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30438b3e1a56SAlex Elder 
30448b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3045e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30468b3e1a56SAlex Elder 						obj_request->img_offset,
3047e93f3152SAlex Elder 						obj_request->length);
30488b3e1a56SAlex Elder 	result = -ENOMEM;
30498b3e1a56SAlex Elder 	if (!img_request)
30508b3e1a56SAlex Elder 		goto out_err;
30518b3e1a56SAlex Elder 
30525b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3053f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3054f1a4739fSAlex Elder 						obj_request->bio_list);
30555b2ab72dSAlex Elder 	else
30565b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30575b2ab72dSAlex Elder 						obj_request->pages);
30588b3e1a56SAlex Elder 	if (result)
30598b3e1a56SAlex Elder 		goto out_err;
30608b3e1a56SAlex Elder 
30618b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30628b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30638b3e1a56SAlex Elder 	if (result)
30648b3e1a56SAlex Elder 		goto out_err;
30658b3e1a56SAlex Elder 
30668b3e1a56SAlex Elder 	return;
30678b3e1a56SAlex Elder out_err:
30688b3e1a56SAlex Elder 	if (img_request)
30698b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30708b3e1a56SAlex Elder 	obj_request->result = result;
30718b3e1a56SAlex Elder 	obj_request->xferred = 0;
30728b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30738b3e1a56SAlex Elder }
30748b3e1a56SAlex Elder 
307520e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
3076b8d70035SAlex Elder {
3077b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
30782169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3079b8d70035SAlex Elder 	int ret;
3080b8d70035SAlex Elder 
3081b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3082b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
3083b8d70035SAlex Elder 	if (!obj_request)
3084b8d70035SAlex Elder 		return -ENOMEM;
3085b8d70035SAlex Elder 
3086b8d70035SAlex Elder 	ret = -ENOMEM;
30876d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3088deb236b3SIlya Dryomov 						  obj_request);
3089b8d70035SAlex Elder 	if (!obj_request->osd_req)
3090b8d70035SAlex Elder 		goto out;
3091b8d70035SAlex Elder 
3092c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
3093cc4a38bdSAlex Elder 					notify_id, 0, 0);
30949d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3095430c28c3SAlex Elder 
3096b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3097cf81b60eSAlex Elder 	if (ret)
309820e0af67SJosh Durgin 		goto out;
309920e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
310020e0af67SJosh Durgin out:
3101b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
3102b8d70035SAlex Elder 
3103b8d70035SAlex Elder 	return ret;
3104b8d70035SAlex Elder }
3105b8d70035SAlex Elder 
3106b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
3107b8d70035SAlex Elder {
3108b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
3109e627db08SAlex Elder 	int ret;
3110b8d70035SAlex Elder 
3111b8d70035SAlex Elder 	if (!rbd_dev)
3112b8d70035SAlex Elder 		return;
3113b8d70035SAlex Elder 
311437206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
3115b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
3116b8d70035SAlex Elder 		(unsigned int)opcode);
311752bb1f9bSIlya Dryomov 
311852bb1f9bSIlya Dryomov 	/*
311952bb1f9bSIlya Dryomov 	 * Until adequate refresh error handling is in place, there is
312052bb1f9bSIlya Dryomov 	 * not much we can do here, except warn.
312152bb1f9bSIlya Dryomov 	 *
312252bb1f9bSIlya Dryomov 	 * See http://tracker.ceph.com/issues/5040
312352bb1f9bSIlya Dryomov 	 */
3124e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3125e627db08SAlex Elder 	if (ret)
31269584d508SIlya Dryomov 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
3127b8d70035SAlex Elder 
312852bb1f9bSIlya Dryomov 	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
312952bb1f9bSIlya Dryomov 	if (ret)
31309584d508SIlya Dryomov 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
3131b8d70035SAlex Elder }
3132b8d70035SAlex Elder 
31339969ebc5SAlex Elder /*
3134bb040aa0SIlya Dryomov  * Send a (un)watch request and wait for the ack.  Return a request
3135bb040aa0SIlya Dryomov  * with a ref held on success or error.
3136bb040aa0SIlya Dryomov  */
3137bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper(
3138bb040aa0SIlya Dryomov 						struct rbd_device *rbd_dev,
3139bb040aa0SIlya Dryomov 						bool watch)
3140bb040aa0SIlya Dryomov {
3141bb040aa0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
31422894e1d7SIlya Dryomov 	struct ceph_options *opts = osdc->client->options;
3143bb040aa0SIlya Dryomov 	struct rbd_obj_request *obj_request;
3144bb040aa0SIlya Dryomov 	int ret;
3145bb040aa0SIlya Dryomov 
3146bb040aa0SIlya Dryomov 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3147bb040aa0SIlya Dryomov 					     OBJ_REQUEST_NODATA);
3148bb040aa0SIlya Dryomov 	if (!obj_request)
3149bb040aa0SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3150bb040aa0SIlya Dryomov 
31516d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
3152bb040aa0SIlya Dryomov 						  obj_request);
3153bb040aa0SIlya Dryomov 	if (!obj_request->osd_req) {
3154bb040aa0SIlya Dryomov 		ret = -ENOMEM;
3155bb040aa0SIlya Dryomov 		goto out;
3156bb040aa0SIlya Dryomov 	}
3157bb040aa0SIlya Dryomov 
3158bb040aa0SIlya Dryomov 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
3159bb040aa0SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, watch);
3160bb040aa0SIlya Dryomov 	rbd_osd_req_format_write(obj_request);
3161bb040aa0SIlya Dryomov 
3162bb040aa0SIlya Dryomov 	if (watch)
3163bb040aa0SIlya Dryomov 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
3164bb040aa0SIlya Dryomov 
3165bb040aa0SIlya Dryomov 	ret = rbd_obj_request_submit(osdc, obj_request);
3166bb040aa0SIlya Dryomov 	if (ret)
3167bb040aa0SIlya Dryomov 		goto out;
3168bb040aa0SIlya Dryomov 
31692894e1d7SIlya Dryomov 	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
3170bb040aa0SIlya Dryomov 	if (ret)
3171bb040aa0SIlya Dryomov 		goto out;
3172bb040aa0SIlya Dryomov 
3173bb040aa0SIlya Dryomov 	ret = obj_request->result;
3174bb040aa0SIlya Dryomov 	if (ret) {
3175bb040aa0SIlya Dryomov 		if (watch)
3176bb040aa0SIlya Dryomov 			rbd_obj_request_end(obj_request);
3177bb040aa0SIlya Dryomov 		goto out;
3178bb040aa0SIlya Dryomov 	}
3179bb040aa0SIlya Dryomov 
3180bb040aa0SIlya Dryomov 	return obj_request;
3181bb040aa0SIlya Dryomov 
3182bb040aa0SIlya Dryomov out:
3183bb040aa0SIlya Dryomov 	rbd_obj_request_put(obj_request);
3184bb040aa0SIlya Dryomov 	return ERR_PTR(ret);
3185bb040aa0SIlya Dryomov }
3186bb040aa0SIlya Dryomov 
3187bb040aa0SIlya Dryomov /*
3188b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
31899969ebc5SAlex Elder  */
3190b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
31919969ebc5SAlex Elder {
31929969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
31939969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
31949969ebc5SAlex Elder 	int ret;
31959969ebc5SAlex Elder 
3196b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_event);
3197b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_request);
31989969ebc5SAlex Elder 
31993c663bbdSAlex Elder 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
32009969ebc5SAlex Elder 				     &rbd_dev->watch_event);
32019969ebc5SAlex Elder 	if (ret < 0)
32029969ebc5SAlex Elder 		return ret;
32039969ebc5SAlex Elder 
320476756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
320576756a51SIlya Dryomov 	if (IS_ERR(obj_request)) {
320676756a51SIlya Dryomov 		ceph_osdc_cancel_event(rbd_dev->watch_event);
320776756a51SIlya Dryomov 		rbd_dev->watch_event = NULL;
320876756a51SIlya Dryomov 		return PTR_ERR(obj_request);
3209b30a01f2SIlya Dryomov 	}
32109969ebc5SAlex Elder 
32118eb87565SAlex Elder 	/*
32128eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
32138eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
32148eb87565SAlex Elder 	 * a pointer to the object request during that time (in
321576756a51SIlya Dryomov 	 * rbd_dev->watch_request), so we'll keep a reference to it.
321676756a51SIlya Dryomov 	 * We'll drop that reference after we've unregistered it in
321776756a51SIlya Dryomov 	 * rbd_dev_header_unwatch_sync().
32188eb87565SAlex Elder 	 */
32198eb87565SAlex Elder 	rbd_dev->watch_request = obj_request;
32208eb87565SAlex Elder 
32218eb87565SAlex Elder 	return 0;
32229969ebc5SAlex Elder }
32239969ebc5SAlex Elder 
3224b30a01f2SIlya Dryomov /*
3225b30a01f2SIlya Dryomov  * Tear down a watch request, synchronously.
3226b30a01f2SIlya Dryomov  */
322776756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3228fca27065SIlya Dryomov {
3229b30a01f2SIlya Dryomov 	struct rbd_obj_request *obj_request;
3230b30a01f2SIlya Dryomov 
3231b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
3232b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_request);
3233b30a01f2SIlya Dryomov 
323476756a51SIlya Dryomov 	rbd_obj_request_end(rbd_dev->watch_request);
3235b30a01f2SIlya Dryomov 	rbd_obj_request_put(rbd_dev->watch_request);
3236b30a01f2SIlya Dryomov 	rbd_dev->watch_request = NULL;
3237b30a01f2SIlya Dryomov 
323876756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
323976756a51SIlya Dryomov 	if (!IS_ERR(obj_request))
3240b30a01f2SIlya Dryomov 		rbd_obj_request_put(obj_request);
324176756a51SIlya Dryomov 	else
324276756a51SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
324376756a51SIlya Dryomov 			 PTR_ERR(obj_request));
324476756a51SIlya Dryomov 
3245b30a01f2SIlya Dryomov 	ceph_osdc_cancel_event(rbd_dev->watch_event);
3246b30a01f2SIlya Dryomov 	rbd_dev->watch_event = NULL;
3247fca27065SIlya Dryomov }
3248fca27065SIlya Dryomov 
324936be9a76SAlex Elder /*
3250f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3251f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
325236be9a76SAlex Elder  */
325336be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
325436be9a76SAlex Elder 			     const char *object_name,
325536be9a76SAlex Elder 			     const char *class_name,
325636be9a76SAlex Elder 			     const char *method_name,
32574157976bSAlex Elder 			     const void *outbound,
325836be9a76SAlex Elder 			     size_t outbound_size,
32594157976bSAlex Elder 			     void *inbound,
3260e2a58ee5SAlex Elder 			     size_t inbound_size)
326136be9a76SAlex Elder {
32622169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
326336be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
326436be9a76SAlex Elder 	struct page **pages;
326536be9a76SAlex Elder 	u32 page_count;
326636be9a76SAlex Elder 	int ret;
326736be9a76SAlex Elder 
326836be9a76SAlex Elder 	/*
32696010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
32706010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
32716010a451SAlex Elder 	 * also supply outbound data--parameters for the object
32726010a451SAlex Elder 	 * method.  Currently if this is present it will be a
32736010a451SAlex Elder 	 * snapshot id.
327436be9a76SAlex Elder 	 */
327536be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
327636be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
327736be9a76SAlex Elder 	if (IS_ERR(pages))
327836be9a76SAlex Elder 		return PTR_ERR(pages);
327936be9a76SAlex Elder 
328036be9a76SAlex Elder 	ret = -ENOMEM;
32816010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
328236be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
328336be9a76SAlex Elder 	if (!obj_request)
328436be9a76SAlex Elder 		goto out;
328536be9a76SAlex Elder 
328636be9a76SAlex Elder 	obj_request->pages = pages;
328736be9a76SAlex Elder 	obj_request->page_count = page_count;
328836be9a76SAlex Elder 
32896d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3290deb236b3SIlya Dryomov 						  obj_request);
329136be9a76SAlex Elder 	if (!obj_request->osd_req)
329236be9a76SAlex Elder 		goto out;
329336be9a76SAlex Elder 
3294c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
329504017e29SAlex Elder 					class_name, method_name);
329604017e29SAlex Elder 	if (outbound_size) {
329704017e29SAlex Elder 		struct ceph_pagelist *pagelist;
329804017e29SAlex Elder 
329904017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
330004017e29SAlex Elder 		if (!pagelist)
330104017e29SAlex Elder 			goto out;
330204017e29SAlex Elder 
330304017e29SAlex Elder 		ceph_pagelist_init(pagelist);
330404017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
330504017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
330604017e29SAlex Elder 						pagelist);
330704017e29SAlex Elder 	}
3308a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3309a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
331044cd188dSAlex Elder 					0, false, false);
33119d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3312430c28c3SAlex Elder 
331336be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
331436be9a76SAlex Elder 	if (ret)
331536be9a76SAlex Elder 		goto out;
331636be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
331736be9a76SAlex Elder 	if (ret)
331836be9a76SAlex Elder 		goto out;
331936be9a76SAlex Elder 
332036be9a76SAlex Elder 	ret = obj_request->result;
332136be9a76SAlex Elder 	if (ret < 0)
332236be9a76SAlex Elder 		goto out;
332357385b51SAlex Elder 
332457385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
332557385b51SAlex Elder 	ret = (int)obj_request->xferred;
3326903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
332736be9a76SAlex Elder out:
332836be9a76SAlex Elder 	if (obj_request)
332936be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
333036be9a76SAlex Elder 	else
333136be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
333236be9a76SAlex Elder 
333336be9a76SAlex Elder 	return ret;
333436be9a76SAlex Elder }
333536be9a76SAlex Elder 
33367ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3337bc1ecc65SIlya Dryomov {
33387ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
33397ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3340bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
33414e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3342bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3343bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
33446d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
33454e752f0aSJosh Durgin 	u64 mapping_size;
3346bc1ecc65SIlya Dryomov 	int result;
3347bc1ecc65SIlya Dryomov 
33487ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
33497ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
33507ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
33517ad18afaSChristoph Hellwig 		result = -EIO;
33527ad18afaSChristoph Hellwig 		goto err;
33537ad18afaSChristoph Hellwig 	}
33547ad18afaSChristoph Hellwig 
335590e98c52SGuangliang Zhao 	if (rq->cmd_flags & REQ_DISCARD)
335690e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
335790e98c52SGuangliang Zhao 	else if (rq->cmd_flags & REQ_WRITE)
33586d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
33596d2940c8SGuangliang Zhao 	else
33606d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
33616d2940c8SGuangliang Zhao 
3362bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3363bc1ecc65SIlya Dryomov 
3364bc1ecc65SIlya Dryomov 	if (!length) {
3365bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3366bc1ecc65SIlya Dryomov 		result = 0;
3367bc1ecc65SIlya Dryomov 		goto err_rq;
3368bc1ecc65SIlya Dryomov 	}
3369bc1ecc65SIlya Dryomov 
33706d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
3371bc1ecc65SIlya Dryomov 
33726d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
3373bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
3374bc1ecc65SIlya Dryomov 			result = -EROFS;
3375bc1ecc65SIlya Dryomov 			goto err_rq;
3376bc1ecc65SIlya Dryomov 		}
3377bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3378bc1ecc65SIlya Dryomov 	}
3379bc1ecc65SIlya Dryomov 
3380bc1ecc65SIlya Dryomov 	/*
3381bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3382bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3383bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3384bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3385bc1ecc65SIlya Dryomov 	 */
3386bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3387bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3388bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3389bc1ecc65SIlya Dryomov 		result = -ENXIO;
3390bc1ecc65SIlya Dryomov 		goto err_rq;
3391bc1ecc65SIlya Dryomov 	}
3392bc1ecc65SIlya Dryomov 
3393bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3394bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3395bc1ecc65SIlya Dryomov 			 length);
3396bc1ecc65SIlya Dryomov 		result = -EINVAL;
3397bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3398bc1ecc65SIlya Dryomov 	}
3399bc1ecc65SIlya Dryomov 
34007ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
34017ad18afaSChristoph Hellwig 
34024e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
34034e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
34046d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
34054e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
34064e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
34074e752f0aSJosh Durgin 	}
34084e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
34094e752f0aSJosh Durgin 
34104e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3411bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
34124e752f0aSJosh Durgin 			 length, mapping_size);
3413bc1ecc65SIlya Dryomov 		result = -EIO;
3414bc1ecc65SIlya Dryomov 		goto err_rq;
3415bc1ecc65SIlya Dryomov 	}
3416bc1ecc65SIlya Dryomov 
34176d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
34184e752f0aSJosh Durgin 					     snapc);
3419bc1ecc65SIlya Dryomov 	if (!img_request) {
3420bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3421bc1ecc65SIlya Dryomov 		goto err_rq;
3422bc1ecc65SIlya Dryomov 	}
3423bc1ecc65SIlya Dryomov 	img_request->rq = rq;
3424bc1ecc65SIlya Dryomov 
342590e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
342690e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
342790e98c52SGuangliang Zhao 					      NULL);
342890e98c52SGuangliang Zhao 	else
342990e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
343090e98c52SGuangliang Zhao 					      rq->bio);
3431bc1ecc65SIlya Dryomov 	if (result)
3432bc1ecc65SIlya Dryomov 		goto err_img_request;
3433bc1ecc65SIlya Dryomov 
3434bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
3435bc1ecc65SIlya Dryomov 	if (result)
3436bc1ecc65SIlya Dryomov 		goto err_img_request;
3437bc1ecc65SIlya Dryomov 
3438bc1ecc65SIlya Dryomov 	return;
3439bc1ecc65SIlya Dryomov 
3440bc1ecc65SIlya Dryomov err_img_request:
3441bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3442bc1ecc65SIlya Dryomov err_rq:
3443bc1ecc65SIlya Dryomov 	if (result)
3444bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
34456d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
34464e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
34477ad18afaSChristoph Hellwig err:
34487ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
3449bc1ecc65SIlya Dryomov }
3450bc1ecc65SIlya Dryomov 
34517ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
34527ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3453bc1ecc65SIlya Dryomov {
34547ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
34557ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3456bc1ecc65SIlya Dryomov 
34577ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
34587ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
3459bf0d5f50SAlex Elder }
3460bf0d5f50SAlex Elder 
3461602adf40SYehuda Sadeh /*
3462602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3463602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3464f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3465602adf40SYehuda Sadeh  */
3466602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3467602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3468602adf40SYehuda Sadeh {
3469602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3470e5cfeed2SAlex Elder 	sector_t sector_offset;
3471e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3472e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3473e5cfeed2SAlex Elder 	int ret;
3474602adf40SYehuda Sadeh 
3475e5cfeed2SAlex Elder 	/*
3476e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3477e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3478e5cfeed2SAlex Elder 	 * device.
3479e5cfeed2SAlex Elder 	 */
3480e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3481e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3482e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3483593a9e7bSAlex Elder 
3484e5cfeed2SAlex Elder 	/*
3485e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3486e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3487e5cfeed2SAlex Elder 	 */
3488e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3489e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3490e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3491e5cfeed2SAlex Elder 	else
3492e5cfeed2SAlex Elder 		ret = 0;
3493e5cfeed2SAlex Elder 
3494e5cfeed2SAlex Elder 	/*
3495e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3496e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3497e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3498e5cfeed2SAlex Elder 	 * added to an empty bio."
3499e5cfeed2SAlex Elder 	 */
3500e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3501e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3502e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3503e5cfeed2SAlex Elder 
3504e5cfeed2SAlex Elder 	return ret;
3505602adf40SYehuda Sadeh }
3506602adf40SYehuda Sadeh 
3507602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3508602adf40SYehuda Sadeh {
3509602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3510602adf40SYehuda Sadeh 
3511602adf40SYehuda Sadeh 	if (!disk)
3512602adf40SYehuda Sadeh 		return;
3513602adf40SYehuda Sadeh 
3514a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3515a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3516602adf40SYehuda Sadeh 		del_gendisk(disk);
3517602adf40SYehuda Sadeh 		if (disk->queue)
3518602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
35197ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
3520a0cab924SAlex Elder 	}
3521602adf40SYehuda Sadeh 	put_disk(disk);
3522602adf40SYehuda Sadeh }
3523602adf40SYehuda Sadeh 
3524788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3525788e2df3SAlex Elder 				const char *object_name,
35267097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3527788e2df3SAlex Elder 
3528788e2df3SAlex Elder {
35292169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3530788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3531788e2df3SAlex Elder 	struct page **pages = NULL;
3532788e2df3SAlex Elder 	u32 page_count;
35331ceae7efSAlex Elder 	size_t size;
3534788e2df3SAlex Elder 	int ret;
3535788e2df3SAlex Elder 
3536788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3537788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3538788e2df3SAlex Elder 	if (IS_ERR(pages))
3539a8d42056SJan Kara 		return PTR_ERR(pages);
3540788e2df3SAlex Elder 
3541788e2df3SAlex Elder 	ret = -ENOMEM;
3542788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3543788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3544788e2df3SAlex Elder 	if (!obj_request)
3545788e2df3SAlex Elder 		goto out;
3546788e2df3SAlex Elder 
3547788e2df3SAlex Elder 	obj_request->pages = pages;
3548788e2df3SAlex Elder 	obj_request->page_count = page_count;
3549788e2df3SAlex Elder 
35506d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3551deb236b3SIlya Dryomov 						  obj_request);
3552788e2df3SAlex Elder 	if (!obj_request->osd_req)
3553788e2df3SAlex Elder 		goto out;
3554788e2df3SAlex Elder 
3555c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3556c99d2d4aSAlex Elder 					offset, length, 0, 0);
3557406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3558a4ce40a9SAlex Elder 					obj_request->pages,
355944cd188dSAlex Elder 					obj_request->length,
356044cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
356144cd188dSAlex Elder 					false, false);
35629d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3563430c28c3SAlex Elder 
3564788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3565788e2df3SAlex Elder 	if (ret)
3566788e2df3SAlex Elder 		goto out;
3567788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3568788e2df3SAlex Elder 	if (ret)
3569788e2df3SAlex Elder 		goto out;
3570788e2df3SAlex Elder 
3571788e2df3SAlex Elder 	ret = obj_request->result;
3572788e2df3SAlex Elder 	if (ret < 0)
3573788e2df3SAlex Elder 		goto out;
35741ceae7efSAlex Elder 
35751ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
35761ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3577903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
357823ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
357923ed6e13SAlex Elder 	ret = (int)size;
3580788e2df3SAlex Elder out:
3581788e2df3SAlex Elder 	if (obj_request)
3582788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3583788e2df3SAlex Elder 	else
3584788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3585788e2df3SAlex Elder 
3586788e2df3SAlex Elder 	return ret;
3587788e2df3SAlex Elder }
3588788e2df3SAlex Elder 
3589602adf40SYehuda Sadeh /*
3590662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3591662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3592662518b1SAlex Elder  * information about the image.
35934156d998SAlex Elder  */
359499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
35954156d998SAlex Elder {
35964156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
35974156d998SAlex Elder 	u32 snap_count = 0;
35984156d998SAlex Elder 	u64 names_size = 0;
35994156d998SAlex Elder 	u32 want_count;
36004156d998SAlex Elder 	int ret;
36014156d998SAlex Elder 
36024156d998SAlex Elder 	/*
36034156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
36044156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
36054156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
36064156d998SAlex Elder 	 * the number of snapshots could change by the time we read
36074156d998SAlex Elder 	 * it in, in which case we re-read it.
36084156d998SAlex Elder 	 */
36094156d998SAlex Elder 	do {
36104156d998SAlex Elder 		size_t size;
36114156d998SAlex Elder 
36124156d998SAlex Elder 		kfree(ondisk);
36134156d998SAlex Elder 
36144156d998SAlex Elder 		size = sizeof (*ondisk);
36154156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
36164156d998SAlex Elder 		size += names_size;
36174156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
36184156d998SAlex Elder 		if (!ondisk)
3619662518b1SAlex Elder 			return -ENOMEM;
36204156d998SAlex Elder 
3621788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
36227097f8dfSAlex Elder 				       0, size, ondisk);
36234156d998SAlex Elder 		if (ret < 0)
3624662518b1SAlex Elder 			goto out;
3625c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
36264156d998SAlex Elder 			ret = -ENXIO;
362706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
362806ecc6cbSAlex Elder 				size, ret);
3629662518b1SAlex Elder 			goto out;
36304156d998SAlex Elder 		}
36314156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
36324156d998SAlex Elder 			ret = -ENXIO;
363306ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3634662518b1SAlex Elder 			goto out;
36354156d998SAlex Elder 		}
36364156d998SAlex Elder 
36374156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
36384156d998SAlex Elder 		want_count = snap_count;
36394156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
36404156d998SAlex Elder 	} while (snap_count != want_count);
36414156d998SAlex Elder 
3642662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3643662518b1SAlex Elder out:
36444156d998SAlex Elder 	kfree(ondisk);
36454156d998SAlex Elder 
3646dfc5606dSYehuda Sadeh 	return ret;
3647602adf40SYehuda Sadeh }
3648602adf40SYehuda Sadeh 
364915228edeSAlex Elder /*
365015228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
365115228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
365215228edeSAlex Elder  */
365315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
365415228edeSAlex Elder {
365515228edeSAlex Elder 	u64 snap_id;
365615228edeSAlex Elder 
365715228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
365815228edeSAlex Elder 		return;
365915228edeSAlex Elder 
366015228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
366115228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
366215228edeSAlex Elder 		return;
366315228edeSAlex Elder 
366415228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
366515228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
366615228edeSAlex Elder }
366715228edeSAlex Elder 
36689875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
36699875201eSJosh Durgin {
36709875201eSJosh Durgin 	sector_t size;
36719875201eSJosh Durgin 	bool removing;
36729875201eSJosh Durgin 
36739875201eSJosh Durgin 	/*
36749875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
36759875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
36769875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
36779875201eSJosh Durgin 	 */
36789875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
36799875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
36809875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
36819875201eSJosh Durgin 	/*
36829875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
36839875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
36849875201eSJosh Durgin 	 */
36859875201eSJosh Durgin 	if (!removing) {
36869875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
36879875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
36889875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
36899875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
36909875201eSJosh Durgin 	}
36919875201eSJosh Durgin }
36929875201eSJosh Durgin 
3693cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
36941fe5e993SAlex Elder {
3695e627db08SAlex Elder 	u64 mapping_size;
36961fe5e993SAlex Elder 	int ret;
36971fe5e993SAlex Elder 
3698cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
36993b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3700a720ae09SIlya Dryomov 
3701a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
370252bb1f9bSIlya Dryomov 	if (ret)
370373e39e4dSIlya Dryomov 		goto out;
370415228edeSAlex Elder 
3705e8f59b59SIlya Dryomov 	/*
3706e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
3707e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
3708e8f59b59SIlya Dryomov 	 */
3709e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
3710e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
3711e8f59b59SIlya Dryomov 		if (ret)
371273e39e4dSIlya Dryomov 			goto out;
3713e8f59b59SIlya Dryomov 	}
3714e8f59b59SIlya Dryomov 
37155ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
37165ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
37175ff1108cSIlya Dryomov 	} else {
37185ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
371915228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
37205ff1108cSIlya Dryomov 	}
37215ff1108cSIlya Dryomov 
372273e39e4dSIlya Dryomov out:
3723cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
372473e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
37259875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
37261fe5e993SAlex Elder 
372773e39e4dSIlya Dryomov 	return ret;
37281fe5e993SAlex Elder }
37291fe5e993SAlex Elder 
37307ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
37317ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
37327ad18afaSChristoph Hellwig 		unsigned int numa_node)
37337ad18afaSChristoph Hellwig {
37347ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
37357ad18afaSChristoph Hellwig 
37367ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
37377ad18afaSChristoph Hellwig 	return 0;
37387ad18afaSChristoph Hellwig }
37397ad18afaSChristoph Hellwig 
37407ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
37417ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
37427ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
37437ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
37447ad18afaSChristoph Hellwig };
37457ad18afaSChristoph Hellwig 
3746602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3747602adf40SYehuda Sadeh {
3748602adf40SYehuda Sadeh 	struct gendisk *disk;
3749602adf40SYehuda Sadeh 	struct request_queue *q;
3750593a9e7bSAlex Elder 	u64 segment_size;
37517ad18afaSChristoph Hellwig 	int err;
3752602adf40SYehuda Sadeh 
3753602adf40SYehuda Sadeh 	/* create gendisk info */
37547e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
37557e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
37567e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3757602adf40SYehuda Sadeh 	if (!disk)
37581fcdb8aaSAlex Elder 		return -ENOMEM;
3759602adf40SYehuda Sadeh 
3760f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3761de71a297SAlex Elder 		 rbd_dev->dev_id);
3762602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3763dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
37647e513d43SIlya Dryomov 	if (single_major)
37657e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3766602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3767602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3768602adf40SYehuda Sadeh 
37697ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
37707ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
37717ad18afaSChristoph Hellwig 	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
37727ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
37737ad18afaSChristoph Hellwig 	rbd_dev->tag_set.flags =
37747ad18afaSChristoph Hellwig 		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
37757ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
37767ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
37777ad18afaSChristoph Hellwig 
37787ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
37797ad18afaSChristoph Hellwig 	if (err)
3780602adf40SYehuda Sadeh 		goto out_disk;
3781029bcbd8SJosh Durgin 
37827ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
37837ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
37847ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
37857ad18afaSChristoph Hellwig 		goto out_tag_set;
37867ad18afaSChristoph Hellwig 	}
37877ad18afaSChristoph Hellwig 
3788d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3789d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
3790593a9e7bSAlex Elder 
3791029bcbd8SJosh Durgin 	/* set io sizes to object size */
3792593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3793593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3794593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3795593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3796593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3797029bcbd8SJosh Durgin 
379890e98c52SGuangliang Zhao 	/* enable the discard support */
379990e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
380090e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
380190e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
3802b76f8239SJosh Durgin 	q->limits.max_discard_sectors = segment_size / SECTOR_SIZE;
3803b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
380490e98c52SGuangliang Zhao 
3805602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3806602adf40SYehuda Sadeh 	disk->queue = q;
3807602adf40SYehuda Sadeh 
3808602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3809602adf40SYehuda Sadeh 
3810602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3811602adf40SYehuda Sadeh 
3812602adf40SYehuda Sadeh 	return 0;
38137ad18afaSChristoph Hellwig out_tag_set:
38147ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
3815602adf40SYehuda Sadeh out_disk:
3816602adf40SYehuda Sadeh 	put_disk(disk);
38177ad18afaSChristoph Hellwig 	return err;
3818602adf40SYehuda Sadeh }
3819602adf40SYehuda Sadeh 
3820dfc5606dSYehuda Sadeh /*
3821dfc5606dSYehuda Sadeh   sysfs
3822dfc5606dSYehuda Sadeh */
3823602adf40SYehuda Sadeh 
3824593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3825593a9e7bSAlex Elder {
3826593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3827593a9e7bSAlex Elder }
3828593a9e7bSAlex Elder 
3829dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3830dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3831602adf40SYehuda Sadeh {
3832593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3833dfc5606dSYehuda Sadeh 
3834fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3835fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3836602adf40SYehuda Sadeh }
3837602adf40SYehuda Sadeh 
383834b13184SAlex Elder /*
383934b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
384034b13184SAlex Elder  * necessarily the base image.
384134b13184SAlex Elder  */
384234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
384334b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
384434b13184SAlex Elder {
384534b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
384634b13184SAlex Elder 
384734b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
384834b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
384934b13184SAlex Elder }
385034b13184SAlex Elder 
3851dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3852dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3853602adf40SYehuda Sadeh {
3854593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3855dfc5606dSYehuda Sadeh 
3856fc71d833SAlex Elder 	if (rbd_dev->major)
3857dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3858fc71d833SAlex Elder 
3859fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3860dd82fff1SIlya Dryomov }
3861fc71d833SAlex Elder 
3862dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3863dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3864dd82fff1SIlya Dryomov {
3865dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3866dd82fff1SIlya Dryomov 
3867dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3868dfc5606dSYehuda Sadeh }
3869dfc5606dSYehuda Sadeh 
3870dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3871dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3872dfc5606dSYehuda Sadeh {
3873593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3874dfc5606dSYehuda Sadeh 
38751dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
38761dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3877dfc5606dSYehuda Sadeh }
3878dfc5606dSYehuda Sadeh 
3879dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3880dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3881dfc5606dSYehuda Sadeh {
3882593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3883dfc5606dSYehuda Sadeh 
38840d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3885dfc5606dSYehuda Sadeh }
3886dfc5606dSYehuda Sadeh 
38879bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
38889bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
38899bb2f334SAlex Elder {
38909bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
38919bb2f334SAlex Elder 
38920d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
38930d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
38949bb2f334SAlex Elder }
38959bb2f334SAlex Elder 
3896dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3897dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3898dfc5606dSYehuda Sadeh {
3899593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3900dfc5606dSYehuda Sadeh 
3901a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
39020d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3903a92ffdf8SAlex Elder 
3904a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3905dfc5606dSYehuda Sadeh }
3906dfc5606dSYehuda Sadeh 
3907589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3908589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3909589d30e0SAlex Elder {
3910589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3911589d30e0SAlex Elder 
39120d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3913589d30e0SAlex Elder }
3914589d30e0SAlex Elder 
391534b13184SAlex Elder /*
391634b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
391734b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
391834b13184SAlex Elder  */
3919dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3920dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3921dfc5606dSYehuda Sadeh 			     char *buf)
3922dfc5606dSYehuda Sadeh {
3923593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3924dfc5606dSYehuda Sadeh 
39250d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3926dfc5606dSYehuda Sadeh }
3927dfc5606dSYehuda Sadeh 
392886b00e0dSAlex Elder /*
3929ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
3930ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
3931ff96128fSIlya Dryomov  * image)".
393286b00e0dSAlex Elder  */
393386b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
393486b00e0dSAlex Elder 			       struct device_attribute *attr,
393586b00e0dSAlex Elder 			       char *buf)
393686b00e0dSAlex Elder {
393786b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3938ff96128fSIlya Dryomov 	ssize_t count = 0;
393986b00e0dSAlex Elder 
3940ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
394186b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
394286b00e0dSAlex Elder 
3943ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3944ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
394586b00e0dSAlex Elder 
3946ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
3947ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
3948ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
3949ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
3950ff96128fSIlya Dryomov 			    "overlap %llu\n",
3951ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
3952ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
3953ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
3954ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
3955ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
3956ff96128fSIlya Dryomov 	}
395786b00e0dSAlex Elder 
395886b00e0dSAlex Elder 	return count;
395986b00e0dSAlex Elder }
396086b00e0dSAlex Elder 
3961dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3962dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3963dfc5606dSYehuda Sadeh 				 const char *buf,
3964dfc5606dSYehuda Sadeh 				 size_t size)
3965dfc5606dSYehuda Sadeh {
3966593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3967b813623aSAlex Elder 	int ret;
3968602adf40SYehuda Sadeh 
3969cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3970e627db08SAlex Elder 	if (ret)
397152bb1f9bSIlya Dryomov 		return ret;
3972b813623aSAlex Elder 
397352bb1f9bSIlya Dryomov 	return size;
3974dfc5606dSYehuda Sadeh }
3975602adf40SYehuda Sadeh 
/* read-only attributes describing the mapped image */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* read-only attributes describing the block device and its client */
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);

/* write-only trigger */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);

3988dfc5606dSYehuda Sadeh 
/*
 * NULL-terminated list of the per-device sysfs attributes declared
 * with DEVICE_ATTR(); published through rbd_attr_group.
 */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

4004dfc5606dSYehuda Sadeh 
/* The single (unnamed) attribute group holding every rbd_attrs entry. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};
4008dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
4013dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally does nothing:
 * no memory is freed from here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to release */
}
4017dfc5606dSYehuda Sadeh 
4018dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
4019dfc5606dSYehuda Sadeh 	.name		= "rbd",
4020dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
4021dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
4022dfc5606dSYehuda Sadeh };
4023dfc5606dSYehuda Sadeh 
40248b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
40258b8fb99cSAlex Elder {
40268b8fb99cSAlex Elder 	kref_get(&spec->kref);
40278b8fb99cSAlex Elder 
40288b8fb99cSAlex Elder 	return spec;
40298b8fb99cSAlex Elder }
40308b8fb99cSAlex Elder 
40318b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
40328b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
40338b8fb99cSAlex Elder {
40348b8fb99cSAlex Elder 	if (spec)
40358b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
40368b8fb99cSAlex Elder }
40378b8fb99cSAlex Elder 
40388b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
40398b8fb99cSAlex Elder {
40408b8fb99cSAlex Elder 	struct rbd_spec *spec;
40418b8fb99cSAlex Elder 
40428b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
40438b8fb99cSAlex Elder 	if (!spec)
40448b8fb99cSAlex Elder 		return NULL;
404504077599SIlya Dryomov 
404604077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
404704077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
40488b8fb99cSAlex Elder 	kref_init(&spec->kref);
40498b8fb99cSAlex Elder 
40508b8fb99cSAlex Elder 	return spec;
40518b8fb99cSAlex Elder }
40528b8fb99cSAlex Elder 
40538b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
40548b8fb99cSAlex Elder {
40558b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
40568b8fb99cSAlex Elder 
40578b8fb99cSAlex Elder 	kfree(spec->pool_name);
40588b8fb99cSAlex Elder 	kfree(spec->image_id);
40598b8fb99cSAlex Elder 	kfree(spec->image_name);
40608b8fb99cSAlex Elder 	kfree(spec->snap_name);
40618b8fb99cSAlex Elder 	kfree(spec);
40628b8fb99cSAlex Elder }
40638b8fb99cSAlex Elder 
4064cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4065c53d5893SAlex Elder 				struct rbd_spec *spec)
4066c53d5893SAlex Elder {
4067c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4068c53d5893SAlex Elder 
4069c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
4070c53d5893SAlex Elder 	if (!rbd_dev)
4071c53d5893SAlex Elder 		return NULL;
4072c53d5893SAlex Elder 
4073c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
40746d292906SAlex Elder 	rbd_dev->flags = 0;
4075a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
4076c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4077c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4078c53d5893SAlex Elder 
4079c53d5893SAlex Elder 	rbd_dev->spec = spec;
4080c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4081c53d5893SAlex Elder 
40820903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
40830903e875SAlex Elder 
40840903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40850903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
40860903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40870903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
40880903e875SAlex Elder 
4089c53d5893SAlex Elder 	return rbd_dev;
4090c53d5893SAlex Elder }
4091c53d5893SAlex Elder 
4092c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4093c53d5893SAlex Elder {
4094c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
4095c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
4096c53d5893SAlex Elder 	kfree(rbd_dev);
4097c53d5893SAlex Elder }
4098c53d5893SAlex Elder 
4099dfc5606dSYehuda Sadeh /*
41009d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
41019d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
41029d475de5SAlex Elder  * image.
41039d475de5SAlex Elder  */
41049d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
41059d475de5SAlex Elder 				u8 *order, u64 *snap_size)
41069d475de5SAlex Elder {
41079d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
41089d475de5SAlex Elder 	int ret;
41099d475de5SAlex Elder 	struct {
41109d475de5SAlex Elder 		u8 order;
41119d475de5SAlex Elder 		__le64 size;
41129d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
41139d475de5SAlex Elder 
411436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41159d475de5SAlex Elder 				"rbd", "get_size",
41164157976bSAlex Elder 				&snapid, sizeof (snapid),
4117e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
411836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41199d475de5SAlex Elder 	if (ret < 0)
41209d475de5SAlex Elder 		return ret;
412157385b51SAlex Elder 	if (ret < sizeof (size_buf))
412257385b51SAlex Elder 		return -ERANGE;
41239d475de5SAlex Elder 
4124c3545579SJosh Durgin 	if (order) {
41259d475de5SAlex Elder 		*order = size_buf.order;
4126c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4127c3545579SJosh Durgin 	}
41289d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
41299d475de5SAlex Elder 
4130c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4131c3545579SJosh Durgin 		(unsigned long long)snap_id,
41329d475de5SAlex Elder 		(unsigned long long)*snap_size);
41339d475de5SAlex Elder 
41349d475de5SAlex Elder 	return 0;
41359d475de5SAlex Elder }
41369d475de5SAlex Elder 
41379d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
41389d475de5SAlex Elder {
41399d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
41409d475de5SAlex Elder 					&rbd_dev->header.obj_order,
41419d475de5SAlex Elder 					&rbd_dev->header.image_size);
41429d475de5SAlex Elder }
41439d475de5SAlex Elder 
41441e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
41451e130199SAlex Elder {
41461e130199SAlex Elder 	void *reply_buf;
41471e130199SAlex Elder 	int ret;
41481e130199SAlex Elder 	void *p;
41491e130199SAlex Elder 
41501e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
41511e130199SAlex Elder 	if (!reply_buf)
41521e130199SAlex Elder 		return -ENOMEM;
41531e130199SAlex Elder 
415436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41554157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4156e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
415736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41581e130199SAlex Elder 	if (ret < 0)
41591e130199SAlex Elder 		goto out;
41601e130199SAlex Elder 
41611e130199SAlex Elder 	p = reply_buf;
41621e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
416357385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
416457385b51SAlex Elder 	ret = 0;
41651e130199SAlex Elder 
41661e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
41671e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
41681e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
41691e130199SAlex Elder 	} else {
41701e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
41711e130199SAlex Elder 	}
41721e130199SAlex Elder out:
41731e130199SAlex Elder 	kfree(reply_buf);
41741e130199SAlex Elder 
41751e130199SAlex Elder 	return ret;
41761e130199SAlex Elder }
41771e130199SAlex Elder 
4178b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4179b1b5402aSAlex Elder 		u64 *snap_features)
4180b1b5402aSAlex Elder {
4181b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4182b1b5402aSAlex Elder 	struct {
4183b1b5402aSAlex Elder 		__le64 features;
4184b1b5402aSAlex Elder 		__le64 incompat;
41854157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4186d889140cSAlex Elder 	u64 incompat;
4187b1b5402aSAlex Elder 	int ret;
4188b1b5402aSAlex Elder 
418936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4190b1b5402aSAlex Elder 				"rbd", "get_features",
41914157976bSAlex Elder 				&snapid, sizeof (snapid),
4192e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
419336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4194b1b5402aSAlex Elder 	if (ret < 0)
4195b1b5402aSAlex Elder 		return ret;
419657385b51SAlex Elder 	if (ret < sizeof (features_buf))
419757385b51SAlex Elder 		return -ERANGE;
4198d889140cSAlex Elder 
4199d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
42005cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
4201b8f5c6edSAlex Elder 		return -ENXIO;
4202d889140cSAlex Elder 
4203b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4204b1b5402aSAlex Elder 
4205b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4206b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4207b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4208b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4209b1b5402aSAlex Elder 
4210b1b5402aSAlex Elder 	return 0;
4211b1b5402aSAlex Elder }
4212b1b5402aSAlex Elder 
4213b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4214b1b5402aSAlex Elder {
4215b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4216b1b5402aSAlex Elder 						&rbd_dev->header.features);
4217b1b5402aSAlex Elder }
4218b1b5402aSAlex Elder 
421986b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
422086b00e0dSAlex Elder {
422186b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
422286b00e0dSAlex Elder 	size_t size;
422386b00e0dSAlex Elder 	void *reply_buf = NULL;
422486b00e0dSAlex Elder 	__le64 snapid;
422586b00e0dSAlex Elder 	void *p;
422686b00e0dSAlex Elder 	void *end;
4227642a2537SAlex Elder 	u64 pool_id;
422886b00e0dSAlex Elder 	char *image_id;
42293b5cf2a2SAlex Elder 	u64 snap_id;
423086b00e0dSAlex Elder 	u64 overlap;
423186b00e0dSAlex Elder 	int ret;
423286b00e0dSAlex Elder 
423386b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
423486b00e0dSAlex Elder 	if (!parent_spec)
423586b00e0dSAlex Elder 		return -ENOMEM;
423686b00e0dSAlex Elder 
423786b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
423886b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
423986b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
424086b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
424186b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
424286b00e0dSAlex Elder 	if (!reply_buf) {
424386b00e0dSAlex Elder 		ret = -ENOMEM;
424486b00e0dSAlex Elder 		goto out_err;
424586b00e0dSAlex Elder 	}
424686b00e0dSAlex Elder 
42474d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
424836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
424986b00e0dSAlex Elder 				"rbd", "get_parent",
42504157976bSAlex Elder 				&snapid, sizeof (snapid),
4251e2a58ee5SAlex Elder 				reply_buf, size);
425236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
425386b00e0dSAlex Elder 	if (ret < 0)
425486b00e0dSAlex Elder 		goto out_err;
425586b00e0dSAlex Elder 
425686b00e0dSAlex Elder 	p = reply_buf;
425757385b51SAlex Elder 	end = reply_buf + ret;
425857385b51SAlex Elder 	ret = -ERANGE;
4259642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4260392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4261392a9dadSAlex Elder 		/*
4262392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4263392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4264392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4265392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4266392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4267392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4268392a9dadSAlex Elder 		 * parent.
4269392a9dadSAlex Elder 		 */
4270392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4271392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4272392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4273392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4274392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4275392a9dadSAlex Elder 		}
4276392a9dadSAlex Elder 
427786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4278392a9dadSAlex Elder 	}
427986b00e0dSAlex Elder 
42800903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42810903e875SAlex Elder 
42820903e875SAlex Elder 	ret = -EIO;
4283642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
42849584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4285642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
428657385b51SAlex Elder 		goto out_err;
4287c0cd10dbSAlex Elder 	}
42880903e875SAlex Elder 
4289979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
429086b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
429186b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
429286b00e0dSAlex Elder 		goto out_err;
429386b00e0dSAlex Elder 	}
42943b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
429586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
429686b00e0dSAlex Elder 
42973b5cf2a2SAlex Elder 	/*
42983b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
42993b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
43003b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
43013b5cf2a2SAlex Elder 	 */
43023b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
43033b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
43043b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
43053b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
430686b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
430786b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
4308fbba11b3SIlya Dryomov 	} else {
4309fbba11b3SIlya Dryomov 		kfree(image_id);
43103b5cf2a2SAlex Elder 	}
43113b5cf2a2SAlex Elder 
43123b5cf2a2SAlex Elder 	/*
4313cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4314cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
43153b5cf2a2SAlex Elder 	 */
43163b5cf2a2SAlex Elder 	if (!overlap) {
43173b5cf2a2SAlex Elder 		if (parent_spec) {
4318cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
4319cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
4320cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
4321cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
432270cf49cfSAlex Elder 		} else {
4323cf32bd9cSIlya Dryomov 			/* initial probe */
4324cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
43253b5cf2a2SAlex Elder 		}
432670cf49cfSAlex Elder 	}
4327cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
4328cf32bd9cSIlya Dryomov 
432986b00e0dSAlex Elder out:
433086b00e0dSAlex Elder 	ret = 0;
433186b00e0dSAlex Elder out_err:
433286b00e0dSAlex Elder 	kfree(reply_buf);
433386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
433486b00e0dSAlex Elder 
433586b00e0dSAlex Elder 	return ret;
433686b00e0dSAlex Elder }
433786b00e0dSAlex Elder 
4338cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4339cc070d59SAlex Elder {
4340cc070d59SAlex Elder 	struct {
4341cc070d59SAlex Elder 		__le64 stripe_unit;
4342cc070d59SAlex Elder 		__le64 stripe_count;
4343cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4344cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4345cc070d59SAlex Elder 	void *p;
4346cc070d59SAlex Elder 	u64 obj_size;
4347cc070d59SAlex Elder 	u64 stripe_unit;
4348cc070d59SAlex Elder 	u64 stripe_count;
4349cc070d59SAlex Elder 	int ret;
4350cc070d59SAlex Elder 
4351cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4352cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4353e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4354cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4355cc070d59SAlex Elder 	if (ret < 0)
4356cc070d59SAlex Elder 		return ret;
4357cc070d59SAlex Elder 	if (ret < size)
4358cc070d59SAlex Elder 		return -ERANGE;
4359cc070d59SAlex Elder 
4360cc070d59SAlex Elder 	/*
4361cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4362cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4363cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4364cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4365cc070d59SAlex Elder 	 */
4366cc070d59SAlex Elder 	ret = -EINVAL;
4367cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4368cc070d59SAlex Elder 	p = &striping_info_buf;
4369cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4370cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4371cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4372cc070d59SAlex Elder 				"(got %llu want %llu)",
4373cc070d59SAlex Elder 				stripe_unit, obj_size);
4374cc070d59SAlex Elder 		return -EINVAL;
4375cc070d59SAlex Elder 	}
4376cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4377cc070d59SAlex Elder 	if (stripe_count != 1) {
4378cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4379cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4380cc070d59SAlex Elder 		return -EINVAL;
4381cc070d59SAlex Elder 	}
4382500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4383500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4384cc070d59SAlex Elder 
4385cc070d59SAlex Elder 	return 0;
4386cc070d59SAlex Elder }
4387cc070d59SAlex Elder 
43889e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
43899e15b77dSAlex Elder {
43909e15b77dSAlex Elder 	size_t image_id_size;
43919e15b77dSAlex Elder 	char *image_id;
43929e15b77dSAlex Elder 	void *p;
43939e15b77dSAlex Elder 	void *end;
43949e15b77dSAlex Elder 	size_t size;
43959e15b77dSAlex Elder 	void *reply_buf = NULL;
43969e15b77dSAlex Elder 	size_t len = 0;
43979e15b77dSAlex Elder 	char *image_name = NULL;
43989e15b77dSAlex Elder 	int ret;
43999e15b77dSAlex Elder 
44009e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
44019e15b77dSAlex Elder 
440269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
440369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
44049e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
44059e15b77dSAlex Elder 	if (!image_id)
44069e15b77dSAlex Elder 		return NULL;
44079e15b77dSAlex Elder 
44089e15b77dSAlex Elder 	p = image_id;
44094157976bSAlex Elder 	end = image_id + image_id_size;
441069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
44119e15b77dSAlex Elder 
44129e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
44139e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
44149e15b77dSAlex Elder 	if (!reply_buf)
44159e15b77dSAlex Elder 		goto out;
44169e15b77dSAlex Elder 
441736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
44189e15b77dSAlex Elder 				"rbd", "dir_get_name",
44199e15b77dSAlex Elder 				image_id, image_id_size,
4420e2a58ee5SAlex Elder 				reply_buf, size);
44219e15b77dSAlex Elder 	if (ret < 0)
44229e15b77dSAlex Elder 		goto out;
44239e15b77dSAlex Elder 	p = reply_buf;
4424f40eb349SAlex Elder 	end = reply_buf + ret;
4425f40eb349SAlex Elder 
44269e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
44279e15b77dSAlex Elder 	if (IS_ERR(image_name))
44289e15b77dSAlex Elder 		image_name = NULL;
44299e15b77dSAlex Elder 	else
44309e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
44319e15b77dSAlex Elder out:
44329e15b77dSAlex Elder 	kfree(reply_buf);
44339e15b77dSAlex Elder 	kfree(image_id);
44349e15b77dSAlex Elder 
44359e15b77dSAlex Elder 	return image_name;
44369e15b77dSAlex Elder }
44379e15b77dSAlex Elder 
44382ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44392ad3d716SAlex Elder {
44402ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44412ad3d716SAlex Elder 	const char *snap_name;
44422ad3d716SAlex Elder 	u32 which = 0;
44432ad3d716SAlex Elder 
44442ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
44452ad3d716SAlex Elder 
44462ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
44472ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
44482ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
44492ad3d716SAlex Elder 			return snapc->snaps[which];
44502ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
44512ad3d716SAlex Elder 		which++;
44522ad3d716SAlex Elder 	}
44532ad3d716SAlex Elder 	return CEPH_NOSNAP;
44542ad3d716SAlex Elder }
44552ad3d716SAlex Elder 
44562ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44572ad3d716SAlex Elder {
44582ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44592ad3d716SAlex Elder 	u32 which;
44602ad3d716SAlex Elder 	bool found = false;
44612ad3d716SAlex Elder 	u64 snap_id;
44622ad3d716SAlex Elder 
44632ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
44642ad3d716SAlex Elder 		const char *snap_name;
44652ad3d716SAlex Elder 
44662ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
44672ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4468efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4469efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4470efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4471efadc98aSJosh Durgin 				continue;
4472efadc98aSJosh Durgin 			else
44732ad3d716SAlex Elder 				break;
4474efadc98aSJosh Durgin 		}
44752ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
44762ad3d716SAlex Elder 		kfree(snap_name);
44772ad3d716SAlex Elder 	}
44782ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
44792ad3d716SAlex Elder }
44802ad3d716SAlex Elder 
44812ad3d716SAlex Elder /*
44822ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
44832ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
44842ad3d716SAlex Elder  */
44852ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44862ad3d716SAlex Elder {
44872ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
44882ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
44892ad3d716SAlex Elder 
44902ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
44912ad3d716SAlex Elder }
44922ad3d716SAlex Elder 
44939e15b77dSAlex Elder /*
449404077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
44959e15b77dSAlex Elder  */
449604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
449704077599SIlya Dryomov {
449804077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
449904077599SIlya Dryomov 
450004077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
450104077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
450204077599SIlya Dryomov 	rbd_assert(spec->snap_name);
450304077599SIlya Dryomov 
450404077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
450504077599SIlya Dryomov 		u64 snap_id;
450604077599SIlya Dryomov 
450704077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
450804077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
450904077599SIlya Dryomov 			return -ENOENT;
451004077599SIlya Dryomov 
451104077599SIlya Dryomov 		spec->snap_id = snap_id;
451204077599SIlya Dryomov 	} else {
451304077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
451404077599SIlya Dryomov 	}
451504077599SIlya Dryomov 
451604077599SIlya Dryomov 	return 0;
451704077599SIlya Dryomov }
451804077599SIlya Dryomov 
451904077599SIlya Dryomov /*
452004077599SIlya Dryomov  * A parent image will have all ids but none of the names.
452104077599SIlya Dryomov  *
452204077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
452304077599SIlya Dryomov  * can't figure out the name for an image id.
452404077599SIlya Dryomov  */
452504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
45269e15b77dSAlex Elder {
45272e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
45282e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
45292e9f7f1cSAlex Elder 	const char *pool_name;
45302e9f7f1cSAlex Elder 	const char *image_name;
45312e9f7f1cSAlex Elder 	const char *snap_name;
45329e15b77dSAlex Elder 	int ret;
45339e15b77dSAlex Elder 
453404077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
453504077599SIlya Dryomov 	rbd_assert(spec->image_id);
453604077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
45379e15b77dSAlex Elder 
45382e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
45399e15b77dSAlex Elder 
45402e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
45412e9f7f1cSAlex Elder 	if (!pool_name) {
45422e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4543935dc89fSAlex Elder 		return -EIO;
4544935dc89fSAlex Elder 	}
45452e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
45462e9f7f1cSAlex Elder 	if (!pool_name)
45479e15b77dSAlex Elder 		return -ENOMEM;
45489e15b77dSAlex Elder 
45499e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
45509e15b77dSAlex Elder 
45512e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
45522e9f7f1cSAlex Elder 	if (!image_name)
455306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
45549e15b77dSAlex Elder 
455504077599SIlya Dryomov 	/* Fetch the snapshot name */
45569e15b77dSAlex Elder 
45572e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4558da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4559da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
45609e15b77dSAlex Elder 		goto out_err;
45612e9f7f1cSAlex Elder 	}
45622e9f7f1cSAlex Elder 
45632e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
45642e9f7f1cSAlex Elder 	spec->image_name = image_name;
45652e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
45669e15b77dSAlex Elder 
45679e15b77dSAlex Elder 	return 0;
456804077599SIlya Dryomov 
45699e15b77dSAlex Elder out_err:
45702e9f7f1cSAlex Elder 	kfree(image_name);
45712e9f7f1cSAlex Elder 	kfree(pool_name);
45729e15b77dSAlex Elder 	return ret;
45739e15b77dSAlex Elder }
45749e15b77dSAlex Elder 
4575cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
457635d489f9SAlex Elder {
457735d489f9SAlex Elder 	size_t size;
457835d489f9SAlex Elder 	int ret;
457935d489f9SAlex Elder 	void *reply_buf;
458035d489f9SAlex Elder 	void *p;
458135d489f9SAlex Elder 	void *end;
458235d489f9SAlex Elder 	u64 seq;
458335d489f9SAlex Elder 	u32 snap_count;
458435d489f9SAlex Elder 	struct ceph_snap_context *snapc;
458535d489f9SAlex Elder 	u32 i;
458635d489f9SAlex Elder 
458735d489f9SAlex Elder 	/*
458835d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
458935d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
459035d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
459135d489f9SAlex Elder 	 * prepared to receive.
459235d489f9SAlex Elder 	 */
459335d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
459435d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
459535d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
459635d489f9SAlex Elder 	if (!reply_buf)
459735d489f9SAlex Elder 		return -ENOMEM;
459835d489f9SAlex Elder 
459936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
46004157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4601e2a58ee5SAlex Elder 				reply_buf, size);
460236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
460335d489f9SAlex Elder 	if (ret < 0)
460435d489f9SAlex Elder 		goto out;
460535d489f9SAlex Elder 
460635d489f9SAlex Elder 	p = reply_buf;
460757385b51SAlex Elder 	end = reply_buf + ret;
460857385b51SAlex Elder 	ret = -ERANGE;
460935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
461035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
461135d489f9SAlex Elder 
461235d489f9SAlex Elder 	/*
461335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
461435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
461535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
461635d489f9SAlex Elder 	 * allocate is representable in a size_t.
461735d489f9SAlex Elder 	 */
461835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
461935d489f9SAlex Elder 				 / sizeof (u64)) {
462035d489f9SAlex Elder 		ret = -EINVAL;
462135d489f9SAlex Elder 		goto out;
462235d489f9SAlex Elder 	}
462335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
462435d489f9SAlex Elder 		goto out;
4625468521c1SAlex Elder 	ret = 0;
462635d489f9SAlex Elder 
4627812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
462835d489f9SAlex Elder 	if (!snapc) {
462935d489f9SAlex Elder 		ret = -ENOMEM;
463035d489f9SAlex Elder 		goto out;
463135d489f9SAlex Elder 	}
463235d489f9SAlex Elder 	snapc->seq = seq;
463335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
463435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
463535d489f9SAlex Elder 
463649ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
463735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
463835d489f9SAlex Elder 
463935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
464035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
464135d489f9SAlex Elder out:
464235d489f9SAlex Elder 	kfree(reply_buf);
464335d489f9SAlex Elder 
464457385b51SAlex Elder 	return ret;
464535d489f9SAlex Elder }
464635d489f9SAlex Elder 
464754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
464854cac61fSAlex Elder 					u64 snap_id)
4649b8b1e2dbSAlex Elder {
4650b8b1e2dbSAlex Elder 	size_t size;
4651b8b1e2dbSAlex Elder 	void *reply_buf;
465254cac61fSAlex Elder 	__le64 snapid;
4653b8b1e2dbSAlex Elder 	int ret;
4654b8b1e2dbSAlex Elder 	void *p;
4655b8b1e2dbSAlex Elder 	void *end;
4656b8b1e2dbSAlex Elder 	char *snap_name;
4657b8b1e2dbSAlex Elder 
4658b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4659b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4660b8b1e2dbSAlex Elder 	if (!reply_buf)
4661b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4662b8b1e2dbSAlex Elder 
466354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
466436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4665b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
466654cac61fSAlex Elder 				&snapid, sizeof (snapid),
4667e2a58ee5SAlex Elder 				reply_buf, size);
466836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4669f40eb349SAlex Elder 	if (ret < 0) {
4670f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4671b8b1e2dbSAlex Elder 		goto out;
4672f40eb349SAlex Elder 	}
4673b8b1e2dbSAlex Elder 
4674b8b1e2dbSAlex Elder 	p = reply_buf;
4675f40eb349SAlex Elder 	end = reply_buf + ret;
4676e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4677f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4678b8b1e2dbSAlex Elder 		goto out;
4679f40eb349SAlex Elder 
4680b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
468154cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4682b8b1e2dbSAlex Elder out:
4683b8b1e2dbSAlex Elder 	kfree(reply_buf);
4684b8b1e2dbSAlex Elder 
4685f40eb349SAlex Elder 	return snap_name;
4686b8b1e2dbSAlex Elder }
4687b8b1e2dbSAlex Elder 
46882df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4689117973fbSAlex Elder {
46902df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4691117973fbSAlex Elder 	int ret;
4692117973fbSAlex Elder 
46931617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
46941617e40cSJosh Durgin 	if (ret)
4695cfbf6377SAlex Elder 		return ret;
46961617e40cSJosh Durgin 
46972df3fac7SAlex Elder 	if (first_time) {
46982df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
46992df3fac7SAlex Elder 		if (ret)
4700cfbf6377SAlex Elder 			return ret;
47012df3fac7SAlex Elder 	}
47022df3fac7SAlex Elder 
4703cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4704117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4705117973fbSAlex Elder 
4706117973fbSAlex Elder 	return ret;
4707117973fbSAlex Elder }
4708117973fbSAlex Elder 
4709a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
4710a720ae09SIlya Dryomov {
4711a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4712a720ae09SIlya Dryomov 
4713a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
4714a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
4715a720ae09SIlya Dryomov 
4716a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
4717a720ae09SIlya Dryomov }
4718a720ae09SIlya Dryomov 
4719dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4720dfc5606dSYehuda Sadeh {
4721dfc5606dSYehuda Sadeh 	struct device *dev;
4722cd789ab9SAlex Elder 	int ret;
4723dfc5606dSYehuda Sadeh 
4724cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4725dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4726dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4727dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4728200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4729de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4730dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4731dfc5606dSYehuda Sadeh 
4732dfc5606dSYehuda Sadeh 	return ret;
4733602adf40SYehuda Sadeh }
4734602adf40SYehuda Sadeh 
4735dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4736dfc5606dSYehuda Sadeh {
4737dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4738dfc5606dSYehuda Sadeh }
4739dfc5606dSYehuda Sadeh 
47401ddbe94eSAlex Elder /*
4741499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4742f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
47431ddbe94eSAlex Elder  */
4744f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4745b7f23c36SAlex Elder {
4746f8a22fc2SIlya Dryomov 	int new_dev_id;
4747f8a22fc2SIlya Dryomov 
47489b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
47499b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
47509b60e70bSIlya Dryomov 				    GFP_KERNEL);
4751f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4752f8a22fc2SIlya Dryomov 		return new_dev_id;
4753f8a22fc2SIlya Dryomov 
4754f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4755499afd5bSAlex Elder 
4756499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4757499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4758499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4759f8a22fc2SIlya Dryomov 
476070eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4761f8a22fc2SIlya Dryomov 
4762f8a22fc2SIlya Dryomov 	return 0;
4763b7f23c36SAlex Elder }
4764b7f23c36SAlex Elder 
47651ddbe94eSAlex Elder /*
4766499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4767499afd5bSAlex Elder  * identifier is no longer in use.
47681ddbe94eSAlex Elder  */
4769e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
47701ddbe94eSAlex Elder {
4771499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4772499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4773499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
47741ddbe94eSAlex Elder 
4775f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4776f8a22fc2SIlya Dryomov 
4777f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4778b7f23c36SAlex Elder }
4779b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token found there, i.e. the
 * number of non-white space characters that follow.  A return value
 * of 0 means no token is left.  *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip leading white space */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
4798e28fff26SAlex Elder 
4799e28fff26SAlex Elder /*
4800ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4801ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4802ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4803ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4804ea3352f4SAlex Elder  *
4805ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4806ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4807ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4808ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4809ea3352f4SAlex Elder  *
4810ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4811ea3352f4SAlex Elder  * the end of the found token.
4812ea3352f4SAlex Elder  *
4813ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4814ea3352f4SAlex Elder  */
4815ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4816ea3352f4SAlex Elder {
4817ea3352f4SAlex Elder 	char *dup;
4818ea3352f4SAlex Elder 	size_t len;
4819ea3352f4SAlex Elder 
4820ea3352f4SAlex Elder 	len = next_token(buf);
48214caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4822ea3352f4SAlex Elder 	if (!dup)
4823ea3352f4SAlex Elder 		return NULL;
4824ea3352f4SAlex Elder 	*(dup + len) = '\0';
4825ea3352f4SAlex Elder 	*buf += len;
4826ea3352f4SAlex Elder 
4827ea3352f4SAlex Elder 	if (lenp)
4828ea3352f4SAlex Elder 		*lenp = len;
4829ea3352f4SAlex Elder 
4830ea3352f4SAlex Elder 	return dup;
4831ea3352f4SAlex Elder }
4832ea3352f4SAlex Elder 
4833ea3352f4SAlex Elder /*
4834859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4835859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4836859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4837859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4838d22f76e7SAlex Elder  *
4839859c31dfSAlex Elder  * The information extracted from these options is recorded in
4840859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4841859c31dfSAlex Elder  * structures:
4842859c31dfSAlex Elder  *  ceph_opts
4843859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4844859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4845859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4846859c31dfSAlex Elder  *  rbd_opts
4847859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4848859c31dfSAlex Elder  *	this function; caller must release with kfree().
4849859c31dfSAlex Elder  *  spec
4850859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4851859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4852859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4853859c31dfSAlex Elder  *
4854859c31dfSAlex Elder  * The options passed take this form:
4855859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4856859c31dfSAlex Elder  * where:
4857859c31dfSAlex Elder  *  <mon_addrs>
4858859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4859859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4860859c31dfSAlex Elder  *      by a port number (separated by a colon).
4861859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4862859c31dfSAlex Elder  *  <options>
4863859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4864859c31dfSAlex Elder  *  <pool_name>
4865859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4866859c31dfSAlex Elder  *  <image_name>
4867859c31dfSAlex Elder  *      The name of the image in that pool to map.
4868859c31dfSAlex Elder  *  <snap_id>
4869859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4870859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4871859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4872859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4873a725f65eSAlex Elder  */
4874859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4875dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4876859c31dfSAlex Elder 				struct rbd_options **opts,
4877859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4878a725f65eSAlex Elder {
4879e28fff26SAlex Elder 	size_t len;
4880859c31dfSAlex Elder 	char *options;
48810ddebc0cSAlex Elder 	const char *mon_addrs;
4882ecb4dc22SAlex Elder 	char *snap_name;
48830ddebc0cSAlex Elder 	size_t mon_addrs_size;
4884859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48854e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4886859c31dfSAlex Elder 	struct ceph_options *copts;
4887dc79b113SAlex Elder 	int ret;
4888e28fff26SAlex Elder 
4889e28fff26SAlex Elder 	/* The first four tokens are required */
4890e28fff26SAlex Elder 
48917ef3214aSAlex Elder 	len = next_token(&buf);
48924fb5d671SAlex Elder 	if (!len) {
48934fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
48944fb5d671SAlex Elder 		return -EINVAL;
48954fb5d671SAlex Elder 	}
48960ddebc0cSAlex Elder 	mon_addrs = buf;
4897f28e565aSAlex Elder 	mon_addrs_size = len + 1;
48987ef3214aSAlex Elder 	buf += len;
4899a725f65eSAlex Elder 
4900dc79b113SAlex Elder 	ret = -EINVAL;
4901f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4902f28e565aSAlex Elder 	if (!options)
4903dc79b113SAlex Elder 		return -ENOMEM;
49044fb5d671SAlex Elder 	if (!*options) {
49054fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
49064fb5d671SAlex Elder 		goto out_err;
49074fb5d671SAlex Elder 	}
4908a725f65eSAlex Elder 
4909859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4910859c31dfSAlex Elder 	if (!spec)
4911f28e565aSAlex Elder 		goto out_mem;
4912859c31dfSAlex Elder 
4913859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4914859c31dfSAlex Elder 	if (!spec->pool_name)
4915859c31dfSAlex Elder 		goto out_mem;
49164fb5d671SAlex Elder 	if (!*spec->pool_name) {
49174fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
49184fb5d671SAlex Elder 		goto out_err;
49194fb5d671SAlex Elder 	}
4920e28fff26SAlex Elder 
492169e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4922859c31dfSAlex Elder 	if (!spec->image_name)
4923f28e565aSAlex Elder 		goto out_mem;
49244fb5d671SAlex Elder 	if (!*spec->image_name) {
49254fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
49264fb5d671SAlex Elder 		goto out_err;
49274fb5d671SAlex Elder 	}
4928e28fff26SAlex Elder 
4929f28e565aSAlex Elder 	/*
4930f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4931f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4932f28e565aSAlex Elder 	 */
49333feeb894SAlex Elder 	len = next_token(&buf);
4934820a5f3eSAlex Elder 	if (!len) {
49353feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
49363feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4937f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4938dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4939f28e565aSAlex Elder 		goto out_err;
4940849b4260SAlex Elder 	}
4941ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4942ecb4dc22SAlex Elder 	if (!snap_name)
4943f28e565aSAlex Elder 		goto out_mem;
4944ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4945ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4946e5c35534SAlex Elder 
49470ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4948e28fff26SAlex Elder 
49494e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
49504e9afebaSAlex Elder 	if (!rbd_opts)
49514e9afebaSAlex Elder 		goto out_mem;
49524e9afebaSAlex Elder 
49534e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4954d22f76e7SAlex Elder 
4955859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
49560ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
49574e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4958859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4959859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4960dc79b113SAlex Elder 		goto out_err;
4961dc79b113SAlex Elder 	}
4962859c31dfSAlex Elder 	kfree(options);
4963859c31dfSAlex Elder 
4964859c31dfSAlex Elder 	*ceph_opts = copts;
49654e9afebaSAlex Elder 	*opts = rbd_opts;
4966859c31dfSAlex Elder 	*rbd_spec = spec;
49670ddebc0cSAlex Elder 
4968dc79b113SAlex Elder 	return 0;
4969f28e565aSAlex Elder out_mem:
4970dc79b113SAlex Elder 	ret = -ENOMEM;
4971d22f76e7SAlex Elder out_err:
4972859c31dfSAlex Elder 	kfree(rbd_opts);
4973859c31dfSAlex Elder 	rbd_spec_put(spec);
4974f28e565aSAlex Elder 	kfree(options);
4975d22f76e7SAlex Elder 
4976dc79b113SAlex Elder 	return ret;
4977a725f65eSAlex Elder }
4978a725f65eSAlex Elder 
4979589d30e0SAlex Elder /*
498030ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
498130ba1f02SIlya Dryomov  */
498230ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
498330ba1f02SIlya Dryomov {
4984a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
498530ba1f02SIlya Dryomov 	u64 newest_epoch;
498630ba1f02SIlya Dryomov 	int tries = 0;
498730ba1f02SIlya Dryomov 	int ret;
498830ba1f02SIlya Dryomov 
498930ba1f02SIlya Dryomov again:
499030ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
499130ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
499230ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
499330ba1f02SIlya Dryomov 					       &newest_epoch);
499430ba1f02SIlya Dryomov 		if (ret < 0)
499530ba1f02SIlya Dryomov 			return ret;
499630ba1f02SIlya Dryomov 
499730ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
499830ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
499930ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5000a319bf56SIlya Dryomov 						     newest_epoch,
5001a319bf56SIlya Dryomov 						     opts->mount_timeout);
500230ba1f02SIlya Dryomov 			goto again;
500330ba1f02SIlya Dryomov 		} else {
500430ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
500530ba1f02SIlya Dryomov 			return -ENOENT;
500630ba1f02SIlya Dryomov 		}
500730ba1f02SIlya Dryomov 	}
500830ba1f02SIlya Dryomov 
500930ba1f02SIlya Dryomov 	return ret;
501030ba1f02SIlya Dryomov }
501130ba1f02SIlya Dryomov 
501230ba1f02SIlya Dryomov /*
5013589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5014589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5015589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5016589d30e0SAlex Elder  *
5017589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5018589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5019589d30e0SAlex Elder  * with the supplied name.
5020589d30e0SAlex Elder  *
5021589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5022589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5023589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5024589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5025589d30e0SAlex Elder  */
5026589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5027589d30e0SAlex Elder {
5028589d30e0SAlex Elder 	int ret;
5029589d30e0SAlex Elder 	size_t size;
5030589d30e0SAlex Elder 	char *object_name;
5031589d30e0SAlex Elder 	void *response;
5032c0fba368SAlex Elder 	char *image_id;
50332f82ee54SAlex Elder 
5034589d30e0SAlex Elder 	/*
50352c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
50362c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5037c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5038c0fba368SAlex Elder 	 * do still need to set the image format though.
50392c0d0a10SAlex Elder 	 */
5040c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5041c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5042c0fba368SAlex Elder 
50432c0d0a10SAlex Elder 		return 0;
5044c0fba368SAlex Elder 	}
50452c0d0a10SAlex Elder 
50462c0d0a10SAlex Elder 	/*
5047589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5048589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5049589d30e0SAlex Elder 	 */
505069e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
5051589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
5052589d30e0SAlex Elder 	if (!object_name)
5053589d30e0SAlex Elder 		return -ENOMEM;
50540d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
5055589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
5056589d30e0SAlex Elder 
5057589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5058589d30e0SAlex Elder 
5059589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5060589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5061589d30e0SAlex Elder 	if (!response) {
5062589d30e0SAlex Elder 		ret = -ENOMEM;
5063589d30e0SAlex Elder 		goto out;
5064589d30e0SAlex Elder 	}
5065589d30e0SAlex Elder 
5066c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5067c0fba368SAlex Elder 
506836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
50694157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
5070e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
507136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5072c0fba368SAlex Elder 	if (ret == -ENOENT) {
5073c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5074c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5075c0fba368SAlex Elder 		if (!ret)
5076c0fba368SAlex Elder 			rbd_dev->image_format = 1;
50777dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5078c0fba368SAlex Elder 		void *p = response;
5079589d30e0SAlex Elder 
5080c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5081979ed480SAlex Elder 						NULL, GFP_NOIO);
5082461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5083c0fba368SAlex Elder 		if (!ret)
5084c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5085c0fba368SAlex Elder 	}
5086c0fba368SAlex Elder 
5087c0fba368SAlex Elder 	if (!ret) {
5088c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5089c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5090589d30e0SAlex Elder 	}
5091589d30e0SAlex Elder out:
5092589d30e0SAlex Elder 	kfree(response);
5093589d30e0SAlex Elder 	kfree(object_name);
5094589d30e0SAlex Elder 
5095589d30e0SAlex Elder 	return ret;
5096589d30e0SAlex Elder }
5097589d30e0SAlex Elder 
50983abef3b3SAlex Elder /*
50993abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
51003abef3b3SAlex Elder  * call.
51013abef3b3SAlex Elder  */
51026fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
51036fd48b3bSAlex Elder {
51046fd48b3bSAlex Elder 	struct rbd_image_header	*header;
51056fd48b3bSAlex Elder 
5106a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
51076fd48b3bSAlex Elder 
51086fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
51096fd48b3bSAlex Elder 
51106fd48b3bSAlex Elder 	header = &rbd_dev->header;
5111812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
51126fd48b3bSAlex Elder 	kfree(header->snap_sizes);
51136fd48b3bSAlex Elder 	kfree(header->snap_names);
51146fd48b3bSAlex Elder 	kfree(header->object_prefix);
51156fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
51166fd48b3bSAlex Elder }
51176fd48b3bSAlex Elder 
51182df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5119a30b71b9SAlex Elder {
5120a30b71b9SAlex Elder 	int ret;
5121a30b71b9SAlex Elder 
51221e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
512357385b51SAlex Elder 	if (ret)
51241e130199SAlex Elder 		goto out_err;
5125b1b5402aSAlex Elder 
51262df3fac7SAlex Elder 	/*
51272df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
51282df3fac7SAlex Elder 	 * features are assumed to never change.
51292df3fac7SAlex Elder 	 */
5130b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
513157385b51SAlex Elder 	if (ret)
5132b1b5402aSAlex Elder 		goto out_err;
513335d489f9SAlex Elder 
5134cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5135cc070d59SAlex Elder 
5136cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5137cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5138cc070d59SAlex Elder 		if (ret < 0)
5139cc070d59SAlex Elder 			goto out_err;
5140cc070d59SAlex Elder 	}
51412df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5142a30b71b9SAlex Elder 
514335152979SAlex Elder 	return 0;
51449d475de5SAlex Elder out_err:
5145642a2537SAlex Elder 	rbd_dev->header.features = 0;
51461e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
51471e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
51489d475de5SAlex Elder 
51499d475de5SAlex Elder 	return ret;
5150a30b71b9SAlex Elder }
5151a30b71b9SAlex Elder 
5152124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
515383a06263SAlex Elder {
51542f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5155124afba2SAlex Elder 	struct rbd_spec *parent_spec;
5156124afba2SAlex Elder 	struct rbd_client *rbdc;
5157124afba2SAlex Elder 	int ret;
5158124afba2SAlex Elder 
5159124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5160124afba2SAlex Elder 		return 0;
5161124afba2SAlex Elder 	/*
5162124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
5163124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
5164124afba2SAlex Elder 	 * parent/child relationships always share both.
5165124afba2SAlex Elder 	 */
5166124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
5167124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
5168124afba2SAlex Elder 
5169124afba2SAlex Elder 	ret = -ENOMEM;
5170124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
5171124afba2SAlex Elder 	if (!parent)
5172124afba2SAlex Elder 		goto out_err;
5173124afba2SAlex Elder 
51741f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
5175124afba2SAlex Elder 	if (ret < 0)
5176124afba2SAlex Elder 		goto out_err;
5177124afba2SAlex Elder 	rbd_dev->parent = parent;
5178a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5179124afba2SAlex Elder 
5180124afba2SAlex Elder 	return 0;
5181124afba2SAlex Elder out_err:
5182124afba2SAlex Elder 	if (parent) {
5183fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
5184124afba2SAlex Elder 		kfree(rbd_dev->header_name);
5185124afba2SAlex Elder 		rbd_dev_destroy(parent);
5186124afba2SAlex Elder 	} else {
5187124afba2SAlex Elder 		rbd_put_client(rbdc);
5188124afba2SAlex Elder 		rbd_spec_put(parent_spec);
5189124afba2SAlex Elder 	}
5190124afba2SAlex Elder 
5191124afba2SAlex Elder 	return ret;
5192124afba2SAlex Elder }
5193124afba2SAlex Elder 
5194200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5195124afba2SAlex Elder {
519683a06263SAlex Elder 	int ret;
519783a06263SAlex Elder 
5198f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
519983a06263SAlex Elder 
5200f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
5201f8a22fc2SIlya Dryomov 	if (ret)
5202f8a22fc2SIlya Dryomov 		return ret;
5203f8a22fc2SIlya Dryomov 
520483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
520583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
520683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
520783a06263SAlex Elder 
52089b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
520983a06263SAlex Elder 
52109b60e70bSIlya Dryomov 	if (!single_major) {
521183a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
521283a06263SAlex Elder 		if (ret < 0)
521383a06263SAlex Elder 			goto err_out_id;
52149b60e70bSIlya Dryomov 
521583a06263SAlex Elder 		rbd_dev->major = ret;
5216dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
52179b60e70bSIlya Dryomov 	} else {
52189b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
52199b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
52209b60e70bSIlya Dryomov 	}
522183a06263SAlex Elder 
522283a06263SAlex Elder 	/* Set up the blkdev mapping. */
522383a06263SAlex Elder 
522483a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
522583a06263SAlex Elder 	if (ret)
522683a06263SAlex Elder 		goto err_out_blkdev;
522783a06263SAlex Elder 
5228f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
522983a06263SAlex Elder 	if (ret)
523083a06263SAlex Elder 		goto err_out_disk;
5231bc1ecc65SIlya Dryomov 
5232f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
523322001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5234f35a4deeSAlex Elder 
5235f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
5236f35a4deeSAlex Elder 	if (ret)
5237f5ee37bdSIlya Dryomov 		goto err_out_mapping;
523883a06263SAlex Elder 
523983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
524083a06263SAlex Elder 
5241129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
524283a06263SAlex Elder 	add_disk(rbd_dev->disk);
524383a06263SAlex Elder 
524483a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
524583a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
524683a06263SAlex Elder 
524783a06263SAlex Elder 	return ret;
52482f82ee54SAlex Elder 
5249f35a4deeSAlex Elder err_out_mapping:
5250f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
525183a06263SAlex Elder err_out_disk:
525283a06263SAlex Elder 	rbd_free_disk(rbd_dev);
525383a06263SAlex Elder err_out_blkdev:
52549b60e70bSIlya Dryomov 	if (!single_major)
525583a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
525683a06263SAlex Elder err_out_id:
525783a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
5258d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
525983a06263SAlex Elder 
526083a06263SAlex Elder 	return ret;
526183a06263SAlex Elder }
526283a06263SAlex Elder 
5263332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5264332bb12dSAlex Elder {
5265332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5266332bb12dSAlex Elder 	size_t size;
5267332bb12dSAlex Elder 
5268332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5269332bb12dSAlex Elder 
5270332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5271332bb12dSAlex Elder 
5272332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5273332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5274332bb12dSAlex Elder 	else
5275332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5276332bb12dSAlex Elder 
5277332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5278332bb12dSAlex Elder 	if (!rbd_dev->header_name)
5279332bb12dSAlex Elder 		return -ENOMEM;
5280332bb12dSAlex Elder 
5281332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5282332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5283332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
5284332bb12dSAlex Elder 	else
5285332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5286332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
5287332bb12dSAlex Elder 	return 0;
5288332bb12dSAlex Elder }
5289332bb12dSAlex Elder 
5290200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5291200a6a8bSAlex Elder {
52926fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5293200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
52946fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
52956fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
52966fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
52976fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
52986fd48b3bSAlex Elder 
5299200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5300200a6a8bSAlex Elder }
5301200a6a8bSAlex Elder 
5302a30b71b9SAlex Elder /*
5303a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
53041f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
53051f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
53061f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5307a30b71b9SAlex Elder  */
53081f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
5309a30b71b9SAlex Elder {
5310a30b71b9SAlex Elder 	int ret;
5311a30b71b9SAlex Elder 
5312a30b71b9SAlex Elder 	/*
53133abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
53143abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
53153abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
53163abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5317a30b71b9SAlex Elder 	 */
5318a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5319a30b71b9SAlex Elder 	if (ret)
5320c0fba368SAlex Elder 		return ret;
5321c0fba368SAlex Elder 
5322332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5323332bb12dSAlex Elder 	if (ret)
5324332bb12dSAlex Elder 		goto err_out_format;
5325332bb12dSAlex Elder 
53261f3ef788SAlex Elder 	if (mapping) {
5327fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
53281fe48023SIlya Dryomov 		if (ret) {
53291fe48023SIlya Dryomov 			if (ret == -ENOENT)
53301fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
53311fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
53321fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5333b644de2bSAlex Elder 			goto out_header_name;
53341f3ef788SAlex Elder 		}
53351fe48023SIlya Dryomov 	}
5336b644de2bSAlex Elder 
5337a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
53385655c4d9SAlex Elder 	if (ret)
5339b644de2bSAlex Elder 		goto err_out_watch;
5340a30b71b9SAlex Elder 
534104077599SIlya Dryomov 	/*
534204077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
534304077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
534404077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
534504077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
534604077599SIlya Dryomov 	 */
534704077599SIlya Dryomov 	if (mapping)
534804077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
534904077599SIlya Dryomov 	else
535004077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
53511fe48023SIlya Dryomov 	if (ret) {
53521fe48023SIlya Dryomov 		if (ret == -ENOENT)
53531fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
53541fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
53551fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
53561fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
535733dca39fSAlex Elder 		goto err_out_probe;
53581fe48023SIlya Dryomov 	}
53599bb81c9bSAlex Elder 
5360e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5361e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5362e8f59b59SIlya Dryomov 		if (ret)
5363e8f59b59SIlya Dryomov 			goto err_out_probe;
5364e8f59b59SIlya Dryomov 
5365e8f59b59SIlya Dryomov 		/*
5366e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
5367e8f59b59SIlya Dryomov 		 * mapped and has a parent.
5368e8f59b59SIlya Dryomov 		 */
5369e8f59b59SIlya Dryomov 		if (mapping && rbd_dev->parent_spec)
5370e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
5371e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
5372e8f59b59SIlya Dryomov 	}
5373e8f59b59SIlya Dryomov 
53749bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
537530d60ba2SAlex Elder 	if (ret)
537630d60ba2SAlex Elder 		goto err_out_probe;
537783a06263SAlex Elder 
537830d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
537930d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
538030d60ba2SAlex Elder 	return 0;
5381e8f59b59SIlya Dryomov 
53826fd48b3bSAlex Elder err_out_probe:
53836fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5384b644de2bSAlex Elder err_out_watch:
5385fca27065SIlya Dryomov 	if (mapping)
5386fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5387332bb12dSAlex Elder out_header_name:
5388332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5389332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5390332bb12dSAlex Elder err_out_format:
5391332bb12dSAlex Elder 	rbd_dev->image_format = 0;
53925655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
53935655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
53945655c4d9SAlex Elder 	return ret;
539583a06263SAlex Elder }
539683a06263SAlex Elder 
53979b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
539859c2be1eSYehuda Sadeh 			  const char *buf,
539959c2be1eSYehuda Sadeh 			  size_t count)
5400602adf40SYehuda Sadeh {
5401cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5402dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
54034e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5404859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
54059d3997fdSAlex Elder 	struct rbd_client *rbdc;
540651344a38SAlex Elder 	bool read_only;
540727cc2594SAlex Elder 	int rc = -ENOMEM;
5408602adf40SYehuda Sadeh 
5409602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5410602adf40SYehuda Sadeh 		return -ENODEV;
5411602adf40SYehuda Sadeh 
5412a725f65eSAlex Elder 	/* parse add command */
5413859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5414dc79b113SAlex Elder 	if (rc < 0)
5415bd4ba655SAlex Elder 		goto err_out_module;
541651344a38SAlex Elder 	read_only = rbd_opts->read_only;
541751344a38SAlex Elder 	kfree(rbd_opts);
541851344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5419a725f65eSAlex Elder 
54209d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
54219d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
54229d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
54230ddebc0cSAlex Elder 		goto err_out_args;
54249d3997fdSAlex Elder 	}
5425602adf40SYehuda Sadeh 
5426602adf40SYehuda Sadeh 	/* pick the pool */
542730ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
54281fe48023SIlya Dryomov 	if (rc < 0) {
54291fe48023SIlya Dryomov 		if (rc == -ENOENT)
54301fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
5431602adf40SYehuda Sadeh 		goto err_out_client;
54321fe48023SIlya Dryomov 	}
5433859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5434859c31dfSAlex Elder 
54350903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
54360903e875SAlex Elder 
5437c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
54389584d508SIlya Dryomov 		rbd_warn(NULL, "pool id too large (%llu > %u)",
5439c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
54400903e875SAlex Elder 		rc = -EIO;
54410903e875SAlex Elder 		goto err_out_client;
54420903e875SAlex Elder 	}
54430903e875SAlex Elder 
5444c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5445bd4ba655SAlex Elder 	if (!rbd_dev)
5446bd4ba655SAlex Elder 		goto err_out_client;
5447c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5448c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5449602adf40SYehuda Sadeh 
54501f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5451a30b71b9SAlex Elder 	if (rc < 0)
5452c53d5893SAlex Elder 		goto err_out_rbd_dev;
545305fd6f6fSAlex Elder 
54547ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
54557ce4eef7SAlex Elder 
54567ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
54577ce4eef7SAlex Elder 		read_only = true;
54587ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
54597ce4eef7SAlex Elder 
5460b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
54613abef3b3SAlex Elder 	if (rc) {
5462e37180c0SIlya Dryomov 		/*
5463e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5464e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5465e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5466e37180c0SIlya Dryomov 		 */
5467e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
54683abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
54693abef3b3SAlex Elder 		goto err_out_module;
54703abef3b3SAlex Elder 	}
54713abef3b3SAlex Elder 
5472602adf40SYehuda Sadeh 	return count;
5473b536f69aSAlex Elder 
5474c53d5893SAlex Elder err_out_rbd_dev:
5475c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5476bd4ba655SAlex Elder err_out_client:
54779d3997fdSAlex Elder 	rbd_put_client(rbdc);
54780ddebc0cSAlex Elder err_out_args:
5479859c31dfSAlex Elder 	rbd_spec_put(spec);
5480bd4ba655SAlex Elder err_out_module:
5481bd4ba655SAlex Elder 	module_put(THIS_MODULE);
548227cc2594SAlex Elder 
5483602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
548427cc2594SAlex Elder 
548527cc2594SAlex Elder 	return (ssize_t)rc;
5486602adf40SYehuda Sadeh }
5487602adf40SYehuda Sadeh 
54889b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
54899b60e70bSIlya Dryomov 		       const char *buf,
54909b60e70bSIlya Dryomov 		       size_t count)
54919b60e70bSIlya Dryomov {
54929b60e70bSIlya Dryomov 	if (single_major)
54939b60e70bSIlya Dryomov 		return -EINVAL;
54949b60e70bSIlya Dryomov 
54959b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
54969b60e70bSIlya Dryomov }
54979b60e70bSIlya Dryomov 
54989b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
54999b60e70bSIlya Dryomov 				    const char *buf,
55009b60e70bSIlya Dryomov 				    size_t count)
55019b60e70bSIlya Dryomov {
55029b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
55039b60e70bSIlya Dryomov }
55049b60e70bSIlya Dryomov 
5505200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5506602adf40SYehuda Sadeh {
5507593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5508602adf40SYehuda Sadeh 
5509602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5510200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
55116d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
55129b60e70bSIlya Dryomov 	if (!single_major)
5513602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5514e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5515d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5516602adf40SYehuda Sadeh }
5517602adf40SYehuda Sadeh 
551805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
551905a46afdSAlex Elder {
5520ad945fc1SAlex Elder 	while (rbd_dev->parent) {
552105a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
552205a46afdSAlex Elder 		struct rbd_device *second = first->parent;
552305a46afdSAlex Elder 		struct rbd_device *third;
552405a46afdSAlex Elder 
552505a46afdSAlex Elder 		/*
552605a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
552705a46afdSAlex Elder 		 * remove it.
552805a46afdSAlex Elder 		 */
552905a46afdSAlex Elder 		while (second && (third = second->parent)) {
553005a46afdSAlex Elder 			first = second;
553105a46afdSAlex Elder 			second = third;
553205a46afdSAlex Elder 		}
5533ad945fc1SAlex Elder 		rbd_assert(second);
55348ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5535ad945fc1SAlex Elder 		first->parent = NULL;
5536ad945fc1SAlex Elder 		first->parent_overlap = 0;
5537ad945fc1SAlex Elder 
5538ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
553905a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
554005a46afdSAlex Elder 		first->parent_spec = NULL;
554105a46afdSAlex Elder 	}
554205a46afdSAlex Elder }
554305a46afdSAlex Elder 
55449b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5545602adf40SYehuda Sadeh 			     const char *buf,
5546602adf40SYehuda Sadeh 			     size_t count)
5547602adf40SYehuda Sadeh {
5548602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5549751cc0e3SAlex Elder 	struct list_head *tmp;
5550751cc0e3SAlex Elder 	int dev_id;
5551602adf40SYehuda Sadeh 	unsigned long ul;
555282a442d2SAlex Elder 	bool already = false;
55530d8189e1SAlex Elder 	int ret;
5554602adf40SYehuda Sadeh 
5555bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
55560d8189e1SAlex Elder 	if (ret)
55570d8189e1SAlex Elder 		return ret;
5558602adf40SYehuda Sadeh 
5559602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5560751cc0e3SAlex Elder 	dev_id = (int)ul;
5561751cc0e3SAlex Elder 	if (dev_id != ul)
5562602adf40SYehuda Sadeh 		return -EINVAL;
5563602adf40SYehuda Sadeh 
5564602adf40SYehuda Sadeh 	ret = -ENOENT;
5565751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5566751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5567751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5568751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5569751cc0e3SAlex Elder 			ret = 0;
5570751cc0e3SAlex Elder 			break;
5571602adf40SYehuda Sadeh 		}
5572751cc0e3SAlex Elder 	}
5573751cc0e3SAlex Elder 	if (!ret) {
5574a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5575b82d167bSAlex Elder 		if (rbd_dev->open_count)
557642382b70SAlex Elder 			ret = -EBUSY;
5577b82d167bSAlex Elder 		else
557882a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
557982a442d2SAlex Elder 							&rbd_dev->flags);
5580a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5581751cc0e3SAlex Elder 	}
5582751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
558382a442d2SAlex Elder 	if (ret < 0 || already)
55841ba0f1e7SAlex Elder 		return ret;
5585751cc0e3SAlex Elder 
5586fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
55879abc5990SJosh Durgin 	/*
55889abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
55899abc5990SJosh Durgin 	 * before the osd_client is shutdown
55909abc5990SJosh Durgin 	 */
55919abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
55929abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5593fca27065SIlya Dryomov 
55949875201eSJosh Durgin 	/*
55959875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
55969875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
55979875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
55989875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
55999875201eSJosh Durgin 	 */
56009875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
56018ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
560279ab7558SAlex Elder 	module_put(THIS_MODULE);
5603aafb230eSAlex Elder 
56041ba0f1e7SAlex Elder 	return count;
5605602adf40SYehuda Sadeh }
5606602adf40SYehuda Sadeh 
56079b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
56089b60e70bSIlya Dryomov 			  const char *buf,
56099b60e70bSIlya Dryomov 			  size_t count)
56109b60e70bSIlya Dryomov {
56119b60e70bSIlya Dryomov 	if (single_major)
56129b60e70bSIlya Dryomov 		return -EINVAL;
56139b60e70bSIlya Dryomov 
56149b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56159b60e70bSIlya Dryomov }
56169b60e70bSIlya Dryomov 
56179b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
56189b60e70bSIlya Dryomov 				       const char *buf,
56199b60e70bSIlya Dryomov 				       size_t count)
56209b60e70bSIlya Dryomov {
56219b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56229b60e70bSIlya Dryomov }
56239b60e70bSIlya Dryomov 
5624602adf40SYehuda Sadeh /*
5625602adf40SYehuda Sadeh  * create control files in sysfs
5626dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5627602adf40SYehuda Sadeh  */
5628602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5629602adf40SYehuda Sadeh {
5630dfc5606dSYehuda Sadeh 	int ret;
5631602adf40SYehuda Sadeh 
5632fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5633dfc5606dSYehuda Sadeh 	if (ret < 0)
5634dfc5606dSYehuda Sadeh 		return ret;
5635602adf40SYehuda Sadeh 
5636fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5637fed4c143SAlex Elder 	if (ret < 0)
5638fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5639602adf40SYehuda Sadeh 
5640602adf40SYehuda Sadeh 	return ret;
5641602adf40SYehuda Sadeh }
5642602adf40SYehuda Sadeh 
5643602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5644602adf40SYehuda Sadeh {
5645dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5646fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5647602adf40SYehuda Sadeh }
5648602adf40SYehuda Sadeh 
56491c2a9dfeSAlex Elder static int rbd_slab_init(void)
56501c2a9dfeSAlex Elder {
56511c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
56521c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
56531c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
56541c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
56551c2a9dfeSAlex Elder 					0, NULL);
5656868311b1SAlex Elder 	if (!rbd_img_request_cache)
5657868311b1SAlex Elder 		return -ENOMEM;
5658868311b1SAlex Elder 
5659868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5660868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5661868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5662868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5663868311b1SAlex Elder 					0, NULL);
566478c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
566578c2a44aSAlex Elder 		goto out_err;
566678c2a44aSAlex Elder 
566778c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
566878c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
56692d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
567078c2a44aSAlex Elder 	if (rbd_segment_name_cache)
56711c2a9dfeSAlex Elder 		return 0;
567278c2a44aSAlex Elder out_err:
567378c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
567478c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
567578c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
567678c2a44aSAlex Elder 	}
56771c2a9dfeSAlex Elder 
5678868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5679868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5680868311b1SAlex Elder 
56811c2a9dfeSAlex Elder 	return -ENOMEM;
56821c2a9dfeSAlex Elder }
56831c2a9dfeSAlex Elder 
56841c2a9dfeSAlex Elder static void rbd_slab_exit(void)
56851c2a9dfeSAlex Elder {
568678c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
568778c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
568878c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
568978c2a44aSAlex Elder 
5690868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5691868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5692868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5693868311b1SAlex Elder 
56941c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
56951c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
56961c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
56971c2a9dfeSAlex Elder }
56981c2a9dfeSAlex Elder 
5699cc344fa1SAlex Elder static int __init rbd_init(void)
5700602adf40SYehuda Sadeh {
5701602adf40SYehuda Sadeh 	int rc;
5702602adf40SYehuda Sadeh 
57031e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
57041e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
57051e32d34cSAlex Elder 		return -EINVAL;
57061e32d34cSAlex Elder 	}
5707e1b4d96dSIlya Dryomov 
57081c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5709602adf40SYehuda Sadeh 	if (rc)
5710602adf40SYehuda Sadeh 		return rc;
5711e1b4d96dSIlya Dryomov 
5712f5ee37bdSIlya Dryomov 	/*
5713f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
5714f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
5715f5ee37bdSIlya Dryomov 	 */
5716f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
5717f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
5718f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
5719f5ee37bdSIlya Dryomov 		goto err_out_slab;
5720f5ee37bdSIlya Dryomov 	}
5721f5ee37bdSIlya Dryomov 
57229b60e70bSIlya Dryomov 	if (single_major) {
57239b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
57249b60e70bSIlya Dryomov 		if (rbd_major < 0) {
57259b60e70bSIlya Dryomov 			rc = rbd_major;
5726f5ee37bdSIlya Dryomov 			goto err_out_wq;
57279b60e70bSIlya Dryomov 		}
57289b60e70bSIlya Dryomov 	}
57299b60e70bSIlya Dryomov 
57301c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
57311c2a9dfeSAlex Elder 	if (rc)
57329b60e70bSIlya Dryomov 		goto err_out_blkdev;
57331c2a9dfeSAlex Elder 
57349b60e70bSIlya Dryomov 	if (single_major)
57359b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
57369b60e70bSIlya Dryomov 	else
5737e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
57389b60e70bSIlya Dryomov 
5739e1b4d96dSIlya Dryomov 	return 0;
5740e1b4d96dSIlya Dryomov 
57419b60e70bSIlya Dryomov err_out_blkdev:
57429b60e70bSIlya Dryomov 	if (single_major)
57439b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5744f5ee37bdSIlya Dryomov err_out_wq:
5745f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
5746e1b4d96dSIlya Dryomov err_out_slab:
5747e1b4d96dSIlya Dryomov 	rbd_slab_exit();
57481c2a9dfeSAlex Elder 	return rc;
5749602adf40SYehuda Sadeh }
5750602adf40SYehuda Sadeh 
5751cc344fa1SAlex Elder static void __exit rbd_exit(void)
5752602adf40SYehuda Sadeh {
5753ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
5754602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
57559b60e70bSIlya Dryomov 	if (single_major)
57569b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5757f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
57581c2a9dfeSAlex Elder 	rbd_slab_exit();
5759602adf40SYehuda Sadeh }
5760602adf40SYehuda Sadeh 
5761602adf40SYehuda Sadeh module_init(rbd_init);
5762602adf40SYehuda Sadeh module_exit(rbd_exit);
5763602adf40SYehuda Sadeh 
5764d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5765602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5766602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5767602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5768602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5769602adf40SYehuda Sadeh 
577090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5771602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5772