xref: /openbmc/linux/drivers/block/rbd.c (revision 6cac4695)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
417ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
42602adf40SYehuda Sadeh #include <linux/fs.h>
43602adf40SYehuda Sadeh #include <linux/blkdev.h>
441c2a9dfeSAlex Elder #include <linux/slab.h>
45f8a22fc2SIlya Dryomov #include <linux/idr.h>
46bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
47602adf40SYehuda Sadeh 
48602adf40SYehuda Sadeh #include "rbd_types.h"
49602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG selects the checking variant of rbd_assert() below */
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 *
 * SECTOR_SIZE is built from 1ULL, so it has type unsigned long long.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
60593a9e7bSAlex Elder 
61a2acd00eSAlex Elder /*
62a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
63a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
64a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
65a2acd00eSAlex Elder  * -EINVAL without updating it.
66a2acd00eSAlex Elder  */
67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
68a2acd00eSAlex Elder {
69a2acd00eSAlex Elder 	unsigned int counter;
70a2acd00eSAlex Elder 
71a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
72a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
73a2acd00eSAlex Elder 		return (int)counter;
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	atomic_dec(v);
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder 	return -EINVAL;
78a2acd00eSAlex Elder }
79a2acd00eSAlex Elder 
80a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
82a2acd00eSAlex Elder {
83a2acd00eSAlex Elder 	int counter;
84a2acd00eSAlex Elder 
85a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
86a2acd00eSAlex Elder 	if (counter >= 0)
87a2acd00eSAlex Elder 		return counter;
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	atomic_inc(v);
90a2acd00eSAlex Elder 
91a2acd00eSAlex Elder 	return -EINVAL;
92a2acd00eSAlex Elder }
93a2acd00eSAlex Elder 
/* Driver name; also used as the prefix of rbd_warn() messages */
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* rbd_dev_id_to_minor() shifts a device id left by this many bits */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

/* NOTE(review): presumably the deepest parent chain probed -- confirm */
#define RBD_MAX_PARENT_CHAIN_LEN	16

/* NAME_MAX less the length of the "snap_" prefix (its NUL excluded) */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH evaluates to 11 when sizeof (int) is 4.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
136602adf40SYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These seven fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;
	u64 stripe_unit;
	u64 stripe_count;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};
15659c2be1eSYehuda Sadeh 
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	/* pool: id and its associated name */
	u64		pool_id;
	const char	*pool_name;

	/* image: id and its associated name (name may be null, see above) */
	const char	*image_id;
	const char	*image_name;

	/* snapshot: id and its associated name */
	u64		snap_id;
	const char	*snap_name;

	/* reference count; the spec can be shared by parent and child */
	struct kref	kref;
};
1940d7dbfceSAlex Elder 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* reference count */
	struct list_head	node;	/* NOTE(review): presumably links into rbd_client_list -- confirm */
};
203602adf40SYehuda Sadeh 
/* Callback type taking the image request it is invoked for */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/*
 * Sentinel for rbd_obj_request->which: the object request has no
 * position in an image request's list.
 */
#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type taking the object request it is invoked for */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data attached to an object request (see rbd_obj_request.type) */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};

/* Bit numbers within rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
228926f9b3fSAlex Elder 
/*
 * A request against a single object, either standalone or as one
 * piece of an image request (see the comment on the first union).
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;		/* enum obj_req_flags bits */

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	/* type names which member of the following union is meaningful */
	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;		/* reference count */
};
283bf0d5f50SAlex Elder 
/* Bit numbers within rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};

/*
 * A request against an image; it owns a list of object requests
 * (obj_requests, obj_request_count entries).
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;	/* enum img_req_flags bits */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;	/* reference count */
};
317bf0d5f50SAlex Elder 
/*
 * Iterators over an image request's obj_requests list, linked through
 * rbd_obj_request.links.  Note that the _safe variant is built on
 * list_for_each_entry_safe_reverse(), so it visits the entries in
 * reverse order; "n" is its lookahead cursor.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
324bf0d5f50SAlex Elder 
/* Per-mapping state, embedded in struct rbd_device as "mapping" */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;	/* checked by rbd_open(), set by BLKROSET */
};
330f84344f3SAlex Elder 
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* identity of the mapped image */
	struct rbd_options	*opts;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* layering: parent_spec is non-null for a child in a layered image */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
379dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers (used with test_bit()), not masks.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3916d292906SAlex Elder 
static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;
static struct kmem_cache	*rbd_segment_name_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * Default to false for now, as single-major requires >= 0.75 version of
 * userspace rbd utility.
 */
static bool single_major = false;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");

/* Forward declarations for functions defined later in this file */
static int rbd_img_request_submit(struct rbd_img_request *img_request);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);
431f0f8cef5SAlex Elder 
4329b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4339b60e70bSIlya Dryomov {
4347e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4359b60e70bSIlya Dryomov }
4369b60e70bSIlya Dryomov 
4379b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4389b60e70bSIlya Dryomov {
4397e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4409b60e70bSIlya Dryomov }
4419b60e70bSIlya Dryomov 
/* Bus attributes: owner-writable only (S_IWUSR), no show handler */
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);

/* NULL-terminated attribute list referenced by rbd_bus_group */
static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};
45492c76dc0SIlya Dryomov 
45592c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
45692c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
45792c76dc0SIlya Dryomov {
4589b60e70bSIlya Dryomov 	if (!single_major &&
4599b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4609b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4619b60e70bSIlya Dryomov 		return 0;
4629b60e70bSIlya Dryomov 
46392c76dc0SIlya Dryomov 	return attr->mode;
46492c76dc0SIlya Dryomov }
46592c76dc0SIlya Dryomov 
/* Attribute group for the bus; visibility is filtered per attribute */
static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
/* Provides rbd_bus_groups, referenced by rbd_bus_type below */
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};
476f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd" */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
485f0f8cef5SAlex Elder 
48606ecc6cbSAlex Elder static __printf(2, 3)
48706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
48806ecc6cbSAlex Elder {
48906ecc6cbSAlex Elder 	struct va_format vaf;
49006ecc6cbSAlex Elder 	va_list args;
49106ecc6cbSAlex Elder 
49206ecc6cbSAlex Elder 	va_start(args, fmt);
49306ecc6cbSAlex Elder 	vaf.fmt = fmt;
49406ecc6cbSAlex Elder 	vaf.va = &args;
49506ecc6cbSAlex Elder 
49606ecc6cbSAlex Elder 	if (!rbd_dev)
49706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
49806ecc6cbSAlex Elder 	else if (rbd_dev->disk)
49906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
50006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
50106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
50206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
50306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
50406ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
50506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
50606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
50706ecc6cbSAlex Elder 	else	/* punt */
50806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
50906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
51006ecc6cbSAlex Elder 	va_end(args);
51106ecc6cbSAlex Elder }
51206ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): with RBD_DEBUG defined, log the failed expression
 * together with the function name and line number, then BUG().
 * Without RBD_DEBUG it expands to a no-op expression.
 *
 * The checking variant is wrapped in do { } while (0) so that, like
 * the no-op variant, it behaves as a single statement that needs its
 * trailing semicolon and is safe inside an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
525dfc5606dSYehuda Sadeh 
/* Forward declarations for functions defined later in this file */
static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
54259c2be1eSYehuda Sadeh 
543602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
544602adf40SYehuda Sadeh {
545f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
546b82d167bSAlex Elder 	bool removing = false;
547602adf40SYehuda Sadeh 
548f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
549602adf40SYehuda Sadeh 		return -EROFS;
550602adf40SYehuda Sadeh 
551a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
552b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
553b82d167bSAlex Elder 		removing = true;
554b82d167bSAlex Elder 	else
555b82d167bSAlex Elder 		rbd_dev->open_count++;
556a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
557b82d167bSAlex Elder 	if (removing)
558b82d167bSAlex Elder 		return -ENOENT;
559b82d167bSAlex Elder 
560c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
561340c7a2bSAlex Elder 
562602adf40SYehuda Sadeh 	return 0;
563602adf40SYehuda Sadeh }
564602adf40SYehuda Sadeh 
565db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
566dfc5606dSYehuda Sadeh {
567dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
568b82d167bSAlex Elder 	unsigned long open_count_before;
569b82d167bSAlex Elder 
570a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
571b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
572a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
573b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
574dfc5606dSYehuda Sadeh 
575c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
576dfc5606dSYehuda Sadeh }
577dfc5606dSYehuda Sadeh 
578131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
579131fd9f6SGuangliang Zhao {
58077f33c03SJosh Durgin 	int ret = 0;
581131fd9f6SGuangliang Zhao 	int val;
582131fd9f6SGuangliang Zhao 	bool ro;
58377f33c03SJosh Durgin 	bool ro_changed = false;
584131fd9f6SGuangliang Zhao 
58577f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
586131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
587131fd9f6SGuangliang Zhao 		return -EFAULT;
588131fd9f6SGuangliang Zhao 
589131fd9f6SGuangliang Zhao 	ro = val ? true : false;
590131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
591131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
592131fd9f6SGuangliang Zhao 		return -EROFS;
593131fd9f6SGuangliang Zhao 
59477f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
59577f33c03SJosh Durgin 	/* prevent others open this device */
59677f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
59777f33c03SJosh Durgin 		ret = -EBUSY;
59877f33c03SJosh Durgin 		goto out;
599131fd9f6SGuangliang Zhao 	}
600131fd9f6SGuangliang Zhao 
60177f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
60277f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
60377f33c03SJosh Durgin 		ro_changed = true;
60477f33c03SJosh Durgin 	}
60577f33c03SJosh Durgin 
60677f33c03SJosh Durgin out:
60777f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
60877f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
60977f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
61077f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
61177f33c03SJosh Durgin 
61277f33c03SJosh Durgin 	return ret;
613131fd9f6SGuangliang Zhao }
614131fd9f6SGuangliang Zhao 
615131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
616131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
617131fd9f6SGuangliang Zhao {
618131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
619131fd9f6SGuangliang Zhao 	int ret = 0;
620131fd9f6SGuangliang Zhao 
621131fd9f6SGuangliang Zhao 	switch (cmd) {
622131fd9f6SGuangliang Zhao 	case BLKROSET:
623131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
624131fd9f6SGuangliang Zhao 		break;
625131fd9f6SGuangliang Zhao 	default:
626131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
627131fd9f6SGuangliang Zhao 	}
628131fd9f6SGuangliang Zhao 
629131fd9f6SGuangliang Zhao 	return ret;
630131fd9f6SGuangliang Zhao }
631131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point.  All arguments are passed to rbd_ioctl()
 * unchanged and its result is returned as is.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
639131fd9f6SGuangliang Zhao 
/*
 * Block device operations for rbd devices.  The compat ioctl hook is
 * only wired up when CONFIG_COMPAT is defined.
 */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};
649602adf40SYehuda Sadeh 
650602adf40SYehuda Sadeh /*
6517262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
652cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
653602adf40SYehuda Sadeh  */
654f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
655602adf40SYehuda Sadeh {
656602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
657602adf40SYehuda Sadeh 	int ret = -ENOMEM;
658602adf40SYehuda Sadeh 
65937206ee5SAlex Elder 	dout("%s:\n", __func__);
660602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
661602adf40SYehuda Sadeh 	if (!rbdc)
662602adf40SYehuda Sadeh 		goto out_opt;
663602adf40SYehuda Sadeh 
664602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
665602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
666602adf40SYehuda Sadeh 
66743ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
668602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
66908f75463SAlex Elder 		goto out_rbdc;
67043ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
671602adf40SYehuda Sadeh 
672602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
673602adf40SYehuda Sadeh 	if (ret < 0)
67408f75463SAlex Elder 		goto out_client;
675602adf40SYehuda Sadeh 
676432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
677602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
678432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
679602adf40SYehuda Sadeh 
68037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
681bc534d86SAlex Elder 
682602adf40SYehuda Sadeh 	return rbdc;
68308f75463SAlex Elder out_client:
684602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
68508f75463SAlex Elder out_rbdc:
686602adf40SYehuda Sadeh 	kfree(rbdc);
687602adf40SYehuda Sadeh out_opt:
68843ae4701SAlex Elder 	if (ceph_opts)
68943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
69037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
69137206ee5SAlex Elder 
69228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
693602adf40SYehuda Sadeh }
694602adf40SYehuda Sadeh 
/*
 * Take an additional reference on the given rbd client and return the
 * same pointer.
 */
static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}
7012f82ee54SAlex Elder 
702602adf40SYehuda Sadeh /*
7031f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7041f7ba331SAlex Elder  * found, bump its reference count.
705602adf40SYehuda Sadeh  */
7061f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
707602adf40SYehuda Sadeh {
708602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7091f7ba331SAlex Elder 	bool found = false;
710602adf40SYehuda Sadeh 
71143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
712602adf40SYehuda Sadeh 		return NULL;
713602adf40SYehuda Sadeh 
7141f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7151f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7161f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7172f82ee54SAlex Elder 			__rbd_get_client(client_node);
7182f82ee54SAlex Elder 
7191f7ba331SAlex Elder 			found = true;
7201f7ba331SAlex Elder 			break;
7211f7ba331SAlex Elder 		}
7221f7ba331SAlex Elder 	}
7231f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7241f7ba331SAlex Elder 
7251f7ba331SAlex Elder 	return found ? client_node : NULL;
726602adf40SYehuda Sadeh }
727602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Token values returned by match_token() for rbd_opts_tokens.  The
 * ordering matters: parse_rbd_opts_token() treats every token below
 * Opt_last_int as taking an integer argument, and every token between
 * Opt_last_int and Opt_last_string as taking a string argument.
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_err
};
74159c2be1eSYehuda Sadeh 
/*
 * Pattern table handed to match_token() by parse_rbd_opts_token().
 * The final { Opt_err, NULL } entry terminates the table.
 */
static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_err, NULL}
};
75259c2be1eSYehuda Sadeh 
/* Per-device map options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	int	queue_depth;	/* set by "queue_depth=%d"; parser rejects values < 1 */
	bool	read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};

/*
 * Default option values.
 * NOTE(review): the place where these are applied is outside this
 * part of the file -- confirm against the option-parsing caller.
 */
#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
76098571b5aSAlex Elder 
76159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
76259c2be1eSYehuda Sadeh {
76343ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
76459c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
76559c2be1eSYehuda Sadeh 	int token, intval, ret;
76659c2be1eSYehuda Sadeh 
76743ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
76859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
76959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
77059c2be1eSYehuda Sadeh 		if (ret < 0) {
771210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
77259c2be1eSYehuda Sadeh 			return ret;
77359c2be1eSYehuda Sadeh 		}
77459c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
77559c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
776210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
77759c2be1eSYehuda Sadeh 	} else {
77859c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
77959c2be1eSYehuda Sadeh 	}
78059c2be1eSYehuda Sadeh 
78159c2be1eSYehuda Sadeh 	switch (token) {
782b5584180SIlya Dryomov 	case Opt_queue_depth:
783b5584180SIlya Dryomov 		if (intval < 1) {
784b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
785b5584180SIlya Dryomov 			return -EINVAL;
786b5584180SIlya Dryomov 		}
787b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
788b5584180SIlya Dryomov 		break;
789cc0538b6SAlex Elder 	case Opt_read_only:
790cc0538b6SAlex Elder 		rbd_opts->read_only = true;
791cc0538b6SAlex Elder 		break;
792cc0538b6SAlex Elder 	case Opt_read_write:
793cc0538b6SAlex Elder 		rbd_opts->read_only = false;
794cc0538b6SAlex Elder 		break;
79559c2be1eSYehuda Sadeh 	default:
796210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
797210c104cSIlya Dryomov 		return -EINVAL;
79859c2be1eSYehuda Sadeh 	}
799210c104cSIlya Dryomov 
80059c2be1eSYehuda Sadeh 	return 0;
80159c2be1eSYehuda Sadeh }
80259c2be1eSYehuda Sadeh 
8036d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8046d2940c8SGuangliang Zhao {
8056d2940c8SGuangliang Zhao 	switch (op_type) {
8066d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8076d2940c8SGuangliang Zhao 		return "read";
8086d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8096d2940c8SGuangliang Zhao 		return "write";
81090e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
81190e98c52SGuangliang Zhao 		return "discard";
8126d2940c8SGuangliang Zhao 	default:
8136d2940c8SGuangliang Zhao 		return "???";
8146d2940c8SGuangliang Zhao 	}
8156d2940c8SGuangliang Zhao }
8166d2940c8SGuangliang Zhao 
81759c2be1eSYehuda Sadeh /*
818602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8197262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8207262cfcaSAlex Elder  * function.
821602adf40SYehuda Sadeh  */
8229d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
823602adf40SYehuda Sadeh {
824f8c38929SAlex Elder 	struct rbd_client *rbdc;
82559c2be1eSYehuda Sadeh 
826cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8271f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8289d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
82943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8309d3997fdSAlex Elder 	else
831f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
832cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
833d720bcb0SAlex Elder 
8349d3997fdSAlex Elder 	return rbdc;
835602adf40SYehuda Sadeh }
836602adf40SYehuda Sadeh 
/*
 * Destroy ceph client
 *
 * kref release callback, handed to kref_put() by rbd_put_client().
 *
 * This function takes rbd_client_list_lock itself in order to unlink
 * the client from rbd_client_list, so the caller must NOT already hold
 * that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	/* Off the list now; tear down the ceph client and free the wrapper */
	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
854602adf40SYehuda Sadeh 
855602adf40SYehuda Sadeh /*
856602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
857602adf40SYehuda Sadeh  * it.
858602adf40SYehuda Sadeh  */
8599d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
860602adf40SYehuda Sadeh {
861c53d5893SAlex Elder 	if (rbdc)
8629d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
863602adf40SYehuda Sadeh }
864602adf40SYehuda Sadeh 
865a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
866a30b71b9SAlex Elder {
867a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
868a30b71b9SAlex Elder }
869a30b71b9SAlex Elder 
8708e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8718e94af8eSAlex Elder {
872103a150fSAlex Elder 	size_t size;
873103a150fSAlex Elder 	u32 snap_count;
874103a150fSAlex Elder 
875103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
876103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
877103a150fSAlex Elder 		return false;
878103a150fSAlex Elder 
879db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
880db2388b6SAlex Elder 
881db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
882db2388b6SAlex Elder 		return false;
883db2388b6SAlex Elder 
884db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
885db2388b6SAlex Elder 
886db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
887db2388b6SAlex Elder 		return false;
888db2388b6SAlex Elder 
889103a150fSAlex Elder 	/*
890103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
891103a150fSAlex Elder 	 * that limits the number of snapshots.
892103a150fSAlex Elder 	 */
893103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
894103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
895103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
896103a150fSAlex Elder 		return false;
897103a150fSAlex Elder 
898103a150fSAlex Elder 	/*
899103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
900103a150fSAlex Elder 	 * header must also be representable in a size_t.
901103a150fSAlex Elder 	 */
902103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
903103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
904103a150fSAlex Elder 		return false;
905103a150fSAlex Elder 
906103a150fSAlex Elder 	return true;
9078e94af8eSAlex Elder }
9088e94af8eSAlex Elder 
909602adf40SYehuda Sadeh /*
910bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
911bb23e37aSAlex Elder  * on-disk header.
912602adf40SYehuda Sadeh  */
913662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9144156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
915602adf40SYehuda Sadeh {
916662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
917bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
918bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
919bb23e37aSAlex Elder 	char *object_prefix = NULL;
920bb23e37aSAlex Elder 	char *snap_names = NULL;
921bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
922ccece235SAlex Elder 	u32 snap_count;
923d2bb24e5SAlex Elder 	size_t size;
924bb23e37aSAlex Elder 	int ret = -ENOMEM;
925621901d6SAlex Elder 	u32 i;
926602adf40SYehuda Sadeh 
927bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
928103a150fSAlex Elder 
929bb23e37aSAlex Elder 	if (first_time) {
930bb23e37aSAlex Elder 		size_t len;
931bb23e37aSAlex Elder 
932bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
933bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
934bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
935bb23e37aSAlex Elder 		if (!object_prefix)
936602adf40SYehuda Sadeh 			return -ENOMEM;
937bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
938bb23e37aSAlex Elder 		object_prefix[len] = '\0';
939bb23e37aSAlex Elder 	}
94000f1f36fSAlex Elder 
941bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
942d2bb24e5SAlex Elder 
943602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
944bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
945bb23e37aSAlex Elder 	if (!snapc)
946bb23e37aSAlex Elder 		goto out_err;
947bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
948602adf40SYehuda Sadeh 	if (snap_count) {
949bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
950f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
951f785cc1dSAlex Elder 
952bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
953621901d6SAlex Elder 
954f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
955bb23e37aSAlex Elder 			goto out_2big;
956bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
957bb23e37aSAlex Elder 		if (!snap_names)
958602adf40SYehuda Sadeh 			goto out_err;
959bb23e37aSAlex Elder 
960bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
961bb23e37aSAlex Elder 
962bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
963bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
964bb23e37aSAlex Elder 		if (!snap_sizes)
965bb23e37aSAlex Elder 			goto out_err;
966bb23e37aSAlex Elder 
967f785cc1dSAlex Elder 		/*
968bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
969bb23e37aSAlex Elder 		 * and size.
970bb23e37aSAlex Elder 		 *
97199a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
972bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
973f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
974f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
975f785cc1dSAlex Elder 		 */
976bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
977bb23e37aSAlex Elder 		snaps = ondisk->snaps;
978bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
979bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
980bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
981bb23e37aSAlex Elder 		}
982602adf40SYehuda Sadeh 	}
983849b4260SAlex Elder 
984bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
985bb23e37aSAlex Elder 
986bb23e37aSAlex Elder 	if (first_time) {
987bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
988602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
989602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
990602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
991bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
992bb23e37aSAlex Elder 		header->stripe_unit = 0;
993bb23e37aSAlex Elder 		header->stripe_count = 0;
994bb23e37aSAlex Elder 		header->features = 0;
995662518b1SAlex Elder 	} else {
996662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
997662518b1SAlex Elder 		kfree(header->snap_names);
998662518b1SAlex Elder 		kfree(header->snap_sizes);
999bb23e37aSAlex Elder 	}
10006a52325fSAlex Elder 
1001bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1002621901d6SAlex Elder 
1003f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1004bb23e37aSAlex Elder 	header->snapc = snapc;
1005bb23e37aSAlex Elder 	header->snap_names = snap_names;
1006bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1007468521c1SAlex Elder 
1008602adf40SYehuda Sadeh 	return 0;
1009bb23e37aSAlex Elder out_2big:
1010bb23e37aSAlex Elder 	ret = -EIO;
10116a52325fSAlex Elder out_err:
1012bb23e37aSAlex Elder 	kfree(snap_sizes);
1013bb23e37aSAlex Elder 	kfree(snap_names);
1014bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1015bb23e37aSAlex Elder 	kfree(object_prefix);
1016ccece235SAlex Elder 
1017bb23e37aSAlex Elder 	return ret;
1018602adf40SYehuda Sadeh }
1019602adf40SYehuda Sadeh 
10209682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10219682fc6dSAlex Elder {
10229682fc6dSAlex Elder 	const char *snap_name;
10239682fc6dSAlex Elder 
10249682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10259682fc6dSAlex Elder 
10269682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10279682fc6dSAlex Elder 
10289682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10299682fc6dSAlex Elder 	while (which--)
10309682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10319682fc6dSAlex Elder 
10329682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10339682fc6dSAlex Elder }
10349682fc6dSAlex Elder 
103530d1cff8SAlex Elder /*
103630d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
103730d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
103830d1cff8SAlex Elder  */
103930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
104030d1cff8SAlex Elder {
104130d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
104230d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
104330d1cff8SAlex Elder 
104430d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
104530d1cff8SAlex Elder 		return 1;
104630d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
104730d1cff8SAlex Elder }
104830d1cff8SAlex Elder 
104930d1cff8SAlex Elder /*
105030d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
105130d1cff8SAlex Elder  * present.
105230d1cff8SAlex Elder  *
105330d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
105430d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
105530d1cff8SAlex Elder  *
105630d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
105730d1cff8SAlex Elder  * reverse order, highest snapshot id first.
105830d1cff8SAlex Elder  */
10599682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10609682fc6dSAlex Elder {
10619682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
106230d1cff8SAlex Elder 	u64 *found;
10639682fc6dSAlex Elder 
106430d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
106530d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10669682fc6dSAlex Elder 
106730d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10689682fc6dSAlex Elder }
10699682fc6dSAlex Elder 
10702ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10712ad3d716SAlex Elder 					u64 snap_id)
107254cac61fSAlex Elder {
107354cac61fSAlex Elder 	u32 which;
1074da6a6b63SJosh Durgin 	const char *snap_name;
107554cac61fSAlex Elder 
107654cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
107754cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1078da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
107954cac61fSAlex Elder 
1080da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1081da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
108254cac61fSAlex Elder }
108354cac61fSAlex Elder 
10849e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10859e15b77dSAlex Elder {
10869e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10879e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10889e15b77dSAlex Elder 
108954cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
109054cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
109154cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10929e15b77dSAlex Elder 
109354cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10949e15b77dSAlex Elder }
10959e15b77dSAlex Elder 
10962ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10972ad3d716SAlex Elder 				u64 *snap_size)
1098602adf40SYehuda Sadeh {
10992ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11002ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11012ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11022ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11032ad3d716SAlex Elder 		u32 which;
110400f1f36fSAlex Elder 
11052ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11062ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11072ad3d716SAlex Elder 			return -ENOENT;
110800f1f36fSAlex Elder 
11092ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11102ad3d716SAlex Elder 	} else {
11112ad3d716SAlex Elder 		u64 size = 0;
11122ad3d716SAlex Elder 		int ret;
11132ad3d716SAlex Elder 
11142ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11152ad3d716SAlex Elder 		if (ret)
11162ad3d716SAlex Elder 			return ret;
11172ad3d716SAlex Elder 
11182ad3d716SAlex Elder 		*snap_size = size;
11192ad3d716SAlex Elder 	}
11202ad3d716SAlex Elder 	return 0;
11212ad3d716SAlex Elder }
11222ad3d716SAlex Elder 
11232ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11242ad3d716SAlex Elder 			u64 *snap_features)
11252ad3d716SAlex Elder {
11262ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11272ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11282ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11292ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11302ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11312ad3d716SAlex Elder 	} else {
11322ad3d716SAlex Elder 		u64 features = 0;
11332ad3d716SAlex Elder 		int ret;
11342ad3d716SAlex Elder 
11352ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11362ad3d716SAlex Elder 		if (ret)
11372ad3d716SAlex Elder 			return ret;
11382ad3d716SAlex Elder 
11392ad3d716SAlex Elder 		*snap_features = features;
11402ad3d716SAlex Elder 	}
11412ad3d716SAlex Elder 	return 0;
114200f1f36fSAlex Elder }
1143602adf40SYehuda Sadeh 
1144d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1145602adf40SYehuda Sadeh {
11468f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11472ad3d716SAlex Elder 	u64 size = 0;
11482ad3d716SAlex Elder 	u64 features = 0;
11492ad3d716SAlex Elder 	int ret;
11508b0241f8SAlex Elder 
11512ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11522ad3d716SAlex Elder 	if (ret)
11532ad3d716SAlex Elder 		return ret;
11542ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11552ad3d716SAlex Elder 	if (ret)
11562ad3d716SAlex Elder 		return ret;
11572ad3d716SAlex Elder 
11582ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11592ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11602ad3d716SAlex Elder 
11618b0241f8SAlex Elder 	return 0;
1162602adf40SYehuda Sadeh }
1163602adf40SYehuda Sadeh 
/* Reset the mapping's size and features to zero */
static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}
1169200a6a8bSAlex Elder 
/*
 * Return a segment name obtained from rbd_segment_name() to
 * rbd_segment_name_cache.
 */
static void rbd_segment_name_free(const char *name)
{
	/* The explicit cast here is needed to drop the const qualifier */

	kmem_cache_free(rbd_segment_name_cache, (void *)name);
}
11767d5079aaSHimangi Saraogi 
117798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1178602adf40SYehuda Sadeh {
117965ccfe21SAlex Elder 	char *name;
118065ccfe21SAlex Elder 	u64 segment;
118165ccfe21SAlex Elder 	int ret;
11823a96d5cdSJosh Durgin 	char *name_format;
1183602adf40SYehuda Sadeh 
118478c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
118565ccfe21SAlex Elder 	if (!name)
118665ccfe21SAlex Elder 		return NULL;
118765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11883a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11893a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11903a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11912d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
119265ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11932d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
119465ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
119565ccfe21SAlex Elder 			segment, ret);
11967d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
119765ccfe21SAlex Elder 		name = NULL;
119865ccfe21SAlex Elder 	}
1199602adf40SYehuda Sadeh 
120065ccfe21SAlex Elder 	return name;
120165ccfe21SAlex Elder }
1202602adf40SYehuda Sadeh 
120365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
120465ccfe21SAlex Elder {
120565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1206602adf40SYehuda Sadeh 
120765ccfe21SAlex Elder 	return offset & (segment_size - 1);
120865ccfe21SAlex Elder }
120965ccfe21SAlex Elder 
121065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
121165ccfe21SAlex Elder 				u64 offset, u64 length)
121265ccfe21SAlex Elder {
121365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
121465ccfe21SAlex Elder 
121565ccfe21SAlex Elder 	offset &= segment_size - 1;
121665ccfe21SAlex Elder 
1217aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
121865ccfe21SAlex Elder 	if (offset + length > segment_size)
121965ccfe21SAlex Elder 		length = segment_size - offset;
122065ccfe21SAlex Elder 
122165ccfe21SAlex Elder 	return length;
1222602adf40SYehuda Sadeh }
1223602adf40SYehuda Sadeh 
1224602adf40SYehuda Sadeh /*
1225029bcbd8SJosh Durgin  * returns the size of an object in the image
1226029bcbd8SJosh Durgin  */
1227029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1228029bcbd8SJosh Durgin {
1229029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1230029bcbd8SJosh Durgin }
1231029bcbd8SJosh Durgin 
1232029bcbd8SJosh Durgin /*
1233602adf40SYehuda Sadeh  * bio helpers
1234602adf40SYehuda Sadeh  */
1235602adf40SYehuda Sadeh 
1236602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1237602adf40SYehuda Sadeh {
1238602adf40SYehuda Sadeh 	struct bio *tmp;
1239602adf40SYehuda Sadeh 
1240602adf40SYehuda Sadeh 	while (chain) {
1241602adf40SYehuda Sadeh 		tmp = chain;
1242602adf40SYehuda Sadeh 		chain = chain->bi_next;
1243602adf40SYehuda Sadeh 		bio_put(tmp);
1244602adf40SYehuda Sadeh 	}
1245602adf40SYehuda Sadeh }
1246602adf40SYehuda Sadeh 
1247602adf40SYehuda Sadeh /*
1248602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1249602adf40SYehuda Sadeh  */
1250602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1251602adf40SYehuda Sadeh {
12527988613bSKent Overstreet 	struct bio_vec bv;
12537988613bSKent Overstreet 	struct bvec_iter iter;
1254602adf40SYehuda Sadeh 	unsigned long flags;
1255602adf40SYehuda Sadeh 	void *buf;
1256602adf40SYehuda Sadeh 	int pos = 0;
1257602adf40SYehuda Sadeh 
1258602adf40SYehuda Sadeh 	while (chain) {
12597988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12607988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1261602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12627988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1263602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12647988613bSKent Overstreet 				       bv.bv_len - remainder);
12657988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
126685b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1267602adf40SYehuda Sadeh 			}
12687988613bSKent Overstreet 			pos += bv.bv_len;
1269602adf40SYehuda Sadeh 		}
1270602adf40SYehuda Sadeh 
1271602adf40SYehuda Sadeh 		chain = chain->bi_next;
1272602adf40SYehuda Sadeh 	}
1273602adf40SYehuda Sadeh }
1274602adf40SYehuda Sadeh 
1275602adf40SYehuda Sadeh /*
1276b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1277b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1278b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1279b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1280b9434c5bSAlex Elder  */
1281b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1282b9434c5bSAlex Elder {
1283b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1284b9434c5bSAlex Elder 
1285b9434c5bSAlex Elder 	rbd_assert(end > offset);
1286b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1287b9434c5bSAlex Elder 	while (offset < end) {
1288b9434c5bSAlex Elder 		size_t page_offset;
1289b9434c5bSAlex Elder 		size_t length;
1290b9434c5bSAlex Elder 		unsigned long flags;
1291b9434c5bSAlex Elder 		void *kaddr;
1292b9434c5bSAlex Elder 
1293491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1294491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1295b9434c5bSAlex Elder 		local_irq_save(flags);
1296b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1297b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1298e2156054SAlex Elder 		flush_dcache_page(*page);
1299b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1300b9434c5bSAlex Elder 		local_irq_restore(flags);
1301b9434c5bSAlex Elder 
1302b9434c5bSAlex Elder 		offset += length;
1303b9434c5bSAlex Elder 		page++;
1304b9434c5bSAlex Elder 	}
1305b9434c5bSAlex Elder }
1306b9434c5bSAlex Elder 
1307b9434c5bSAlex Elder /*
1308f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1309f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1310602adf40SYehuda Sadeh  */
1311f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1312f7760dadSAlex Elder 					unsigned int offset,
1313f7760dadSAlex Elder 					unsigned int len,
1314f7760dadSAlex Elder 					gfp_t gfpmask)
1315602adf40SYehuda Sadeh {
1316f7760dadSAlex Elder 	struct bio *bio;
1317602adf40SYehuda Sadeh 
13185341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1319f7760dadSAlex Elder 	if (!bio)
1320f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1321f7760dadSAlex Elder 
13225341a627SKent Overstreet 	bio_advance(bio, offset);
13234f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1324602adf40SYehuda Sadeh 
1325f7760dadSAlex Elder 	return bio;
1326602adf40SYehuda Sadeh }
1327602adf40SYehuda Sadeh 
1328f7760dadSAlex Elder /*
1329f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1330f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1331f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1332f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1333f7760dadSAlex Elder  *
1334f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1335f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1336f7760dadSAlex Elder  * the start of data to be cloned is located.
1337f7760dadSAlex Elder  *
1338f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1339f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1340f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1341f7760dadSAlex Elder  */
1342f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1343f7760dadSAlex Elder 					unsigned int *offset,
1344f7760dadSAlex Elder 					unsigned int len,
1345f7760dadSAlex Elder 					gfp_t gfpmask)
1346f7760dadSAlex Elder {
1347f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1348f7760dadSAlex Elder 	unsigned int off = *offset;
1349f7760dadSAlex Elder 	struct bio *chain = NULL;
1350f7760dadSAlex Elder 	struct bio **end;
1351602adf40SYehuda Sadeh 
1352f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1353602adf40SYehuda Sadeh 
13544f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1355f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1356602adf40SYehuda Sadeh 
1357f7760dadSAlex Elder 	end = &chain;
1358f7760dadSAlex Elder 	while (len) {
1359f7760dadSAlex Elder 		unsigned int bi_size;
1360f7760dadSAlex Elder 		struct bio *bio;
1361f7760dadSAlex Elder 
1362f5400b7aSAlex Elder 		if (!bi) {
1363f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1364f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1365f5400b7aSAlex Elder 		}
13664f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1367f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1368f7760dadSAlex Elder 		if (!bio)
1369f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1370f7760dadSAlex Elder 
1371f7760dadSAlex Elder 		*end = bio;
1372f7760dadSAlex Elder 		end = &bio->bi_next;
1373f7760dadSAlex Elder 
1374f7760dadSAlex Elder 		off += bi_size;
13754f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1376f7760dadSAlex Elder 			bi = bi->bi_next;
1377f7760dadSAlex Elder 			off = 0;
1378f7760dadSAlex Elder 		}
1379f7760dadSAlex Elder 		len -= bi_size;
1380f7760dadSAlex Elder 	}
1381f7760dadSAlex Elder 	*bio_src = bi;
1382f7760dadSAlex Elder 	*offset = off;
1383f7760dadSAlex Elder 
1384f7760dadSAlex Elder 	return chain;
1385f7760dadSAlex Elder out_err:
1386f7760dadSAlex Elder 	bio_chain_put(chain);
1387f7760dadSAlex Elder 
1388602adf40SYehuda Sadeh 	return NULL;
1389602adf40SYehuda Sadeh }
1390602adf40SYehuda Sadeh 
1391926f9b3fSAlex Elder /*
1392926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1393926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1394926f9b3fSAlex Elder  * again.
1395926f9b3fSAlex Elder  */
13966365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13976365d33aSAlex Elder {
13986365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13996365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14006365d33aSAlex Elder 
140157acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14029584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14036365d33aSAlex Elder 			obj_request);
14046365d33aSAlex Elder 	}
14056365d33aSAlex Elder }
14066365d33aSAlex Elder 
14076365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14086365d33aSAlex Elder {
14096365d33aSAlex Elder 	smp_mb();
14106365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14116365d33aSAlex Elder }
14126365d33aSAlex Elder 
141357acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
141457acbaa7SAlex Elder {
141557acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
141657acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
141757acbaa7SAlex Elder 
141857acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
141957acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14209584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
142157acbaa7SAlex Elder 			obj_request);
142257acbaa7SAlex Elder 	}
142357acbaa7SAlex Elder }
142457acbaa7SAlex Elder 
142557acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
142657acbaa7SAlex Elder {
142757acbaa7SAlex Elder 	smp_mb();
142857acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
142957acbaa7SAlex Elder }
143057acbaa7SAlex Elder 
14315679c59fSAlex Elder /*
14325679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14335679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14345679c59fSAlex Elder  *
14355679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14365679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14375679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14385679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14395679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14405679c59fSAlex Elder  */
14415679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14425679c59fSAlex Elder 				bool exists)
14435679c59fSAlex Elder {
14445679c59fSAlex Elder 	if (exists)
14455679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14465679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14475679c59fSAlex Elder 	smp_mb();
14485679c59fSAlex Elder }
14495679c59fSAlex Elder 
14505679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14515679c59fSAlex Elder {
14525679c59fSAlex Elder 	smp_mb();
14535679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14545679c59fSAlex Elder }
14555679c59fSAlex Elder 
14565679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14575679c59fSAlex Elder {
14585679c59fSAlex Elder 	smp_mb();
14595679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14605679c59fSAlex Elder }
14615679c59fSAlex Elder 
14629638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
14639638556aSIlya Dryomov {
14649638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
14659638556aSIlya Dryomov 
14669638556aSIlya Dryomov 	return obj_request->img_offset <
14679638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
14689638556aSIlya Dryomov }
14699638556aSIlya Dryomov 
1470bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1471bf0d5f50SAlex Elder {
147237206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
147337206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1474bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1475bf0d5f50SAlex Elder }
1476bf0d5f50SAlex Elder 
1477bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1478bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1479bf0d5f50SAlex Elder {
1480bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
148137206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
148237206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1483bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1484bf0d5f50SAlex Elder }
1485bf0d5f50SAlex Elder 
14860f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14870f2d5be7SAlex Elder {
14880f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14890f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14900f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14910f2d5be7SAlex Elder }
14920f2d5be7SAlex Elder 
1493e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1494e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1495bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1496bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1497bf0d5f50SAlex Elder {
1498bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
149937206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
150037206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1501e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1502e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1503e93f3152SAlex Elder 	else
1504bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1505bf0d5f50SAlex Elder }
1506bf0d5f50SAlex Elder 
1507bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1508bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1509bf0d5f50SAlex Elder {
151025dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
151125dcf954SAlex Elder 
1512b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1513bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
151425dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15156365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15166365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1517bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
151825dcf954SAlex Elder 	img_request->obj_request_count++;
151925dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
152037206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
152137206ee5SAlex Elder 		obj_request->which);
1522bf0d5f50SAlex Elder }
1523bf0d5f50SAlex Elder 
1524bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1525bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1526bf0d5f50SAlex Elder {
1527bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
152825dcf954SAlex Elder 
152937206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
153037206ee5SAlex Elder 		obj_request->which);
1531bf0d5f50SAlex Elder 	list_del(&obj_request->links);
153225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
153325dcf954SAlex Elder 	img_request->obj_request_count--;
153425dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
153525dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15366365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1537bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1538bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
153925dcf954SAlex Elder 	obj_request->callback = NULL;
1540bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1541bf0d5f50SAlex Elder }
1542bf0d5f50SAlex Elder 
1543bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1544bf0d5f50SAlex Elder {
1545bf0d5f50SAlex Elder 	switch (type) {
15469969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1547bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1548788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1549bf0d5f50SAlex Elder 		return true;
1550bf0d5f50SAlex Elder 	default:
1551bf0d5f50SAlex Elder 		return false;
1552bf0d5f50SAlex Elder 	}
1553bf0d5f50SAlex Elder }
1554bf0d5f50SAlex Elder 
1555bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1556bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1557bf0d5f50SAlex Elder {
155871c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1559bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1560bf0d5f50SAlex Elder }
1561bf0d5f50SAlex Elder 
156271c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
156371c20a06SIlya Dryomov {
156471c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
156571c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
156671c20a06SIlya Dryomov }
156771c20a06SIlya Dryomov 
156871c20a06SIlya Dryomov /*
156971c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
157071c20a06SIlya Dryomov  * underlying osd request.
15712894e1d7SIlya Dryomov  *
15722894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
157371c20a06SIlya Dryomov  */
15742894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
15752894e1d7SIlya Dryomov 				  unsigned long timeout)
157671c20a06SIlya Dryomov {
15772894e1d7SIlya Dryomov 	long ret;
157871c20a06SIlya Dryomov 
157971c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
15802894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
15812894e1d7SIlya Dryomov 					&obj_request->completion,
15822894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
15832894e1d7SIlya Dryomov 	if (ret <= 0) {
15842894e1d7SIlya Dryomov 		if (ret == 0)
15852894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
158671c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
15872894e1d7SIlya Dryomov 	} else {
15882894e1d7SIlya Dryomov 		ret = 0;
15892894e1d7SIlya Dryomov 	}
15902894e1d7SIlya Dryomov 
15912894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
159271c20a06SIlya Dryomov 	return ret;
159371c20a06SIlya Dryomov }
159471c20a06SIlya Dryomov 
/* Wait for an object request with a timeout of 0 ("wait forever"). */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	const unsigned long no_timeout = 0;

	return __rbd_obj_request_wait(obj_request, no_timeout);
}
15992894e1d7SIlya Dryomov 
/*
 * Wait for an object request for at most @timeout; the value is
 * passed unchanged to __rbd_obj_request_wait(), whose result is
 * returned.
 */
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
					unsigned long timeout)
{
	int ret = __rbd_obj_request_wait(obj_request, timeout);

	return ret;
}
160571c20a06SIlya Dryomov 
1606bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1607bf0d5f50SAlex Elder {
160855f27e09SAlex Elder 
160937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
161055f27e09SAlex Elder 
161155f27e09SAlex Elder 	/*
161255f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
161355f27e09SAlex Elder 	 * count for the image request.  We could instead use
161455f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
161555f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
161655f27e09SAlex Elder 	 */
161755f27e09SAlex Elder 	if (!img_request->result) {
161855f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
161955f27e09SAlex Elder 		u64 xferred = 0;
162055f27e09SAlex Elder 
162155f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
162255f27e09SAlex Elder 			xferred += obj_request->xferred;
162355f27e09SAlex Elder 		img_request->xferred = xferred;
162455f27e09SAlex Elder 	}
162555f27e09SAlex Elder 
1626bf0d5f50SAlex Elder 	if (img_request->callback)
1627bf0d5f50SAlex Elder 		img_request->callback(img_request);
1628bf0d5f50SAlex Elder 	else
1629bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1630bf0d5f50SAlex Elder }
1631bf0d5f50SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time.
 * The child and layered flags additionally have clear helpers below,
 * so those two may change again after initialization.
 */
16370c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16380c425248SAlex Elder {
16390c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16400c425248SAlex Elder 	smp_mb();
16410c425248SAlex Elder }
16420c425248SAlex Elder 
16430c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16440c425248SAlex Elder {
16450c425248SAlex Elder 	smp_mb();
16460c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16470c425248SAlex Elder }
16480c425248SAlex Elder 
164990e98c52SGuangliang Zhao /*
165090e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
165190e98c52SGuangliang Zhao  */
165290e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
165390e98c52SGuangliang Zhao {
165490e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
165590e98c52SGuangliang Zhao 	smp_mb();
165690e98c52SGuangliang Zhao }
165790e98c52SGuangliang Zhao 
165890e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
165990e98c52SGuangliang Zhao {
166090e98c52SGuangliang Zhao 	smp_mb();
166190e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
166290e98c52SGuangliang Zhao }
166390e98c52SGuangliang Zhao 
16649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16659849e986SAlex Elder {
16669849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16679849e986SAlex Elder 	smp_mb();
16689849e986SAlex Elder }
16699849e986SAlex Elder 
1670e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1671e93f3152SAlex Elder {
1672e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1673e93f3152SAlex Elder 	smp_mb();
1674e93f3152SAlex Elder }
1675e93f3152SAlex Elder 
16769849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16779849e986SAlex Elder {
16789849e986SAlex Elder 	smp_mb();
16799849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16809849e986SAlex Elder }
16819849e986SAlex Elder 
1682d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1683d0b2e944SAlex Elder {
1684d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1685d0b2e944SAlex Elder 	smp_mb();
1686d0b2e944SAlex Elder }
1687d0b2e944SAlex Elder 
1688a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1689a2acd00eSAlex Elder {
1690a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1691a2acd00eSAlex Elder 	smp_mb();
1692a2acd00eSAlex Elder }
1693a2acd00eSAlex Elder 
1694d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1695d0b2e944SAlex Elder {
1696d0b2e944SAlex Elder 	smp_mb();
1697d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1698d0b2e944SAlex Elder }
1699d0b2e944SAlex Elder 
17003b434a2aSJosh Durgin static enum obj_operation_type
17013b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17023b434a2aSJosh Durgin {
17033b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17043b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17053b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17063b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17073b434a2aSJosh Durgin 	else
17083b434a2aSJosh Durgin 		return OBJ_OP_READ;
17093b434a2aSJosh Durgin }
17103b434a2aSJosh Durgin 
17116e2a4505SAlex Elder static void
17126e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17136e2a4505SAlex Elder {
1714b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1715b9434c5bSAlex Elder 	u64 length = obj_request->length;
1716b9434c5bSAlex Elder 
17176e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17186e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1719b9434c5bSAlex Elder 		xferred, length);
17206e2a4505SAlex Elder 	/*
172117c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
172217c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
172317c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
172417c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
172517c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
172617c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17276e2a4505SAlex Elder 	 */
1728b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17296e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1730b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17316e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1732b9434c5bSAlex Elder 		else
1733b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17346e2a4505SAlex Elder 		obj_request->result = 0;
1735b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1736b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1737b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1738b9434c5bSAlex Elder 		else
1739b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17406e2a4505SAlex Elder 	}
174117c1cc1dSJosh Durgin 	obj_request->xferred = length;
17426e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17436e2a4505SAlex Elder }
17446e2a4505SAlex Elder 
1745bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1746bf0d5f50SAlex Elder {
174737206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
174837206ee5SAlex Elder 		obj_request->callback);
1749bf0d5f50SAlex Elder 	if (obj_request->callback)
1750bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1751788e2df3SAlex Elder 	else
1752788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1753bf0d5f50SAlex Elder }
1754bf0d5f50SAlex Elder 
/*
 * Completion handler for ops that need no post-processing (used by
 * rbd_osd_req_callback() for notify-ack and watch): just mark the
 * object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
176039bf2c5dSAlex Elder 
1761c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1762bf0d5f50SAlex Elder {
176357acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1764a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
176557acbaa7SAlex Elder 	bool layered = false;
176657acbaa7SAlex Elder 
176757acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
176857acbaa7SAlex Elder 		img_request = obj_request->img_request;
176957acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1770a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
177157acbaa7SAlex Elder 	}
17728b3e1a56SAlex Elder 
17738b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17748b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17758b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1776a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1777a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17788b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17798b3e1a56SAlex Elder 	else if (img_request)
17806e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17816e2a4505SAlex Elder 	else
178207741308SAlex Elder 		obj_request_done_set(obj_request);
1783bf0d5f50SAlex Elder }
1784bf0d5f50SAlex Elder 
1785c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1786bf0d5f50SAlex Elder {
17871b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
17881b83bef2SSage Weil 		obj_request->result, obj_request->length);
17891b83bef2SSage Weil 	/*
17908b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
17918b3e1a56SAlex Elder 	 * it to our originally-requested length.
17921b83bef2SSage Weil 	 */
17931b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
179407741308SAlex Elder 	obj_request_done_set(obj_request);
1795bf0d5f50SAlex Elder }
1796bf0d5f50SAlex Elder 
179790e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
179890e98c52SGuangliang Zhao {
179990e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
180090e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
180190e98c52SGuangliang Zhao 	/*
180290e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
180390e98c52SGuangliang Zhao 	 * it to our originally-requested length.
180490e98c52SGuangliang Zhao 	 */
180590e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1806d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1807d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1808d0265de7SJosh Durgin 		obj_request->result = 0;
180990e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
181090e98c52SGuangliang Zhao }
181190e98c52SGuangliang Zhao 
/*
 * Completion handler for an osd stat op.  Nothing needs to be
 * post-processed here, so the object request is simply marked done.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1821fbfab539SAlex Elder 
/*
 * Completion handler for an osd call op.  A request that carries
 * image data is passed to rbd_osd_copyup_callback(); any other
 * request is simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}
	rbd_osd_copyup_callback(obj_request);
}
18312761713dSIlya Dryomov 
1832bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1833bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1834bf0d5f50SAlex Elder {
1835bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1836bf0d5f50SAlex Elder 	u16 opcode;
1837bf0d5f50SAlex Elder 
183837206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1839bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
184057acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
184157acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
184257acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
184357acbaa7SAlex Elder 	} else {
184457acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
184557acbaa7SAlex Elder 	}
1846bf0d5f50SAlex Elder 
18471b83bef2SSage Weil 	if (osd_req->r_result < 0)
18481b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1849bf0d5f50SAlex Elder 
18507cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1851bf0d5f50SAlex Elder 
1852c47f9371SAlex Elder 	/*
1853c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18547ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18557ad18afaSChristoph Hellwig 	 * length field.
1856c47f9371SAlex Elder 	 */
18571b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1858c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18590ccd5926SIlya Dryomov 
186079528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1861bf0d5f50SAlex Elder 	switch (opcode) {
1862bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1863c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1864bf0d5f50SAlex Elder 		break;
18650ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1866e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1867e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
18680ccd5926SIlya Dryomov 		/* fall through */
1869bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1870e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1871c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1872bf0d5f50SAlex Elder 		break;
1873fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1874c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1875fbfab539SAlex Elder 		break;
187690e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
187790e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
187890e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
187990e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
188090e98c52SGuangliang Zhao 		break;
188136be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
18822761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
18832761713dSIlya Dryomov 		break;
1884b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
18859969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1886c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
18879969ebc5SAlex Elder 		break;
1888bf0d5f50SAlex Elder 	default:
18899584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1890bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1891bf0d5f50SAlex Elder 		break;
1892bf0d5f50SAlex Elder 	}
1893bf0d5f50SAlex Elder 
189407741308SAlex Elder 	if (obj_request_done_test(obj_request))
1895bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1896bf0d5f50SAlex Elder }
1897bf0d5f50SAlex Elder 
18989d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1899430c28c3SAlex Elder {
1900430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
19018c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19029d4df01fSAlex Elder 	u64 snap_id;
1903430c28c3SAlex Elder 
19048c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1905430c28c3SAlex Elder 
19069d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
19078c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
19089d4df01fSAlex Elder 			NULL, snap_id, NULL);
19099d4df01fSAlex Elder }
19109d4df01fSAlex Elder 
19119d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19129d4df01fSAlex Elder {
19139d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
19149d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19159d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
19169d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
19179d4df01fSAlex Elder 
19189d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
19199d4df01fSAlex Elder 
19209d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
19219d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
19229d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1923430c28c3SAlex Elder }
1924430c28c3SAlex Elder 
19250ccd5926SIlya Dryomov /*
19260ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19270ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19280ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19290ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19300ccd5926SIlya Dryomov  */
1931bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1932bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19336d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1934deb236b3SIlya Dryomov 					unsigned int num_ops,
1935430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1936bf0d5f50SAlex Elder {
1937bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1938bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1939bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1940bf0d5f50SAlex Elder 
194190e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
194290e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19436365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
194490e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19456d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
194690e98c52SGuangliang Zhao 		} else {
194790e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
194890e98c52SGuangliang Zhao 		}
1949bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1950bf0d5f50SAlex Elder 	}
1951bf0d5f50SAlex Elder 
19526d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1953deb236b3SIlya Dryomov 
1954deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1955bf0d5f50SAlex Elder 
1956bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1957deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1958deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1959bf0d5f50SAlex Elder 	if (!osd_req)
1960bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1961bf0d5f50SAlex Elder 
196290e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1963bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1964430c28c3SAlex Elder 	else
1965bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1966bf0d5f50SAlex Elder 
1967bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1968bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1969bf0d5f50SAlex Elder 
19703c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
19713c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1972bf0d5f50SAlex Elder 
1973bf0d5f50SAlex Elder 	return osd_req;
1974bf0d5f50SAlex Elder }
1975bf0d5f50SAlex Elder 
19760eefd470SAlex Elder /*
1977d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1978d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1979d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1980d3246fb0SJosh Durgin  * or zero op.
19810eefd470SAlex Elder  */
19820eefd470SAlex Elder static struct ceph_osd_request *
19830eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
19840eefd470SAlex Elder {
19850eefd470SAlex Elder 	struct rbd_img_request *img_request;
19860eefd470SAlex Elder 	struct ceph_snap_context *snapc;
19870eefd470SAlex Elder 	struct rbd_device *rbd_dev;
19880eefd470SAlex Elder 	struct ceph_osd_client *osdc;
19890eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
1990d3246fb0SJosh Durgin 	int num_osd_ops = 3;
19910eefd470SAlex Elder 
19920eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19930eefd470SAlex Elder 	img_request = obj_request->img_request;
19940eefd470SAlex Elder 	rbd_assert(img_request);
1995d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
1996d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
19970eefd470SAlex Elder 
1998d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
1999d3246fb0SJosh Durgin 		num_osd_ops = 2;
2000d3246fb0SJosh Durgin 
2001d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
20020eefd470SAlex Elder 
20030eefd470SAlex Elder 	snapc = img_request->snapc;
20040eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20050eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2006d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
2007d3246fb0SJosh Durgin 						false, GFP_ATOMIC);
20080eefd470SAlex Elder 	if (!osd_req)
20090eefd470SAlex Elder 		return NULL;	/* ENOMEM */
20100eefd470SAlex Elder 
20110eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
20120eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
20130eefd470SAlex Elder 	osd_req->r_priv = obj_request;
20140eefd470SAlex Elder 
20153c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
20163c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
20170eefd470SAlex Elder 
20180eefd470SAlex Elder 	return osd_req;
20190eefd470SAlex Elder }
20200eefd470SAlex Elder 
20210eefd470SAlex Elder 
/*
 * Release an osd request created for an object request by dropping a
 * reference on it with ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2026bf0d5f50SAlex Elder 
2027bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2028bf0d5f50SAlex Elder 
2029bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2030bf0d5f50SAlex Elder 						u64 offset, u64 length,
2031bf0d5f50SAlex Elder 						enum obj_request_type type)
2032bf0d5f50SAlex Elder {
2033bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2034bf0d5f50SAlex Elder 	size_t size;
2035bf0d5f50SAlex Elder 	char *name;
2036bf0d5f50SAlex Elder 
2037bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2038bf0d5f50SAlex Elder 
2039bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
20405a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2041f907ad55SAlex Elder 	if (!name)
2042bf0d5f50SAlex Elder 		return NULL;
2043bf0d5f50SAlex Elder 
20445a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2045f907ad55SAlex Elder 	if (!obj_request) {
2046f907ad55SAlex Elder 		kfree(name);
2047f907ad55SAlex Elder 		return NULL;
2048f907ad55SAlex Elder 	}
2049f907ad55SAlex Elder 
2050bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2051bf0d5f50SAlex Elder 	obj_request->offset = offset;
2052bf0d5f50SAlex Elder 	obj_request->length = length;
2053926f9b3fSAlex Elder 	obj_request->flags = 0;
2054bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2055bf0d5f50SAlex Elder 	obj_request->type = type;
2056bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2057788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2058bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2059bf0d5f50SAlex Elder 
206037206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
206137206ee5SAlex Elder 		offset, length, (int)type, obj_request);
206237206ee5SAlex Elder 
2063bf0d5f50SAlex Elder 	return obj_request;
2064bf0d5f50SAlex Elder }
2065bf0d5f50SAlex Elder 
2066bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2067bf0d5f50SAlex Elder {
2068bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2069bf0d5f50SAlex Elder 
2070bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2071bf0d5f50SAlex Elder 
207237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
207337206ee5SAlex Elder 
2074bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2075bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2076bf0d5f50SAlex Elder 
2077bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2078bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2079bf0d5f50SAlex Elder 
2080bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2081bf0d5f50SAlex Elder 	switch (obj_request->type) {
20829969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20839969ebc5SAlex Elder 		break;		/* Nothing to do */
2084bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2085bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2086bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2087bf0d5f50SAlex Elder 		break;
2088788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
2089788e2df3SAlex Elder 		if (obj_request->pages)
2090788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2091788e2df3SAlex Elder 						obj_request->page_count);
2092788e2df3SAlex Elder 		break;
2093bf0d5f50SAlex Elder 	}
2094bf0d5f50SAlex Elder 
2095f907ad55SAlex Elder 	kfree(obj_request->object_name);
2096868311b1SAlex Elder 	obj_request->object_name = NULL;
2097868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2098bf0d5f50SAlex Elder }
2099bf0d5f50SAlex Elder 
2100fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2101fb65d228SAlex Elder 
2102fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2103fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2104fb65d228SAlex Elder {
2105fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2106fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2107fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2108fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2109fb65d228SAlex Elder }
2110fb65d228SAlex Elder 
2111bf0d5f50SAlex Elder /*
2112a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2113a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2114a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2115a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2116a2acd00eSAlex Elder  */
2117a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2118a2acd00eSAlex Elder {
2119a2acd00eSAlex Elder 	int counter;
2120a2acd00eSAlex Elder 
2121a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2122a2acd00eSAlex Elder 		return;
2123a2acd00eSAlex Elder 
2124a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2125a2acd00eSAlex Elder 	if (counter > 0)
2126a2acd00eSAlex Elder 		return;
2127a2acd00eSAlex Elder 
2128a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2129a2acd00eSAlex Elder 
2130a2acd00eSAlex Elder 	if (!counter)
2131a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2132a2acd00eSAlex Elder 	else
21339584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2134a2acd00eSAlex Elder }
2135a2acd00eSAlex Elder 
2136a2acd00eSAlex Elder /*
2137a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2138a2acd00eSAlex Elder  * parent.
2139a2acd00eSAlex Elder  *
2140a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2141a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2142a2acd00eSAlex Elder  * false otherwise.
2143a2acd00eSAlex Elder  */
2144a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2145a2acd00eSAlex Elder {
2146ae43e9d0SIlya Dryomov 	int counter = 0;
2147a2acd00eSAlex Elder 
2148a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2149a2acd00eSAlex Elder 		return false;
2150a2acd00eSAlex Elder 
2151ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2152ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2153a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2154ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2155a2acd00eSAlex Elder 
2156a2acd00eSAlex Elder 	if (counter < 0)
21579584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2158a2acd00eSAlex Elder 
2159ae43e9d0SIlya Dryomov 	return counter > 0;
2160a2acd00eSAlex Elder }
2161a2acd00eSAlex Elder 
2162bf0d5f50SAlex Elder /*
2163bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2164bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2165bf0d5f50SAlex Elder  * (if there is one).
2166bf0d5f50SAlex Elder  */
2167cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2168cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2169bf0d5f50SAlex Elder 					u64 offset, u64 length,
21706d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21714e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2172bf0d5f50SAlex Elder {
2173bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2174bf0d5f50SAlex Elder 
21757a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2176bf0d5f50SAlex Elder 	if (!img_request)
2177bf0d5f50SAlex Elder 		return NULL;
2178bf0d5f50SAlex Elder 
2179bf0d5f50SAlex Elder 	img_request->rq = NULL;
2180bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2181bf0d5f50SAlex Elder 	img_request->offset = offset;
2182bf0d5f50SAlex Elder 	img_request->length = length;
21830c425248SAlex Elder 	img_request->flags = 0;
218490e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
218590e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
218690e98c52SGuangliang Zhao 		img_request->snapc = snapc;
218790e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21880c425248SAlex Elder 		img_request_write_set(img_request);
21894e752f0aSJosh Durgin 		img_request->snapc = snapc;
21900c425248SAlex Elder 	} else {
2191bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21920c425248SAlex Elder 	}
2193a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2194d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2195bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2196bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2197bf0d5f50SAlex Elder 	img_request->callback = NULL;
2198a5a337d4SAlex Elder 	img_request->result = 0;
2199bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2200bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2201bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2202bf0d5f50SAlex Elder 
220337206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
22046d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
220537206ee5SAlex Elder 
2206bf0d5f50SAlex Elder 	return img_request;
2207bf0d5f50SAlex Elder }
2208bf0d5f50SAlex Elder 
2209bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2210bf0d5f50SAlex Elder {
2211bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2212bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2213bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2214bf0d5f50SAlex Elder 
2215bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2216bf0d5f50SAlex Elder 
221737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
221837206ee5SAlex Elder 
2219bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2220bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
222125dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2222bf0d5f50SAlex Elder 
2223a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2224a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2225a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2226a2acd00eSAlex Elder 	}
2227a2acd00eSAlex Elder 
2228bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2229bef95455SJosh Durgin 		img_request_discard_test(img_request))
2230812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2231bf0d5f50SAlex Elder 
22321c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2233bf0d5f50SAlex Elder }
2234bf0d5f50SAlex Elder 
2235e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2236e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2237e93f3152SAlex Elder 					u64 img_offset, u64 length)
2238e93f3152SAlex Elder {
2239e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2240e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2241e93f3152SAlex Elder 
2242e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2243e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2244e93f3152SAlex Elder 
22454e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22466d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2247e93f3152SAlex Elder 	if (!parent_request)
2248e93f3152SAlex Elder 		return NULL;
2249e93f3152SAlex Elder 
2250e93f3152SAlex Elder 	img_request_child_set(parent_request);
2251e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2252e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2253e93f3152SAlex Elder 
2254e93f3152SAlex Elder 	return parent_request;
2255e93f3152SAlex Elder }
2256e93f3152SAlex Elder 
2257e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2258e93f3152SAlex Elder {
2259e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2260e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2261e93f3152SAlex Elder 
2262e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2263e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2264e93f3152SAlex Elder 
2265e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2266e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2267e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2268e93f3152SAlex Elder 
2269e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2270e93f3152SAlex Elder }
2271e93f3152SAlex Elder 
22721217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22731217857fSAlex Elder {
22746365d33aSAlex Elder 	struct rbd_img_request *img_request;
22751217857fSAlex Elder 	unsigned int xferred;
22761217857fSAlex Elder 	int result;
22778b3e1a56SAlex Elder 	bool more;
22781217857fSAlex Elder 
22796365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22806365d33aSAlex Elder 	img_request = obj_request->img_request;
22816365d33aSAlex Elder 
22821217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22831217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22841217857fSAlex Elder 	result = obj_request->result;
22851217857fSAlex Elder 	if (result) {
22861217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22876d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22886d2940c8SGuangliang Zhao 
228990e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
229090e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
229190e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
229290e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
229390e98c52SGuangliang Zhao 		else
229490e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22951217857fSAlex Elder 
22969584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22976d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
22986d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
22999584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
23001217857fSAlex Elder 			result, xferred);
23011217857fSAlex Elder 		if (!img_request->result)
23021217857fSAlex Elder 			img_request->result = result;
2303082a75daSIlya Dryomov 		/*
2304082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2305082a75daSIlya Dryomov 		 * bytes in case of error.
2306082a75daSIlya Dryomov 		 */
2307082a75daSIlya Dryomov 		xferred = obj_request->length;
23081217857fSAlex Elder 	}
23091217857fSAlex Elder 
2310f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2311f1a4739fSAlex Elder 
2312f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2313f1a4739fSAlex Elder 		obj_request->pages = NULL;
2314f1a4739fSAlex Elder 		obj_request->page_count = 0;
2315f1a4739fSAlex Elder 	}
2316f1a4739fSAlex Elder 
23178b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23188b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23198b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23208b3e1a56SAlex Elder 	} else {
23218b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23227ad18afaSChristoph Hellwig 
23237ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23247ad18afaSChristoph Hellwig 		if (!more)
23257ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23268b3e1a56SAlex Elder 	}
23278b3e1a56SAlex Elder 
23288b3e1a56SAlex Elder 	return more;
23291217857fSAlex Elder }
23301217857fSAlex Elder 
23312169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23322169238dSAlex Elder {
23332169238dSAlex Elder 	struct rbd_img_request *img_request;
23342169238dSAlex Elder 	u32 which = obj_request->which;
23352169238dSAlex Elder 	bool more = true;
23362169238dSAlex Elder 
23376365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23382169238dSAlex Elder 	img_request = obj_request->img_request;
23392169238dSAlex Elder 
23402169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23412169238dSAlex Elder 	rbd_assert(img_request != NULL);
23422169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23432169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23442169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23452169238dSAlex Elder 
23462169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23472169238dSAlex Elder 	if (which != img_request->next_completion)
23482169238dSAlex Elder 		goto out;
23492169238dSAlex Elder 
23502169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23512169238dSAlex Elder 		rbd_assert(more);
23522169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23532169238dSAlex Elder 
23542169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23552169238dSAlex Elder 			break;
23561217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23572169238dSAlex Elder 		which++;
23582169238dSAlex Elder 	}
23592169238dSAlex Elder 
23602169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23612169238dSAlex Elder 	img_request->next_completion = which;
23622169238dSAlex Elder out:
23632169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23640f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23652169238dSAlex Elder 
23662169238dSAlex Elder 	if (!more)
23672169238dSAlex Elder 		rbd_img_request_complete(img_request);
23682169238dSAlex Elder }
23692169238dSAlex Elder 
2370f1a4739fSAlex Elder /*
23713b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23723b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23733b434a2aSJosh Durgin  * osd operations already to the object request.
23743b434a2aSJosh Durgin  */
23753b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23763b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23773b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23783b434a2aSJosh Durgin 				unsigned int num_ops)
23793b434a2aSJosh Durgin {
23803b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23813b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23823b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23833b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23843b434a2aSJosh Durgin 	u64 length = obj_request->length;
23853b434a2aSJosh Durgin 	u64 img_end;
23863b434a2aSJosh Durgin 	u16 opcode;
23873b434a2aSJosh Durgin 
23883b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2389d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2390d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2391d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23923b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23933b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23943b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23953b434a2aSJosh Durgin 		} else {
23963b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23973b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
23983b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
23993b434a2aSJosh Durgin 
24003b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
24013b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
24023b434a2aSJosh Durgin 			else
24033b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
24043b434a2aSJosh Durgin 		}
24053b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2406e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2407e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2408e30b7577SIlya Dryomov 		else
24093b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
24103b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
24113b434a2aSJosh Durgin 					object_size, object_size);
24123b434a2aSJosh Durgin 		num_ops++;
24133b434a2aSJosh Durgin 	} else {
24143b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24153b434a2aSJosh Durgin 	}
24163b434a2aSJosh Durgin 
24177e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2418144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24197e868b6eSIlya Dryomov 	else
24207e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24217e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24227e868b6eSIlya Dryomov 
24233b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24243b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24253b434a2aSJosh Durgin 					obj_request->bio_list, length);
24263b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24273b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24283b434a2aSJosh Durgin 					obj_request->pages, length,
24293b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24303b434a2aSJosh Durgin 
24313b434a2aSJosh Durgin 	/* Discards are also writes */
24323b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24333b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24343b434a2aSJosh Durgin 	else
24353b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24363b434a2aSJosh Durgin }
24373b434a2aSJosh Durgin 
24383b434a2aSJosh Durgin /*
2439f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2440f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2441f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2442f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2443f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2444f1a4739fSAlex Elder  * all data described by the image request.
2445f1a4739fSAlex Elder  */
2446f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2447f1a4739fSAlex Elder 					enum obj_request_type type,
2448f1a4739fSAlex Elder 					void *data_desc)
2449bf0d5f50SAlex Elder {
2450bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2451bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2452bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2453a158073cSJingoo Han 	struct bio *bio_list = NULL;
2454f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2455a158073cSJingoo Han 	struct page **pages = NULL;
24566d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24577da22d29SAlex Elder 	u64 img_offset;
2458bf0d5f50SAlex Elder 	u64 resid;
2459bf0d5f50SAlex Elder 
2460f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2461f1a4739fSAlex Elder 		(int)type, data_desc);
246237206ee5SAlex Elder 
24637da22d29SAlex Elder 	img_offset = img_request->offset;
2464bf0d5f50SAlex Elder 	resid = img_request->length;
24654dda41d3SAlex Elder 	rbd_assert(resid > 0);
24663b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2467f1a4739fSAlex Elder 
2468f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2469f1a4739fSAlex Elder 		bio_list = data_desc;
24704f024f37SKent Overstreet 		rbd_assert(img_offset ==
24714f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
247290e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2473f1a4739fSAlex Elder 		pages = data_desc;
2474f1a4739fSAlex Elder 	}
2475f1a4739fSAlex Elder 
2476bf0d5f50SAlex Elder 	while (resid) {
24772fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2478bf0d5f50SAlex Elder 		const char *object_name;
2479bf0d5f50SAlex Elder 		u64 offset;
2480bf0d5f50SAlex Elder 		u64 length;
2481bf0d5f50SAlex Elder 
24827da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2483bf0d5f50SAlex Elder 		if (!object_name)
2484bf0d5f50SAlex Elder 			goto out_unwind;
24857da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
24867da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2487bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2488f1a4739fSAlex Elder 						offset, length, type);
248978c2a44aSAlex Elder 		/* object request has its own copy of the object name */
249078c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2491bf0d5f50SAlex Elder 		if (!obj_request)
2492bf0d5f50SAlex Elder 			goto out_unwind;
249362054da6SIlya Dryomov 
249403507db6SJosh Durgin 		/*
249503507db6SJosh Durgin 		 * set obj_request->img_request before creating the
249603507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
249703507db6SJosh Durgin 		 */
249803507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2499bf0d5f50SAlex Elder 
2500f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2501f1a4739fSAlex Elder 			unsigned int clone_size;
2502f1a4739fSAlex Elder 
2503bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2504bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2505f1a4739fSAlex Elder 			obj_request->bio_list =
2506f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2507f1a4739fSAlex Elder 								&bio_offset,
2508f1a4739fSAlex Elder 								clone_size,
2509bf0d5f50SAlex Elder 								GFP_ATOMIC);
2510bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
251162054da6SIlya Dryomov 				goto out_unwind;
251290e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2513f1a4739fSAlex Elder 			unsigned int page_count;
2514f1a4739fSAlex Elder 
2515f1a4739fSAlex Elder 			obj_request->pages = pages;
2516f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2517f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2518f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2519f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2520f1a4739fSAlex Elder 			pages += page_count;
2521f1a4739fSAlex Elder 		}
2522bf0d5f50SAlex Elder 
25236d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25246d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25252fa12320SAlex Elder 					obj_request);
25262fa12320SAlex Elder 		if (!osd_req)
252762054da6SIlya Dryomov 			goto out_unwind;
25283b434a2aSJosh Durgin 
25292fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25302169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25317da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2532bf0d5f50SAlex Elder 
25333b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25343b434a2aSJosh Durgin 
25353b434a2aSJosh Durgin 		rbd_img_request_get(img_request);
25363b434a2aSJosh Durgin 
25377da22d29SAlex Elder 		img_offset += length;
2538bf0d5f50SAlex Elder 		resid -= length;
2539bf0d5f50SAlex Elder 	}
2540bf0d5f50SAlex Elder 
2541bf0d5f50SAlex Elder 	return 0;
2542bf0d5f50SAlex Elder 
2543bf0d5f50SAlex Elder out_unwind:
2544bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
254542dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2546bf0d5f50SAlex Elder 
2547bf0d5f50SAlex Elder 	return -ENOMEM;
2548bf0d5f50SAlex Elder }
2549bf0d5f50SAlex Elder 
25503d7efd18SAlex Elder static void
25512761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25520eefd470SAlex Elder {
25530eefd470SAlex Elder 	struct rbd_img_request *img_request;
25540eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2555ebda6408SAlex Elder 	struct page **pages;
25560eefd470SAlex Elder 	u32 page_count;
25570eefd470SAlex Elder 
25582761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25592761713dSIlya Dryomov 
2560d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2561d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25620eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25630eefd470SAlex Elder 	img_request = obj_request->img_request;
25640eefd470SAlex Elder 	rbd_assert(img_request);
25650eefd470SAlex Elder 
25660eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25670eefd470SAlex Elder 	rbd_assert(rbd_dev);
25680eefd470SAlex Elder 
2569ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2570ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25710eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2572ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2573ebda6408SAlex Elder 	rbd_assert(page_count);
2574ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2575ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25760eefd470SAlex Elder 
25770eefd470SAlex Elder 	/*
25780eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25790eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25800eefd470SAlex Elder 	 * successful short write, so if the request was successful
25810eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25820eefd470SAlex Elder 	 */
25830eefd470SAlex Elder 	if (!obj_request->result)
25840eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25850eefd470SAlex Elder 
25862761713dSIlya Dryomov 	obj_request_done_set(obj_request);
25870eefd470SAlex Elder }
25880eefd470SAlex Elder 
25890eefd470SAlex Elder static void
25903d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25913d7efd18SAlex Elder {
25923d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25930eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25940eefd470SAlex Elder 	struct ceph_osd_client *osdc;
25950eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25963d7efd18SAlex Elder 	struct page **pages;
2597d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2598ebda6408SAlex Elder 	u32 page_count;
2599bbea1c1aSAlex Elder 	int img_result;
2600ebda6408SAlex Elder 	u64 parent_length;
26013d7efd18SAlex Elder 
26023d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
26033d7efd18SAlex Elder 
26043d7efd18SAlex Elder 	/* First get what we need from the image request */
26053d7efd18SAlex Elder 
26063d7efd18SAlex Elder 	pages = img_request->copyup_pages;
26073d7efd18SAlex Elder 	rbd_assert(pages != NULL);
26083d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2609ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2610ebda6408SAlex Elder 	rbd_assert(page_count);
2611ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
26123d7efd18SAlex Elder 
26133d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26143d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2615b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2616bbea1c1aSAlex Elder 	img_result = img_request->result;
2617ebda6408SAlex Elder 	parent_length = img_request->length;
2618ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
26193d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26203d7efd18SAlex Elder 
262191c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
262291c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26233d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26243d7efd18SAlex Elder 
2625bbea1c1aSAlex Elder 	/*
2626bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2627bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2628bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2629bbea1c1aSAlex Elder 	 */
2630bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2631bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2632bbea1c1aSAlex Elder 
2633bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2634bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2635bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2636bbea1c1aSAlex Elder 		if (!img_result)
2637bbea1c1aSAlex Elder 			return;
2638bbea1c1aSAlex Elder 	}
2639bbea1c1aSAlex Elder 
2640bbea1c1aSAlex Elder 	if (img_result)
26410eefd470SAlex Elder 		goto out_err;
26423d7efd18SAlex Elder 
26438785b1d4SAlex Elder 	/*
26448785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26450ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26468785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26478785b1d4SAlex Elder 	 * original request, and release the old one.
26488785b1d4SAlex Elder 	 */
2649bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26500eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26510eefd470SAlex Elder 	if (!osd_req)
26520eefd470SAlex Elder 		goto out_err;
26538785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26540eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26550eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2656ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26573d7efd18SAlex Elder 
26580eefd470SAlex Elder 	/* Initialize the copyup op */
26590eefd470SAlex Elder 
26600eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2661ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26620eefd470SAlex Elder 						false, false);
26630eefd470SAlex Elder 
2664d3246fb0SJosh Durgin 	/* Add the other op(s) */
26650ccd5926SIlya Dryomov 
2666d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2667d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26680eefd470SAlex Elder 
26690eefd470SAlex Elder 	/* All set, send it off. */
26700eefd470SAlex Elder 
26710eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2672bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2673bbea1c1aSAlex Elder 	if (!img_result)
26740eefd470SAlex Elder 		return;
26750eefd470SAlex Elder out_err:
26760eefd470SAlex Elder 	/* Record the error code and complete the request */
26770eefd470SAlex Elder 
2678bbea1c1aSAlex Elder 	orig_request->result = img_result;
26790eefd470SAlex Elder 	orig_request->xferred = 0;
26803d7efd18SAlex Elder 	obj_request_done_set(orig_request);
26813d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
26823d7efd18SAlex Elder }
26833d7efd18SAlex Elder 
26843d7efd18SAlex Elder /*
26853d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26863d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26873d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26883d7efd18SAlex Elder  * object request from the image request does not exist.
26893d7efd18SAlex Elder  *
26903d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26913d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26923d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26933d7efd18SAlex Elder  * the original object request for the copyup operation.
26943d7efd18SAlex Elder  *
26953d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
26963d7efd18SAlex Elder  * object request and mark it done so it gets completed.
26973d7efd18SAlex Elder  */
26983d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
26993d7efd18SAlex Elder {
27003d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
27013d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
27023d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
27033d7efd18SAlex Elder 	u64 img_offset;
27043d7efd18SAlex Elder 	u64 length;
27053d7efd18SAlex Elder 	struct page **pages = NULL;
27063d7efd18SAlex Elder 	u32 page_count;
27073d7efd18SAlex Elder 	int result;
27083d7efd18SAlex Elder 
27093d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2710b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27113d7efd18SAlex Elder 
27123d7efd18SAlex Elder 	img_request = obj_request->img_request;
27133d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
27143d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
27153d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27163d7efd18SAlex Elder 
27173d7efd18SAlex Elder 	/*
27183d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27193d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27203d7efd18SAlex Elder 	 */
27213d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27223d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27233d7efd18SAlex Elder 
27243d7efd18SAlex Elder 	/*
2725a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2726a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2727a9e8ba2cSAlex Elder 	 * necessary.
2728a9e8ba2cSAlex Elder 	 */
2729a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2730a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2731a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2732a9e8ba2cSAlex Elder 	}
2733a9e8ba2cSAlex Elder 
2734a9e8ba2cSAlex Elder 	/*
27353d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27363d7efd18SAlex Elder 	 * from the parent.
27373d7efd18SAlex Elder 	 */
27383d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27393d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27403d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27413d7efd18SAlex Elder 		result = PTR_ERR(pages);
27423d7efd18SAlex Elder 		pages = NULL;
27433d7efd18SAlex Elder 		goto out_err;
27443d7efd18SAlex Elder 	}
27453d7efd18SAlex Elder 
27463d7efd18SAlex Elder 	result = -ENOMEM;
2747e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2748e93f3152SAlex Elder 						img_offset, length);
27493d7efd18SAlex Elder 	if (!parent_request)
27503d7efd18SAlex Elder 		goto out_err;
27513d7efd18SAlex Elder 
27523d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27533d7efd18SAlex Elder 	if (result)
27543d7efd18SAlex Elder 		goto out_err;
27553d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2756ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27573d7efd18SAlex Elder 
27583d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
27593d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27603d7efd18SAlex Elder 	if (!result)
27613d7efd18SAlex Elder 		return 0;
27623d7efd18SAlex Elder 
27633d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2764ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27653d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27663d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27673d7efd18SAlex Elder out_err:
27683d7efd18SAlex Elder 	if (pages)
27693d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27703d7efd18SAlex Elder 	if (parent_request)
27713d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27723d7efd18SAlex Elder 	obj_request->result = result;
27733d7efd18SAlex Elder 	obj_request->xferred = 0;
27743d7efd18SAlex Elder 	obj_request_done_set(obj_request);
27753d7efd18SAlex Elder 
27763d7efd18SAlex Elder 	return result;
27773d7efd18SAlex Elder }
27783d7efd18SAlex Elder 
2779c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2780c5b5ef6cSAlex Elder {
2781c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2782638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2783c5b5ef6cSAlex Elder 	int result;
2784c5b5ef6cSAlex Elder 
2785c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2786c5b5ef6cSAlex Elder 
2787c5b5ef6cSAlex Elder 	/*
2788c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2789c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2790c5b5ef6cSAlex Elder 	 * we're done with the request.
2791c5b5ef6cSAlex Elder 	 */
2792c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2793c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2794912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2795c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2796c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2797c5b5ef6cSAlex Elder 
2798c5b5ef6cSAlex Elder 	result = obj_request->result;
2799c5b5ef6cSAlex Elder 	obj_request->result = 0;
2800c5b5ef6cSAlex Elder 
2801c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2802c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2803c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2804c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2805c5b5ef6cSAlex Elder 
2806638f5abeSAlex Elder 	/*
2807638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2808638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2809638f5abeSAlex Elder 	 * and re-submit the original write request.
2810638f5abeSAlex Elder 	 */
2811638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2812638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2813638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2814638f5abeSAlex Elder 
2815638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2816638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2817638f5abeSAlex Elder 		if (!result)
2818638f5abeSAlex Elder 			return;
2819638f5abeSAlex Elder 	}
2820c5b5ef6cSAlex Elder 
2821c5b5ef6cSAlex Elder 	/*
2822c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2823c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2824c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2825c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2826c5b5ef6cSAlex Elder 	 */
2827c5b5ef6cSAlex Elder 	if (!result) {
2828c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2829c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2830c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2831c5b5ef6cSAlex Elder 	} else if (result) {
2832c5b5ef6cSAlex Elder 		orig_request->result = result;
28333d7efd18SAlex Elder 		goto out;
2834c5b5ef6cSAlex Elder 	}
2835c5b5ef6cSAlex Elder 
2836c5b5ef6cSAlex Elder 	/*
2837c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2838c5b5ef6cSAlex Elder 	 * whether the target object exists.
2839c5b5ef6cSAlex Elder 	 */
2840b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
28413d7efd18SAlex Elder out:
2842c5b5ef6cSAlex Elder 	if (orig_request->result)
2843c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2844c5b5ef6cSAlex Elder }
2845c5b5ef6cSAlex Elder 
2846c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2847c5b5ef6cSAlex Elder {
2848c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2849c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2850c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2851c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2852c5b5ef6cSAlex Elder 	u32 page_count;
2853c5b5ef6cSAlex Elder 	size_t size;
2854c5b5ef6cSAlex Elder 	int ret;
2855c5b5ef6cSAlex Elder 
2856c5b5ef6cSAlex Elder 	/*
2857c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2858c5b5ef6cSAlex Elder 	 *     le64 length;
2859c5b5ef6cSAlex Elder 	 *     struct {
2860c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2861c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2862c5b5ef6cSAlex Elder 	 *     } mtime;
2863c5b5ef6cSAlex Elder 	 */
2864c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2865c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2866c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2867c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2868c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2869c5b5ef6cSAlex Elder 
2870c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2871c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2872c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2873c5b5ef6cSAlex Elder 	if (!stat_request)
2874c5b5ef6cSAlex Elder 		goto out;
2875c5b5ef6cSAlex Elder 
2876c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2877c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2878c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2879c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2880c5b5ef6cSAlex Elder 
2881c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2882c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
28836d2940c8SGuangliang Zhao 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2884c5b5ef6cSAlex Elder 						   stat_request);
2885c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2886c5b5ef6cSAlex Elder 		goto out;
2887c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2888c5b5ef6cSAlex Elder 
2889144cba14SYan, Zheng 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2890c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2891c5b5ef6cSAlex Elder 					false, false);
28929d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2893c5b5ef6cSAlex Elder 
2894c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2895c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2896c5b5ef6cSAlex Elder out:
2897c5b5ef6cSAlex Elder 	if (ret)
2898c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2899c5b5ef6cSAlex Elder 
2900c5b5ef6cSAlex Elder 	return ret;
2901c5b5ef6cSAlex Elder }
2902c5b5ef6cSAlex Elder 
290370d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2904b454e36dSAlex Elder {
2905b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2906a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2907b454e36dSAlex Elder 
2908b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2909b454e36dSAlex Elder 
2910b454e36dSAlex Elder 	img_request = obj_request->img_request;
2911b454e36dSAlex Elder 	rbd_assert(img_request);
2912a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2913b454e36dSAlex Elder 
291470d045f6SIlya Dryomov 	/* Reads */
29151c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29161c220881SJosh Durgin 	    !img_request_discard_test(img_request))
291770d045f6SIlya Dryomov 		return true;
2918b454e36dSAlex Elder 
291970d045f6SIlya Dryomov 	/* Non-layered writes */
292070d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
292170d045f6SIlya Dryomov 		return true;
292270d045f6SIlya Dryomov 
292370d045f6SIlya Dryomov 	/*
292470d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
292570d045f6SIlya Dryomov 	 * share any data with the parent.
292670d045f6SIlya Dryomov 	 */
292770d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
292870d045f6SIlya Dryomov 		return true;
292970d045f6SIlya Dryomov 
293070d045f6SIlya Dryomov 	/*
2931c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2932c622d226SGuangliang Zhao 	 * parent data there is anyway.
2933c622d226SGuangliang Zhao 	 */
2934c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2935c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2936c622d226SGuangliang Zhao 		return true;
2937c622d226SGuangliang Zhao 
2938c622d226SGuangliang Zhao 	/*
293970d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
294070d045f6SIlya Dryomov 	 * already been copied.
294170d045f6SIlya Dryomov 	 */
294270d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
294370d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
294470d045f6SIlya Dryomov 		return true;
294570d045f6SIlya Dryomov 
294670d045f6SIlya Dryomov 	return false;
294770d045f6SIlya Dryomov }
294870d045f6SIlya Dryomov 
294970d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
295070d045f6SIlya Dryomov {
295170d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2952b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2953b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2954b454e36dSAlex Elder 
2955b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2956b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2957b454e36dSAlex Elder 
2958b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2959b454e36dSAlex Elder 	}
2960b454e36dSAlex Elder 
2961b454e36dSAlex Elder 	/*
29623d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29633d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29643d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29653d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2966b454e36dSAlex Elder 	 */
296770d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29683d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29693d7efd18SAlex Elder 
29703d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2971b454e36dSAlex Elder 
2972b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2973b454e36dSAlex Elder }
2974b454e36dSAlex Elder 
2975bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2976bf0d5f50SAlex Elder {
2977bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
297846faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2979bf0d5f50SAlex Elder 
298037206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
298146faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2982bf0d5f50SAlex Elder 		int ret;
2983bf0d5f50SAlex Elder 
2984b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2985bf0d5f50SAlex Elder 		if (ret)
2986bf0d5f50SAlex Elder 			return ret;
2987bf0d5f50SAlex Elder 	}
2988bf0d5f50SAlex Elder 
2989bf0d5f50SAlex Elder 	return 0;
2990bf0d5f50SAlex Elder }
2991bf0d5f50SAlex Elder 
29928b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29938b3e1a56SAlex Elder {
29948b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2995a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2996a9e8ba2cSAlex Elder 	u64 obj_end;
299702c74fbaSAlex Elder 	u64 img_xferred;
299802c74fbaSAlex Elder 	int img_result;
29998b3e1a56SAlex Elder 
30008b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
30018b3e1a56SAlex Elder 
300202c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
300302c74fbaSAlex Elder 
30048b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
300502c74fbaSAlex Elder 	img_xferred = img_request->xferred;
300602c74fbaSAlex Elder 	img_result = img_request->result;
300702c74fbaSAlex Elder 	rbd_img_request_put(img_request);
300802c74fbaSAlex Elder 
300902c74fbaSAlex Elder 	/*
301002c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
301102c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
301202c74fbaSAlex Elder 	 * original request.
301302c74fbaSAlex Elder 	 */
3014a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3015a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
301602c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
301702c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
301802c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
30198b3e1a56SAlex Elder 
302002c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
302102c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
302202c74fbaSAlex Elder 		if (!img_result)
302302c74fbaSAlex Elder 			return;
302402c74fbaSAlex Elder 	}
302502c74fbaSAlex Elder 
302602c74fbaSAlex Elder 	obj_request->result = img_result;
3027a9e8ba2cSAlex Elder 	if (obj_request->result)
3028a9e8ba2cSAlex Elder 		goto out;
3029a9e8ba2cSAlex Elder 
3030a9e8ba2cSAlex Elder 	/*
3031a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3032a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3033a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3034a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3035a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3036a9e8ba2cSAlex Elder 	 */
3037a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3038a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3039a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3040a9e8ba2cSAlex Elder 		u64 xferred = 0;
3041a9e8ba2cSAlex Elder 
3042a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3043a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3044a9e8ba2cSAlex Elder 					obj_request->img_offset;
3045a9e8ba2cSAlex Elder 
304602c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3047a9e8ba2cSAlex Elder 	} else {
304802c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3049a9e8ba2cSAlex Elder 	}
3050a9e8ba2cSAlex Elder out:
30518b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30528b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30538b3e1a56SAlex Elder }
30548b3e1a56SAlex Elder 
30558b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30568b3e1a56SAlex Elder {
30578b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30588b3e1a56SAlex Elder 	int result;
30598b3e1a56SAlex Elder 
30608b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30618b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30628b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30635b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30648b3e1a56SAlex Elder 
30658b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3066e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30678b3e1a56SAlex Elder 						obj_request->img_offset,
3068e93f3152SAlex Elder 						obj_request->length);
30698b3e1a56SAlex Elder 	result = -ENOMEM;
30708b3e1a56SAlex Elder 	if (!img_request)
30718b3e1a56SAlex Elder 		goto out_err;
30728b3e1a56SAlex Elder 
30735b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3074f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3075f1a4739fSAlex Elder 						obj_request->bio_list);
30765b2ab72dSAlex Elder 	else
30775b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30785b2ab72dSAlex Elder 						obj_request->pages);
30798b3e1a56SAlex Elder 	if (result)
30808b3e1a56SAlex Elder 		goto out_err;
30818b3e1a56SAlex Elder 
30828b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30838b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30848b3e1a56SAlex Elder 	if (result)
30858b3e1a56SAlex Elder 		goto out_err;
30868b3e1a56SAlex Elder 
30878b3e1a56SAlex Elder 	return;
30888b3e1a56SAlex Elder out_err:
30898b3e1a56SAlex Elder 	if (img_request)
30908b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30918b3e1a56SAlex Elder 	obj_request->result = result;
30928b3e1a56SAlex Elder 	obj_request->xferred = 0;
30938b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30948b3e1a56SAlex Elder }
30958b3e1a56SAlex Elder 
309620e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
3097b8d70035SAlex Elder {
3098b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
30992169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3100b8d70035SAlex Elder 	int ret;
3101b8d70035SAlex Elder 
3102b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3103b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
3104b8d70035SAlex Elder 	if (!obj_request)
3105b8d70035SAlex Elder 		return -ENOMEM;
3106b8d70035SAlex Elder 
3107b8d70035SAlex Elder 	ret = -ENOMEM;
31086d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3109deb236b3SIlya Dryomov 						  obj_request);
3110b8d70035SAlex Elder 	if (!obj_request->osd_req)
3111b8d70035SAlex Elder 		goto out;
3112b8d70035SAlex Elder 
3113c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
3114cc4a38bdSAlex Elder 					notify_id, 0, 0);
31159d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3116430c28c3SAlex Elder 
3117b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3118cf81b60eSAlex Elder 	if (ret)
311920e0af67SJosh Durgin 		goto out;
312020e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
312120e0af67SJosh Durgin out:
3122b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
3123b8d70035SAlex Elder 
3124b8d70035SAlex Elder 	return ret;
3125b8d70035SAlex Elder }
3126b8d70035SAlex Elder 
3127b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
3128b8d70035SAlex Elder {
3129b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
3130e627db08SAlex Elder 	int ret;
3131b8d70035SAlex Elder 
3132b8d70035SAlex Elder 	if (!rbd_dev)
3133b8d70035SAlex Elder 		return;
3134b8d70035SAlex Elder 
313537206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
3136b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
3137b8d70035SAlex Elder 		(unsigned int)opcode);
313852bb1f9bSIlya Dryomov 
313952bb1f9bSIlya Dryomov 	/*
314052bb1f9bSIlya Dryomov 	 * Until adequate refresh error handling is in place, there is
314152bb1f9bSIlya Dryomov 	 * not much we can do here, except warn.
314252bb1f9bSIlya Dryomov 	 *
314352bb1f9bSIlya Dryomov 	 * See http://tracker.ceph.com/issues/5040
314452bb1f9bSIlya Dryomov 	 */
3145e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3146e627db08SAlex Elder 	if (ret)
31479584d508SIlya Dryomov 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
3148b8d70035SAlex Elder 
314952bb1f9bSIlya Dryomov 	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
315052bb1f9bSIlya Dryomov 	if (ret)
31519584d508SIlya Dryomov 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
3152b8d70035SAlex Elder }
3153b8d70035SAlex Elder 
31549969ebc5SAlex Elder /*
3155bb040aa0SIlya Dryomov  * Send a (un)watch request and wait for the ack.  Return a request
3156bb040aa0SIlya Dryomov  * with a ref held on success or error.
3157bb040aa0SIlya Dryomov  */
3158bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper(
3159bb040aa0SIlya Dryomov 						struct rbd_device *rbd_dev,
3160bb040aa0SIlya Dryomov 						bool watch)
3161bb040aa0SIlya Dryomov {
3162bb040aa0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
31632894e1d7SIlya Dryomov 	struct ceph_options *opts = osdc->client->options;
3164bb040aa0SIlya Dryomov 	struct rbd_obj_request *obj_request;
3165bb040aa0SIlya Dryomov 	int ret;
3166bb040aa0SIlya Dryomov 
3167bb040aa0SIlya Dryomov 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3168bb040aa0SIlya Dryomov 					     OBJ_REQUEST_NODATA);
3169bb040aa0SIlya Dryomov 	if (!obj_request)
3170bb040aa0SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3171bb040aa0SIlya Dryomov 
31726d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
3173bb040aa0SIlya Dryomov 						  obj_request);
3174bb040aa0SIlya Dryomov 	if (!obj_request->osd_req) {
3175bb040aa0SIlya Dryomov 		ret = -ENOMEM;
3176bb040aa0SIlya Dryomov 		goto out;
3177bb040aa0SIlya Dryomov 	}
3178bb040aa0SIlya Dryomov 
3179bb040aa0SIlya Dryomov 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
3180bb040aa0SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, watch);
3181bb040aa0SIlya Dryomov 	rbd_osd_req_format_write(obj_request);
3182bb040aa0SIlya Dryomov 
3183bb040aa0SIlya Dryomov 	if (watch)
3184bb040aa0SIlya Dryomov 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
3185bb040aa0SIlya Dryomov 
3186bb040aa0SIlya Dryomov 	ret = rbd_obj_request_submit(osdc, obj_request);
3187bb040aa0SIlya Dryomov 	if (ret)
3188bb040aa0SIlya Dryomov 		goto out;
3189bb040aa0SIlya Dryomov 
31902894e1d7SIlya Dryomov 	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
3191bb040aa0SIlya Dryomov 	if (ret)
3192bb040aa0SIlya Dryomov 		goto out;
3193bb040aa0SIlya Dryomov 
3194bb040aa0SIlya Dryomov 	ret = obj_request->result;
3195bb040aa0SIlya Dryomov 	if (ret) {
3196bb040aa0SIlya Dryomov 		if (watch)
3197bb040aa0SIlya Dryomov 			rbd_obj_request_end(obj_request);
3198bb040aa0SIlya Dryomov 		goto out;
3199bb040aa0SIlya Dryomov 	}
3200bb040aa0SIlya Dryomov 
3201bb040aa0SIlya Dryomov 	return obj_request;
3202bb040aa0SIlya Dryomov 
3203bb040aa0SIlya Dryomov out:
3204bb040aa0SIlya Dryomov 	rbd_obj_request_put(obj_request);
3205bb040aa0SIlya Dryomov 	return ERR_PTR(ret);
3206bb040aa0SIlya Dryomov }
3207bb040aa0SIlya Dryomov 
3208bb040aa0SIlya Dryomov /*
3209b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
32109969ebc5SAlex Elder  */
3211b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
32129969ebc5SAlex Elder {
32139969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
32149969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
32159969ebc5SAlex Elder 	int ret;
32169969ebc5SAlex Elder 
3217b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_event);
3218b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_request);
32199969ebc5SAlex Elder 
32203c663bbdSAlex Elder 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
32219969ebc5SAlex Elder 				     &rbd_dev->watch_event);
32229969ebc5SAlex Elder 	if (ret < 0)
32239969ebc5SAlex Elder 		return ret;
32249969ebc5SAlex Elder 
322576756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
322676756a51SIlya Dryomov 	if (IS_ERR(obj_request)) {
322776756a51SIlya Dryomov 		ceph_osdc_cancel_event(rbd_dev->watch_event);
322876756a51SIlya Dryomov 		rbd_dev->watch_event = NULL;
322976756a51SIlya Dryomov 		return PTR_ERR(obj_request);
3230b30a01f2SIlya Dryomov 	}
32319969ebc5SAlex Elder 
32328eb87565SAlex Elder 	/*
32338eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
32348eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
32358eb87565SAlex Elder 	 * a pointer to the object request during that time (in
323676756a51SIlya Dryomov 	 * rbd_dev->watch_request), so we'll keep a reference to it.
323776756a51SIlya Dryomov 	 * We'll drop that reference after we've unregistered it in
323876756a51SIlya Dryomov 	 * rbd_dev_header_unwatch_sync().
32398eb87565SAlex Elder 	 */
32408eb87565SAlex Elder 	rbd_dev->watch_request = obj_request;
32418eb87565SAlex Elder 
32428eb87565SAlex Elder 	return 0;
32439969ebc5SAlex Elder }
32449969ebc5SAlex Elder 
3245b30a01f2SIlya Dryomov /*
3246b30a01f2SIlya Dryomov  * Tear down a watch request, synchronously.
3247b30a01f2SIlya Dryomov  */
324876756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3249fca27065SIlya Dryomov {
3250b30a01f2SIlya Dryomov 	struct rbd_obj_request *obj_request;
3251b30a01f2SIlya Dryomov 
3252b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
3253b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_request);
3254b30a01f2SIlya Dryomov 
325576756a51SIlya Dryomov 	rbd_obj_request_end(rbd_dev->watch_request);
3256b30a01f2SIlya Dryomov 	rbd_obj_request_put(rbd_dev->watch_request);
3257b30a01f2SIlya Dryomov 	rbd_dev->watch_request = NULL;
3258b30a01f2SIlya Dryomov 
325976756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
326076756a51SIlya Dryomov 	if (!IS_ERR(obj_request))
3261b30a01f2SIlya Dryomov 		rbd_obj_request_put(obj_request);
326276756a51SIlya Dryomov 	else
326376756a51SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
326476756a51SIlya Dryomov 			 PTR_ERR(obj_request));
326576756a51SIlya Dryomov 
3266b30a01f2SIlya Dryomov 	ceph_osdc_cancel_event(rbd_dev->watch_event);
3267b30a01f2SIlya Dryomov 	rbd_dev->watch_event = NULL;
3268fca27065SIlya Dryomov }
3269fca27065SIlya Dryomov 
327036be9a76SAlex Elder /*
3271f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3272f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
327336be9a76SAlex Elder  */
327436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
327536be9a76SAlex Elder 			     const char *object_name,
327636be9a76SAlex Elder 			     const char *class_name,
327736be9a76SAlex Elder 			     const char *method_name,
32784157976bSAlex Elder 			     const void *outbound,
327936be9a76SAlex Elder 			     size_t outbound_size,
32804157976bSAlex Elder 			     void *inbound,
3281e2a58ee5SAlex Elder 			     size_t inbound_size)
328236be9a76SAlex Elder {
32832169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
328436be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
328536be9a76SAlex Elder 	struct page **pages;
328636be9a76SAlex Elder 	u32 page_count;
328736be9a76SAlex Elder 	int ret;
328836be9a76SAlex Elder 
328936be9a76SAlex Elder 	/*
32906010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
32916010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
32926010a451SAlex Elder 	 * also supply outbound data--parameters for the object
32936010a451SAlex Elder 	 * method.  Currently if this is present it will be a
32946010a451SAlex Elder 	 * snapshot id.
329536be9a76SAlex Elder 	 */
329636be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
329736be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
329836be9a76SAlex Elder 	if (IS_ERR(pages))
329936be9a76SAlex Elder 		return PTR_ERR(pages);
330036be9a76SAlex Elder 
330136be9a76SAlex Elder 	ret = -ENOMEM;
33026010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
330336be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
330436be9a76SAlex Elder 	if (!obj_request)
330536be9a76SAlex Elder 		goto out;
330636be9a76SAlex Elder 
330736be9a76SAlex Elder 	obj_request->pages = pages;
330836be9a76SAlex Elder 	obj_request->page_count = page_count;
330936be9a76SAlex Elder 
33106d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3311deb236b3SIlya Dryomov 						  obj_request);
331236be9a76SAlex Elder 	if (!obj_request->osd_req)
331336be9a76SAlex Elder 		goto out;
331436be9a76SAlex Elder 
3315c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
331604017e29SAlex Elder 					class_name, method_name);
331704017e29SAlex Elder 	if (outbound_size) {
331804017e29SAlex Elder 		struct ceph_pagelist *pagelist;
331904017e29SAlex Elder 
332004017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
332104017e29SAlex Elder 		if (!pagelist)
332204017e29SAlex Elder 			goto out;
332304017e29SAlex Elder 
332404017e29SAlex Elder 		ceph_pagelist_init(pagelist);
332504017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
332604017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
332704017e29SAlex Elder 						pagelist);
332804017e29SAlex Elder 	}
3329a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3330a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
333144cd188dSAlex Elder 					0, false, false);
33329d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3333430c28c3SAlex Elder 
333436be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
333536be9a76SAlex Elder 	if (ret)
333636be9a76SAlex Elder 		goto out;
333736be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
333836be9a76SAlex Elder 	if (ret)
333936be9a76SAlex Elder 		goto out;
334036be9a76SAlex Elder 
334136be9a76SAlex Elder 	ret = obj_request->result;
334236be9a76SAlex Elder 	if (ret < 0)
334336be9a76SAlex Elder 		goto out;
334457385b51SAlex Elder 
334557385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
334657385b51SAlex Elder 	ret = (int)obj_request->xferred;
3347903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
334836be9a76SAlex Elder out:
334936be9a76SAlex Elder 	if (obj_request)
335036be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
335136be9a76SAlex Elder 	else
335236be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
335336be9a76SAlex Elder 
335436be9a76SAlex Elder 	return ret;
335536be9a76SAlex Elder }
335636be9a76SAlex Elder 
33577ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3358bc1ecc65SIlya Dryomov {
33597ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
33607ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3361bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
33624e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3363bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3364bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
33656d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
33664e752f0aSJosh Durgin 	u64 mapping_size;
3367bc1ecc65SIlya Dryomov 	int result;
3368bc1ecc65SIlya Dryomov 
33697ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
33707ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
33717ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
33727ad18afaSChristoph Hellwig 		result = -EIO;
33737ad18afaSChristoph Hellwig 		goto err;
33747ad18afaSChristoph Hellwig 	}
33757ad18afaSChristoph Hellwig 
337690e98c52SGuangliang Zhao 	if (rq->cmd_flags & REQ_DISCARD)
337790e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
337890e98c52SGuangliang Zhao 	else if (rq->cmd_flags & REQ_WRITE)
33796d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
33806d2940c8SGuangliang Zhao 	else
33816d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
33826d2940c8SGuangliang Zhao 
3383bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3384bc1ecc65SIlya Dryomov 
3385bc1ecc65SIlya Dryomov 	if (!length) {
3386bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3387bc1ecc65SIlya Dryomov 		result = 0;
3388bc1ecc65SIlya Dryomov 		goto err_rq;
3389bc1ecc65SIlya Dryomov 	}
3390bc1ecc65SIlya Dryomov 
33916d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
3392bc1ecc65SIlya Dryomov 
33936d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
3394bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
3395bc1ecc65SIlya Dryomov 			result = -EROFS;
3396bc1ecc65SIlya Dryomov 			goto err_rq;
3397bc1ecc65SIlya Dryomov 		}
3398bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3399bc1ecc65SIlya Dryomov 	}
3400bc1ecc65SIlya Dryomov 
3401bc1ecc65SIlya Dryomov 	/*
3402bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3403bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3404bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3405bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3406bc1ecc65SIlya Dryomov 	 */
3407bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3408bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3409bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3410bc1ecc65SIlya Dryomov 		result = -ENXIO;
3411bc1ecc65SIlya Dryomov 		goto err_rq;
3412bc1ecc65SIlya Dryomov 	}
3413bc1ecc65SIlya Dryomov 
3414bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3415bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3416bc1ecc65SIlya Dryomov 			 length);
3417bc1ecc65SIlya Dryomov 		result = -EINVAL;
3418bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3419bc1ecc65SIlya Dryomov 	}
3420bc1ecc65SIlya Dryomov 
34217ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
34227ad18afaSChristoph Hellwig 
34234e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
34244e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
34256d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
34264e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
34274e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
34284e752f0aSJosh Durgin 	}
34294e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
34304e752f0aSJosh Durgin 
34314e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3432bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
34334e752f0aSJosh Durgin 			 length, mapping_size);
3434bc1ecc65SIlya Dryomov 		result = -EIO;
3435bc1ecc65SIlya Dryomov 		goto err_rq;
3436bc1ecc65SIlya Dryomov 	}
3437bc1ecc65SIlya Dryomov 
34386d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
34394e752f0aSJosh Durgin 					     snapc);
3440bc1ecc65SIlya Dryomov 	if (!img_request) {
3441bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3442bc1ecc65SIlya Dryomov 		goto err_rq;
3443bc1ecc65SIlya Dryomov 	}
3444bc1ecc65SIlya Dryomov 	img_request->rq = rq;
3445bc1ecc65SIlya Dryomov 
344690e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
344790e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
344890e98c52SGuangliang Zhao 					      NULL);
344990e98c52SGuangliang Zhao 	else
345090e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
345190e98c52SGuangliang Zhao 					      rq->bio);
3452bc1ecc65SIlya Dryomov 	if (result)
3453bc1ecc65SIlya Dryomov 		goto err_img_request;
3454bc1ecc65SIlya Dryomov 
3455bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
3456bc1ecc65SIlya Dryomov 	if (result)
3457bc1ecc65SIlya Dryomov 		goto err_img_request;
3458bc1ecc65SIlya Dryomov 
3459bc1ecc65SIlya Dryomov 	return;
3460bc1ecc65SIlya Dryomov 
3461bc1ecc65SIlya Dryomov err_img_request:
3462bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3463bc1ecc65SIlya Dryomov err_rq:
3464bc1ecc65SIlya Dryomov 	if (result)
3465bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
34666d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
34674e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
34687ad18afaSChristoph Hellwig err:
34697ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
3470bc1ecc65SIlya Dryomov }
3471bc1ecc65SIlya Dryomov 
34727ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
34737ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3474bc1ecc65SIlya Dryomov {
34757ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
34767ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3477bc1ecc65SIlya Dryomov 
34787ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
34797ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
3480bf0d5f50SAlex Elder }
3481bf0d5f50SAlex Elder 
3482602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3483602adf40SYehuda Sadeh {
3484602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3485602adf40SYehuda Sadeh 
3486602adf40SYehuda Sadeh 	if (!disk)
3487602adf40SYehuda Sadeh 		return;
3488602adf40SYehuda Sadeh 
3489a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3490a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3491602adf40SYehuda Sadeh 		del_gendisk(disk);
3492602adf40SYehuda Sadeh 		if (disk->queue)
3493602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
34947ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
3495a0cab924SAlex Elder 	}
3496602adf40SYehuda Sadeh 	put_disk(disk);
3497602adf40SYehuda Sadeh }
3498602adf40SYehuda Sadeh 
3499788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3500788e2df3SAlex Elder 				const char *object_name,
35017097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3502788e2df3SAlex Elder 
3503788e2df3SAlex Elder {
35042169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3505788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3506788e2df3SAlex Elder 	struct page **pages = NULL;
3507788e2df3SAlex Elder 	u32 page_count;
35081ceae7efSAlex Elder 	size_t size;
3509788e2df3SAlex Elder 	int ret;
3510788e2df3SAlex Elder 
3511788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3512788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3513788e2df3SAlex Elder 	if (IS_ERR(pages))
3514a8d42056SJan Kara 		return PTR_ERR(pages);
3515788e2df3SAlex Elder 
3516788e2df3SAlex Elder 	ret = -ENOMEM;
3517788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3518788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3519788e2df3SAlex Elder 	if (!obj_request)
3520788e2df3SAlex Elder 		goto out;
3521788e2df3SAlex Elder 
3522788e2df3SAlex Elder 	obj_request->pages = pages;
3523788e2df3SAlex Elder 	obj_request->page_count = page_count;
3524788e2df3SAlex Elder 
35256d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3526deb236b3SIlya Dryomov 						  obj_request);
3527788e2df3SAlex Elder 	if (!obj_request->osd_req)
3528788e2df3SAlex Elder 		goto out;
3529788e2df3SAlex Elder 
3530c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3531c99d2d4aSAlex Elder 					offset, length, 0, 0);
3532406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3533a4ce40a9SAlex Elder 					obj_request->pages,
353444cd188dSAlex Elder 					obj_request->length,
353544cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
353644cd188dSAlex Elder 					false, false);
35379d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3538430c28c3SAlex Elder 
3539788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3540788e2df3SAlex Elder 	if (ret)
3541788e2df3SAlex Elder 		goto out;
3542788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3543788e2df3SAlex Elder 	if (ret)
3544788e2df3SAlex Elder 		goto out;
3545788e2df3SAlex Elder 
3546788e2df3SAlex Elder 	ret = obj_request->result;
3547788e2df3SAlex Elder 	if (ret < 0)
3548788e2df3SAlex Elder 		goto out;
35491ceae7efSAlex Elder 
35501ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
35511ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3552903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
355323ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
355423ed6e13SAlex Elder 	ret = (int)size;
3555788e2df3SAlex Elder out:
3556788e2df3SAlex Elder 	if (obj_request)
3557788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3558788e2df3SAlex Elder 	else
3559788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3560788e2df3SAlex Elder 
3561788e2df3SAlex Elder 	return ret;
3562788e2df3SAlex Elder }
3563788e2df3SAlex Elder 
3564602adf40SYehuda Sadeh /*
3565662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3566662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3567662518b1SAlex Elder  * information about the image.
35684156d998SAlex Elder  */
356999a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
35704156d998SAlex Elder {
35714156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
35724156d998SAlex Elder 	u32 snap_count = 0;
35734156d998SAlex Elder 	u64 names_size = 0;
35744156d998SAlex Elder 	u32 want_count;
35754156d998SAlex Elder 	int ret;
35764156d998SAlex Elder 
35774156d998SAlex Elder 	/*
35784156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
35794156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
35804156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
35814156d998SAlex Elder 	 * the number of snapshots could change by the time we read
35824156d998SAlex Elder 	 * it in, in which case we re-read it.
35834156d998SAlex Elder 	 */
35844156d998SAlex Elder 	do {
35854156d998SAlex Elder 		size_t size;
35864156d998SAlex Elder 
35874156d998SAlex Elder 		kfree(ondisk);
35884156d998SAlex Elder 
35894156d998SAlex Elder 		size = sizeof (*ondisk);
35904156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
35914156d998SAlex Elder 		size += names_size;
35924156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
35934156d998SAlex Elder 		if (!ondisk)
3594662518b1SAlex Elder 			return -ENOMEM;
35954156d998SAlex Elder 
3596788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
35977097f8dfSAlex Elder 				       0, size, ondisk);
35984156d998SAlex Elder 		if (ret < 0)
3599662518b1SAlex Elder 			goto out;
3600c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
36014156d998SAlex Elder 			ret = -ENXIO;
360206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
360306ecc6cbSAlex Elder 				size, ret);
3604662518b1SAlex Elder 			goto out;
36054156d998SAlex Elder 		}
36064156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
36074156d998SAlex Elder 			ret = -ENXIO;
360806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3609662518b1SAlex Elder 			goto out;
36104156d998SAlex Elder 		}
36114156d998SAlex Elder 
36124156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
36134156d998SAlex Elder 		want_count = snap_count;
36144156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
36154156d998SAlex Elder 	} while (snap_count != want_count);
36164156d998SAlex Elder 
3617662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3618662518b1SAlex Elder out:
36194156d998SAlex Elder 	kfree(ondisk);
36204156d998SAlex Elder 
3621dfc5606dSYehuda Sadeh 	return ret;
3622602adf40SYehuda Sadeh }
3623602adf40SYehuda Sadeh 
362415228edeSAlex Elder /*
362515228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
362615228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
362715228edeSAlex Elder  */
362815228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
362915228edeSAlex Elder {
363015228edeSAlex Elder 	u64 snap_id;
363115228edeSAlex Elder 
363215228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
363315228edeSAlex Elder 		return;
363415228edeSAlex Elder 
363515228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
363615228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
363715228edeSAlex Elder 		return;
363815228edeSAlex Elder 
363915228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
364015228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
364115228edeSAlex Elder }
364215228edeSAlex Elder 
36439875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
36449875201eSJosh Durgin {
36459875201eSJosh Durgin 	sector_t size;
36469875201eSJosh Durgin 	bool removing;
36479875201eSJosh Durgin 
36489875201eSJosh Durgin 	/*
36499875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
36509875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
36519875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
36529875201eSJosh Durgin 	 */
36539875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
36549875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
36559875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
36569875201eSJosh Durgin 	/*
36579875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
36589875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
36599875201eSJosh Durgin 	 */
36609875201eSJosh Durgin 	if (!removing) {
36619875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
36629875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
36639875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
36649875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
36659875201eSJosh Durgin 	}
36669875201eSJosh Durgin }
36679875201eSJosh Durgin 
3668cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
36691fe5e993SAlex Elder {
3670e627db08SAlex Elder 	u64 mapping_size;
36711fe5e993SAlex Elder 	int ret;
36721fe5e993SAlex Elder 
3673cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
36743b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3675a720ae09SIlya Dryomov 
3676a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
367752bb1f9bSIlya Dryomov 	if (ret)
367873e39e4dSIlya Dryomov 		goto out;
367915228edeSAlex Elder 
3680e8f59b59SIlya Dryomov 	/*
3681e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
3682e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
3683e8f59b59SIlya Dryomov 	 */
3684e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
3685e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
3686e8f59b59SIlya Dryomov 		if (ret)
368773e39e4dSIlya Dryomov 			goto out;
3688e8f59b59SIlya Dryomov 	}
3689e8f59b59SIlya Dryomov 
36905ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
36915ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
36925ff1108cSIlya Dryomov 	} else {
36935ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
369415228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
36955ff1108cSIlya Dryomov 	}
36965ff1108cSIlya Dryomov 
369773e39e4dSIlya Dryomov out:
3698cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
369973e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
37009875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
37011fe5e993SAlex Elder 
370273e39e4dSIlya Dryomov 	return ret;
37031fe5e993SAlex Elder }
37041fe5e993SAlex Elder 
37057ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
37067ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
37077ad18afaSChristoph Hellwig 		unsigned int numa_node)
37087ad18afaSChristoph Hellwig {
37097ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
37107ad18afaSChristoph Hellwig 
37117ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
37127ad18afaSChristoph Hellwig 	return 0;
37137ad18afaSChristoph Hellwig }
37147ad18afaSChristoph Hellwig 
37157ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
37167ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
37177ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
37187ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
37197ad18afaSChristoph Hellwig };
37207ad18afaSChristoph Hellwig 
3721602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3722602adf40SYehuda Sadeh {
3723602adf40SYehuda Sadeh 	struct gendisk *disk;
3724602adf40SYehuda Sadeh 	struct request_queue *q;
3725593a9e7bSAlex Elder 	u64 segment_size;
37267ad18afaSChristoph Hellwig 	int err;
3727602adf40SYehuda Sadeh 
3728602adf40SYehuda Sadeh 	/* create gendisk info */
37297e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
37307e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
37317e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3732602adf40SYehuda Sadeh 	if (!disk)
37331fcdb8aaSAlex Elder 		return -ENOMEM;
3734602adf40SYehuda Sadeh 
3735f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3736de71a297SAlex Elder 		 rbd_dev->dev_id);
3737602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3738dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
37397e513d43SIlya Dryomov 	if (single_major)
37407e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3741602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3742602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3743602adf40SYehuda Sadeh 
37447ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
37457ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
3746b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
37477ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
3748b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
37497ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
37507ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
37517ad18afaSChristoph Hellwig 
37527ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
37537ad18afaSChristoph Hellwig 	if (err)
3754602adf40SYehuda Sadeh 		goto out_disk;
3755029bcbd8SJosh Durgin 
37567ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
37577ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
37587ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
37597ad18afaSChristoph Hellwig 		goto out_tag_set;
37607ad18afaSChristoph Hellwig 	}
37617ad18afaSChristoph Hellwig 
3762d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3763d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
3764593a9e7bSAlex Elder 
3765029bcbd8SJosh Durgin 	/* set io sizes to object size */
3766593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3767593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
37680d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
3769d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
3770593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3771593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3772593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3773029bcbd8SJosh Durgin 
377490e98c52SGuangliang Zhao 	/* enable the discard support */
377590e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
377690e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
377790e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
37782bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
3779b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
378090e98c52SGuangliang Zhao 
3781bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
3782bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
3783bae818eeSRonny Hegewald 
3784602adf40SYehuda Sadeh 	disk->queue = q;
3785602adf40SYehuda Sadeh 
3786602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3787602adf40SYehuda Sadeh 
3788602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3789602adf40SYehuda Sadeh 
3790602adf40SYehuda Sadeh 	return 0;
37917ad18afaSChristoph Hellwig out_tag_set:
37927ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
3793602adf40SYehuda Sadeh out_disk:
3794602adf40SYehuda Sadeh 	put_disk(disk);
37957ad18afaSChristoph Hellwig 	return err;
3796602adf40SYehuda Sadeh }
3797602adf40SYehuda Sadeh 
3798dfc5606dSYehuda Sadeh /*
3799dfc5606dSYehuda Sadeh   sysfs
3800dfc5606dSYehuda Sadeh */
3801602adf40SYehuda Sadeh 
3802593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3803593a9e7bSAlex Elder {
3804593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3805593a9e7bSAlex Elder }
3806593a9e7bSAlex Elder 
3807dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3808dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3809602adf40SYehuda Sadeh {
3810593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3811dfc5606dSYehuda Sadeh 
3812fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3813fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3814602adf40SYehuda Sadeh }
3815602adf40SYehuda Sadeh 
381634b13184SAlex Elder /*
381734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
381834b13184SAlex Elder  * necessarily the base image.
381934b13184SAlex Elder  */
382034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
382134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
382234b13184SAlex Elder {
382334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
382434b13184SAlex Elder 
382534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
382634b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
382734b13184SAlex Elder }
382834b13184SAlex Elder 
3829dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3830dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3831602adf40SYehuda Sadeh {
3832593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3833dfc5606dSYehuda Sadeh 
3834fc71d833SAlex Elder 	if (rbd_dev->major)
3835dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3836fc71d833SAlex Elder 
3837fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3838dd82fff1SIlya Dryomov }
3839fc71d833SAlex Elder 
3840dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3841dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3842dd82fff1SIlya Dryomov {
3843dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3844dd82fff1SIlya Dryomov 
3845dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3846dfc5606dSYehuda Sadeh }
3847dfc5606dSYehuda Sadeh 
3848dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3849dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3850dfc5606dSYehuda Sadeh {
3851593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3852dfc5606dSYehuda Sadeh 
38531dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
38541dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3855dfc5606dSYehuda Sadeh }
3856dfc5606dSYehuda Sadeh 
3857dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3858dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3859dfc5606dSYehuda Sadeh {
3860593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3861dfc5606dSYehuda Sadeh 
38620d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3863dfc5606dSYehuda Sadeh }
3864dfc5606dSYehuda Sadeh 
38659bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
38669bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
38679bb2f334SAlex Elder {
38689bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
38699bb2f334SAlex Elder 
38700d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
38710d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
38729bb2f334SAlex Elder }
38739bb2f334SAlex Elder 
3874dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3875dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3876dfc5606dSYehuda Sadeh {
3877593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3878dfc5606dSYehuda Sadeh 
3879a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
38800d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3881a92ffdf8SAlex Elder 
3882a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3883dfc5606dSYehuda Sadeh }
3884dfc5606dSYehuda Sadeh 
3885589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3886589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3887589d30e0SAlex Elder {
3888589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3889589d30e0SAlex Elder 
38900d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3891589d30e0SAlex Elder }
3892589d30e0SAlex Elder 
389334b13184SAlex Elder /*
389434b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
389534b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
389634b13184SAlex Elder  */
3897dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3898dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3899dfc5606dSYehuda Sadeh 			     char *buf)
3900dfc5606dSYehuda Sadeh {
3901593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3902dfc5606dSYehuda Sadeh 
39030d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3904dfc5606dSYehuda Sadeh }
3905dfc5606dSYehuda Sadeh 
390686b00e0dSAlex Elder /*
3907ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
3908ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
3909ff96128fSIlya Dryomov  * image)".
391086b00e0dSAlex Elder  */
391186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
391286b00e0dSAlex Elder 			       struct device_attribute *attr,
391386b00e0dSAlex Elder 			       char *buf)
391486b00e0dSAlex Elder {
391586b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3916ff96128fSIlya Dryomov 	ssize_t count = 0;
391786b00e0dSAlex Elder 
3918ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
391986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
392086b00e0dSAlex Elder 
3921ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3922ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
392386b00e0dSAlex Elder 
3924ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
3925ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
3926ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
3927ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
3928ff96128fSIlya Dryomov 			    "overlap %llu\n",
3929ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
3930ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
3931ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
3932ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
3933ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
3934ff96128fSIlya Dryomov 	}
393586b00e0dSAlex Elder 
393686b00e0dSAlex Elder 	return count;
393786b00e0dSAlex Elder }
393886b00e0dSAlex Elder 
3939dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3940dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3941dfc5606dSYehuda Sadeh 				 const char *buf,
3942dfc5606dSYehuda Sadeh 				 size_t size)
3943dfc5606dSYehuda Sadeh {
3944593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3945b813623aSAlex Elder 	int ret;
3946602adf40SYehuda Sadeh 
3947cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3948e627db08SAlex Elder 	if (ret)
394952bb1f9bSIlya Dryomov 		return ret;
3950b813623aSAlex Elder 
395152bb1f9bSIlya Dryomov 	return size;
3952dfc5606dSYehuda Sadeh }
3953602adf40SYehuda Sadeh 
3954dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
395534b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3956dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3957dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
3958dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3959dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
39609bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3961dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3962589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3963dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3964dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
396586b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3966dfc5606dSYehuda Sadeh 
3967dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3968dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
396934b13184SAlex Elder 	&dev_attr_features.attr,
3970dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3971dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
3972dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3973dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
39749bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3975dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3976589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3977dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
397886b00e0dSAlex Elder 	&dev_attr_parent.attr,
3979dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3980dfc5606dSYehuda Sadeh 	NULL
3981dfc5606dSYehuda Sadeh };
3982dfc5606dSYehuda Sadeh 
3983dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3984dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3985dfc5606dSYehuda Sadeh };
3986dfc5606dSYehuda Sadeh 
3987dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3988dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3989dfc5606dSYehuda Sadeh 	NULL
3990dfc5606dSYehuda Sadeh };
3991dfc5606dSYehuda Sadeh 
39926cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
3993dfc5606dSYehuda Sadeh 
3994dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3995dfc5606dSYehuda Sadeh 	.name		= "rbd",
3996dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
39976cac4695SIlya Dryomov 	.release	= rbd_dev_release,
3998dfc5606dSYehuda Sadeh };
3999dfc5606dSYehuda Sadeh 
40008b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
40018b8fb99cSAlex Elder {
40028b8fb99cSAlex Elder 	kref_get(&spec->kref);
40038b8fb99cSAlex Elder 
40048b8fb99cSAlex Elder 	return spec;
40058b8fb99cSAlex Elder }
40068b8fb99cSAlex Elder 
40078b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
40088b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
40098b8fb99cSAlex Elder {
40108b8fb99cSAlex Elder 	if (spec)
40118b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
40128b8fb99cSAlex Elder }
40138b8fb99cSAlex Elder 
40148b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
40158b8fb99cSAlex Elder {
40168b8fb99cSAlex Elder 	struct rbd_spec *spec;
40178b8fb99cSAlex Elder 
40188b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
40198b8fb99cSAlex Elder 	if (!spec)
40208b8fb99cSAlex Elder 		return NULL;
402104077599SIlya Dryomov 
402204077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
402304077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
40248b8fb99cSAlex Elder 	kref_init(&spec->kref);
40258b8fb99cSAlex Elder 
40268b8fb99cSAlex Elder 	return spec;
40278b8fb99cSAlex Elder }
40288b8fb99cSAlex Elder 
40298b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
40308b8fb99cSAlex Elder {
40318b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
40328b8fb99cSAlex Elder 
40338b8fb99cSAlex Elder 	kfree(spec->pool_name);
40348b8fb99cSAlex Elder 	kfree(spec->image_id);
40358b8fb99cSAlex Elder 	kfree(spec->image_name);
40368b8fb99cSAlex Elder 	kfree(spec->snap_name);
40378b8fb99cSAlex Elder 	kfree(spec);
40388b8fb99cSAlex Elder }
40398b8fb99cSAlex Elder 
4040dd5ac32dSIlya Dryomov static void rbd_dev_release(struct device *dev)
4041dd5ac32dSIlya Dryomov {
4042dd5ac32dSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4043dd5ac32dSIlya Dryomov 	bool need_put = !!rbd_dev->opts;
4044dd5ac32dSIlya Dryomov 
4045dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4046dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4047dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4048dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
4049dd5ac32dSIlya Dryomov 
4050dd5ac32dSIlya Dryomov 	/*
4051dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4052dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4053dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4054dd5ac32dSIlya Dryomov 	 */
4055dd5ac32dSIlya Dryomov 	if (need_put)
4056dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4057dd5ac32dSIlya Dryomov }
4058dd5ac32dSIlya Dryomov 
4059cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4060d147543dSIlya Dryomov 					 struct rbd_spec *spec,
4061d147543dSIlya Dryomov 					 struct rbd_options *opts)
4062c53d5893SAlex Elder {
4063c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4064c53d5893SAlex Elder 
4065c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
4066c53d5893SAlex Elder 	if (!rbd_dev)
4067c53d5893SAlex Elder 		return NULL;
4068c53d5893SAlex Elder 
4069c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
40706d292906SAlex Elder 	rbd_dev->flags = 0;
4071a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
4072c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4073c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4074c53d5893SAlex Elder 
4075dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4076dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4077dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4078dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4079dd5ac32dSIlya Dryomov 
4080c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4081d147543dSIlya Dryomov 	rbd_dev->spec = spec;
4082d147543dSIlya Dryomov 	rbd_dev->opts = opts;
4083c53d5893SAlex Elder 
40840903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
40850903e875SAlex Elder 
40860903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40870903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
40880903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40890903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
40900903e875SAlex Elder 
4091dd5ac32dSIlya Dryomov 	/*
4092dd5ac32dSIlya Dryomov 	 * If this is a mapping rbd_dev (as opposed to a parent one),
4093dd5ac32dSIlya Dryomov 	 * pin our module.  We have a ref from do_rbd_add(), so use
4094dd5ac32dSIlya Dryomov 	 * __module_get().
4095dd5ac32dSIlya Dryomov 	 */
4096dd5ac32dSIlya Dryomov 	if (rbd_dev->opts)
4097dd5ac32dSIlya Dryomov 		__module_get(THIS_MODULE);
4098dd5ac32dSIlya Dryomov 
4099c53d5893SAlex Elder 	return rbd_dev;
4100c53d5893SAlex Elder }
4101c53d5893SAlex Elder 
4102c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4103c53d5893SAlex Elder {
4104dd5ac32dSIlya Dryomov 	if (rbd_dev)
4105dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4106c53d5893SAlex Elder }
4107c53d5893SAlex Elder 
4108dfc5606dSYehuda Sadeh /*
41099d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
41109d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
41119d475de5SAlex Elder  * image.
41129d475de5SAlex Elder  */
41139d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
41149d475de5SAlex Elder 				u8 *order, u64 *snap_size)
41159d475de5SAlex Elder {
41169d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
41179d475de5SAlex Elder 	int ret;
41189d475de5SAlex Elder 	struct {
41199d475de5SAlex Elder 		u8 order;
41209d475de5SAlex Elder 		__le64 size;
41219d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
41229d475de5SAlex Elder 
412336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41249d475de5SAlex Elder 				"rbd", "get_size",
41254157976bSAlex Elder 				&snapid, sizeof (snapid),
4126e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
412736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41289d475de5SAlex Elder 	if (ret < 0)
41299d475de5SAlex Elder 		return ret;
413057385b51SAlex Elder 	if (ret < sizeof (size_buf))
413157385b51SAlex Elder 		return -ERANGE;
41329d475de5SAlex Elder 
4133c3545579SJosh Durgin 	if (order) {
41349d475de5SAlex Elder 		*order = size_buf.order;
4135c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4136c3545579SJosh Durgin 	}
41379d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
41389d475de5SAlex Elder 
4139c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4140c3545579SJosh Durgin 		(unsigned long long)snap_id,
41419d475de5SAlex Elder 		(unsigned long long)*snap_size);
41429d475de5SAlex Elder 
41439d475de5SAlex Elder 	return 0;
41449d475de5SAlex Elder }
41459d475de5SAlex Elder 
41469d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
41479d475de5SAlex Elder {
41489d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
41499d475de5SAlex Elder 					&rbd_dev->header.obj_order,
41509d475de5SAlex Elder 					&rbd_dev->header.image_size);
41519d475de5SAlex Elder }
41529d475de5SAlex Elder 
41531e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
41541e130199SAlex Elder {
41551e130199SAlex Elder 	void *reply_buf;
41561e130199SAlex Elder 	int ret;
41571e130199SAlex Elder 	void *p;
41581e130199SAlex Elder 
41591e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
41601e130199SAlex Elder 	if (!reply_buf)
41611e130199SAlex Elder 		return -ENOMEM;
41621e130199SAlex Elder 
416336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41644157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4165e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
416636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41671e130199SAlex Elder 	if (ret < 0)
41681e130199SAlex Elder 		goto out;
41691e130199SAlex Elder 
41701e130199SAlex Elder 	p = reply_buf;
41711e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
417257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
417357385b51SAlex Elder 	ret = 0;
41741e130199SAlex Elder 
41751e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
41761e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
41771e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
41781e130199SAlex Elder 	} else {
41791e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
41801e130199SAlex Elder 	}
41811e130199SAlex Elder out:
41821e130199SAlex Elder 	kfree(reply_buf);
41831e130199SAlex Elder 
41841e130199SAlex Elder 	return ret;
41851e130199SAlex Elder }
41861e130199SAlex Elder 
4187b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4188b1b5402aSAlex Elder 		u64 *snap_features)
4189b1b5402aSAlex Elder {
4190b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4191b1b5402aSAlex Elder 	struct {
4192b1b5402aSAlex Elder 		__le64 features;
4193b1b5402aSAlex Elder 		__le64 incompat;
41944157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4195d889140cSAlex Elder 	u64 incompat;
4196b1b5402aSAlex Elder 	int ret;
4197b1b5402aSAlex Elder 
419836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4199b1b5402aSAlex Elder 				"rbd", "get_features",
42004157976bSAlex Elder 				&snapid, sizeof (snapid),
4201e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
420236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4203b1b5402aSAlex Elder 	if (ret < 0)
4204b1b5402aSAlex Elder 		return ret;
420557385b51SAlex Elder 	if (ret < sizeof (features_buf))
420657385b51SAlex Elder 		return -ERANGE;
4207d889140cSAlex Elder 
4208d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
42095cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
4210b8f5c6edSAlex Elder 		return -ENXIO;
4211d889140cSAlex Elder 
4212b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4213b1b5402aSAlex Elder 
4214b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4215b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4216b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4217b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4218b1b5402aSAlex Elder 
4219b1b5402aSAlex Elder 	return 0;
4220b1b5402aSAlex Elder }
4221b1b5402aSAlex Elder 
4222b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4223b1b5402aSAlex Elder {
4224b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4225b1b5402aSAlex Elder 						&rbd_dev->header.features);
4226b1b5402aSAlex Elder }
4227b1b5402aSAlex Elder 
422886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
422986b00e0dSAlex Elder {
423086b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
423186b00e0dSAlex Elder 	size_t size;
423286b00e0dSAlex Elder 	void *reply_buf = NULL;
423386b00e0dSAlex Elder 	__le64 snapid;
423486b00e0dSAlex Elder 	void *p;
423586b00e0dSAlex Elder 	void *end;
4236642a2537SAlex Elder 	u64 pool_id;
423786b00e0dSAlex Elder 	char *image_id;
42383b5cf2a2SAlex Elder 	u64 snap_id;
423986b00e0dSAlex Elder 	u64 overlap;
424086b00e0dSAlex Elder 	int ret;
424186b00e0dSAlex Elder 
424286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
424386b00e0dSAlex Elder 	if (!parent_spec)
424486b00e0dSAlex Elder 		return -ENOMEM;
424586b00e0dSAlex Elder 
424686b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
424786b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
424886b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
424986b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
425086b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
425186b00e0dSAlex Elder 	if (!reply_buf) {
425286b00e0dSAlex Elder 		ret = -ENOMEM;
425386b00e0dSAlex Elder 		goto out_err;
425486b00e0dSAlex Elder 	}
425586b00e0dSAlex Elder 
42564d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
425736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
425886b00e0dSAlex Elder 				"rbd", "get_parent",
42594157976bSAlex Elder 				&snapid, sizeof (snapid),
4260e2a58ee5SAlex Elder 				reply_buf, size);
426136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
426286b00e0dSAlex Elder 	if (ret < 0)
426386b00e0dSAlex Elder 		goto out_err;
426486b00e0dSAlex Elder 
426586b00e0dSAlex Elder 	p = reply_buf;
426657385b51SAlex Elder 	end = reply_buf + ret;
426757385b51SAlex Elder 	ret = -ERANGE;
4268642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4269392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4270392a9dadSAlex Elder 		/*
4271392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4272392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4273392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4274392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4275392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4276392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4277392a9dadSAlex Elder 		 * parent.
4278392a9dadSAlex Elder 		 */
4279392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4280392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4281392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4282392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4283392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4284392a9dadSAlex Elder 		}
4285392a9dadSAlex Elder 
428686b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4287392a9dadSAlex Elder 	}
428886b00e0dSAlex Elder 
42890903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42900903e875SAlex Elder 
42910903e875SAlex Elder 	ret = -EIO;
4292642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
42939584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4294642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
429557385b51SAlex Elder 		goto out_err;
4296c0cd10dbSAlex Elder 	}
42970903e875SAlex Elder 
4298979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
429986b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
430086b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
430186b00e0dSAlex Elder 		goto out_err;
430286b00e0dSAlex Elder 	}
43033b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
430486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
430586b00e0dSAlex Elder 
43063b5cf2a2SAlex Elder 	/*
43073b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
43083b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
43093b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
43103b5cf2a2SAlex Elder 	 */
43113b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
43123b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
43133b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
43143b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
431586b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
431686b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
4317fbba11b3SIlya Dryomov 	} else {
4318fbba11b3SIlya Dryomov 		kfree(image_id);
43193b5cf2a2SAlex Elder 	}
43203b5cf2a2SAlex Elder 
43213b5cf2a2SAlex Elder 	/*
4322cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4323cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
43243b5cf2a2SAlex Elder 	 */
43253b5cf2a2SAlex Elder 	if (!overlap) {
43263b5cf2a2SAlex Elder 		if (parent_spec) {
4327cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
4328cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
4329cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
4330cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
433170cf49cfSAlex Elder 		} else {
4332cf32bd9cSIlya Dryomov 			/* initial probe */
4333cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
43343b5cf2a2SAlex Elder 		}
433570cf49cfSAlex Elder 	}
4336cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
4337cf32bd9cSIlya Dryomov 
433886b00e0dSAlex Elder out:
433986b00e0dSAlex Elder 	ret = 0;
434086b00e0dSAlex Elder out_err:
434186b00e0dSAlex Elder 	kfree(reply_buf);
434286b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
434386b00e0dSAlex Elder 
434486b00e0dSAlex Elder 	return ret;
434586b00e0dSAlex Elder }
434686b00e0dSAlex Elder 
4347cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4348cc070d59SAlex Elder {
4349cc070d59SAlex Elder 	struct {
4350cc070d59SAlex Elder 		__le64 stripe_unit;
4351cc070d59SAlex Elder 		__le64 stripe_count;
4352cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4353cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4354cc070d59SAlex Elder 	void *p;
4355cc070d59SAlex Elder 	u64 obj_size;
4356cc070d59SAlex Elder 	u64 stripe_unit;
4357cc070d59SAlex Elder 	u64 stripe_count;
4358cc070d59SAlex Elder 	int ret;
4359cc070d59SAlex Elder 
4360cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4361cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4362e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4363cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4364cc070d59SAlex Elder 	if (ret < 0)
4365cc070d59SAlex Elder 		return ret;
4366cc070d59SAlex Elder 	if (ret < size)
4367cc070d59SAlex Elder 		return -ERANGE;
4368cc070d59SAlex Elder 
4369cc070d59SAlex Elder 	/*
4370cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4371cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4372cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4373cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4374cc070d59SAlex Elder 	 */
4375cc070d59SAlex Elder 	ret = -EINVAL;
4376cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4377cc070d59SAlex Elder 	p = &striping_info_buf;
4378cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4379cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4380cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4381cc070d59SAlex Elder 				"(got %llu want %llu)",
4382cc070d59SAlex Elder 				stripe_unit, obj_size);
4383cc070d59SAlex Elder 		return -EINVAL;
4384cc070d59SAlex Elder 	}
4385cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4386cc070d59SAlex Elder 	if (stripe_count != 1) {
4387cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4388cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4389cc070d59SAlex Elder 		return -EINVAL;
4390cc070d59SAlex Elder 	}
4391500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4392500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4393cc070d59SAlex Elder 
4394cc070d59SAlex Elder 	return 0;
4395cc070d59SAlex Elder }
4396cc070d59SAlex Elder 
43979e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
43989e15b77dSAlex Elder {
43999e15b77dSAlex Elder 	size_t image_id_size;
44009e15b77dSAlex Elder 	char *image_id;
44019e15b77dSAlex Elder 	void *p;
44029e15b77dSAlex Elder 	void *end;
44039e15b77dSAlex Elder 	size_t size;
44049e15b77dSAlex Elder 	void *reply_buf = NULL;
44059e15b77dSAlex Elder 	size_t len = 0;
44069e15b77dSAlex Elder 	char *image_name = NULL;
44079e15b77dSAlex Elder 	int ret;
44089e15b77dSAlex Elder 
44099e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
44109e15b77dSAlex Elder 
441169e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
441269e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
44139e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
44149e15b77dSAlex Elder 	if (!image_id)
44159e15b77dSAlex Elder 		return NULL;
44169e15b77dSAlex Elder 
44179e15b77dSAlex Elder 	p = image_id;
44184157976bSAlex Elder 	end = image_id + image_id_size;
441969e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
44209e15b77dSAlex Elder 
44219e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
44229e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
44239e15b77dSAlex Elder 	if (!reply_buf)
44249e15b77dSAlex Elder 		goto out;
44259e15b77dSAlex Elder 
442636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
44279e15b77dSAlex Elder 				"rbd", "dir_get_name",
44289e15b77dSAlex Elder 				image_id, image_id_size,
4429e2a58ee5SAlex Elder 				reply_buf, size);
44309e15b77dSAlex Elder 	if (ret < 0)
44319e15b77dSAlex Elder 		goto out;
44329e15b77dSAlex Elder 	p = reply_buf;
4433f40eb349SAlex Elder 	end = reply_buf + ret;
4434f40eb349SAlex Elder 
44359e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
44369e15b77dSAlex Elder 	if (IS_ERR(image_name))
44379e15b77dSAlex Elder 		image_name = NULL;
44389e15b77dSAlex Elder 	else
44399e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
44409e15b77dSAlex Elder out:
44419e15b77dSAlex Elder 	kfree(reply_buf);
44429e15b77dSAlex Elder 	kfree(image_id);
44439e15b77dSAlex Elder 
44449e15b77dSAlex Elder 	return image_name;
44459e15b77dSAlex Elder }
44469e15b77dSAlex Elder 
44472ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44482ad3d716SAlex Elder {
44492ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44502ad3d716SAlex Elder 	const char *snap_name;
44512ad3d716SAlex Elder 	u32 which = 0;
44522ad3d716SAlex Elder 
44532ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
44542ad3d716SAlex Elder 
44552ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
44562ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
44572ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
44582ad3d716SAlex Elder 			return snapc->snaps[which];
44592ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
44602ad3d716SAlex Elder 		which++;
44612ad3d716SAlex Elder 	}
44622ad3d716SAlex Elder 	return CEPH_NOSNAP;
44632ad3d716SAlex Elder }
44642ad3d716SAlex Elder 
44652ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44662ad3d716SAlex Elder {
44672ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44682ad3d716SAlex Elder 	u32 which;
44692ad3d716SAlex Elder 	bool found = false;
44702ad3d716SAlex Elder 	u64 snap_id;
44712ad3d716SAlex Elder 
44722ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
44732ad3d716SAlex Elder 		const char *snap_name;
44742ad3d716SAlex Elder 
44752ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
44762ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4477efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4478efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4479efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4480efadc98aSJosh Durgin 				continue;
4481efadc98aSJosh Durgin 			else
44822ad3d716SAlex Elder 				break;
4483efadc98aSJosh Durgin 		}
44842ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
44852ad3d716SAlex Elder 		kfree(snap_name);
44862ad3d716SAlex Elder 	}
44872ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
44882ad3d716SAlex Elder }
44892ad3d716SAlex Elder 
44902ad3d716SAlex Elder /*
44912ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
44922ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
44932ad3d716SAlex Elder  */
44942ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44952ad3d716SAlex Elder {
44962ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
44972ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
44982ad3d716SAlex Elder 
44992ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
45002ad3d716SAlex Elder }
45012ad3d716SAlex Elder 
45029e15b77dSAlex Elder /*
450304077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
45049e15b77dSAlex Elder  */
450504077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
450604077599SIlya Dryomov {
450704077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
450804077599SIlya Dryomov 
450904077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
451004077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
451104077599SIlya Dryomov 	rbd_assert(spec->snap_name);
451204077599SIlya Dryomov 
451304077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
451404077599SIlya Dryomov 		u64 snap_id;
451504077599SIlya Dryomov 
451604077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
451704077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
451804077599SIlya Dryomov 			return -ENOENT;
451904077599SIlya Dryomov 
452004077599SIlya Dryomov 		spec->snap_id = snap_id;
452104077599SIlya Dryomov 	} else {
452204077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
452304077599SIlya Dryomov 	}
452404077599SIlya Dryomov 
452504077599SIlya Dryomov 	return 0;
452604077599SIlya Dryomov }
452704077599SIlya Dryomov 
452804077599SIlya Dryomov /*
452904077599SIlya Dryomov  * A parent image will have all ids but none of the names.
453004077599SIlya Dryomov  *
453104077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
453204077599SIlya Dryomov  * can't figure out the name for an image id.
453304077599SIlya Dryomov  */
453404077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
45359e15b77dSAlex Elder {
45362e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
45372e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
45382e9f7f1cSAlex Elder 	const char *pool_name;
45392e9f7f1cSAlex Elder 	const char *image_name;
45402e9f7f1cSAlex Elder 	const char *snap_name;
45419e15b77dSAlex Elder 	int ret;
45429e15b77dSAlex Elder 
454304077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
454404077599SIlya Dryomov 	rbd_assert(spec->image_id);
454504077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
45469e15b77dSAlex Elder 
45472e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
45489e15b77dSAlex Elder 
45492e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
45502e9f7f1cSAlex Elder 	if (!pool_name) {
45512e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4552935dc89fSAlex Elder 		return -EIO;
4553935dc89fSAlex Elder 	}
45542e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
45552e9f7f1cSAlex Elder 	if (!pool_name)
45569e15b77dSAlex Elder 		return -ENOMEM;
45579e15b77dSAlex Elder 
45589e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
45599e15b77dSAlex Elder 
45602e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
45612e9f7f1cSAlex Elder 	if (!image_name)
456206ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
45639e15b77dSAlex Elder 
456404077599SIlya Dryomov 	/* Fetch the snapshot name */
45659e15b77dSAlex Elder 
45662e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4567da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4568da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
45699e15b77dSAlex Elder 		goto out_err;
45702e9f7f1cSAlex Elder 	}
45712e9f7f1cSAlex Elder 
45722e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
45732e9f7f1cSAlex Elder 	spec->image_name = image_name;
45742e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
45759e15b77dSAlex Elder 
45769e15b77dSAlex Elder 	return 0;
457704077599SIlya Dryomov 
45789e15b77dSAlex Elder out_err:
45792e9f7f1cSAlex Elder 	kfree(image_name);
45802e9f7f1cSAlex Elder 	kfree(pool_name);
45819e15b77dSAlex Elder 	return ret;
45829e15b77dSAlex Elder }
45839e15b77dSAlex Elder 
4584cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
458535d489f9SAlex Elder {
458635d489f9SAlex Elder 	size_t size;
458735d489f9SAlex Elder 	int ret;
458835d489f9SAlex Elder 	void *reply_buf;
458935d489f9SAlex Elder 	void *p;
459035d489f9SAlex Elder 	void *end;
459135d489f9SAlex Elder 	u64 seq;
459235d489f9SAlex Elder 	u32 snap_count;
459335d489f9SAlex Elder 	struct ceph_snap_context *snapc;
459435d489f9SAlex Elder 	u32 i;
459535d489f9SAlex Elder 
459635d489f9SAlex Elder 	/*
459735d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
459835d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
459935d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
460035d489f9SAlex Elder 	 * prepared to receive.
460135d489f9SAlex Elder 	 */
460235d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
460335d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
460435d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
460535d489f9SAlex Elder 	if (!reply_buf)
460635d489f9SAlex Elder 		return -ENOMEM;
460735d489f9SAlex Elder 
460836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
46094157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4610e2a58ee5SAlex Elder 				reply_buf, size);
461136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
461235d489f9SAlex Elder 	if (ret < 0)
461335d489f9SAlex Elder 		goto out;
461435d489f9SAlex Elder 
461535d489f9SAlex Elder 	p = reply_buf;
461657385b51SAlex Elder 	end = reply_buf + ret;
461757385b51SAlex Elder 	ret = -ERANGE;
461835d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
461935d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
462035d489f9SAlex Elder 
462135d489f9SAlex Elder 	/*
462235d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
462335d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
462435d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
462535d489f9SAlex Elder 	 * allocate is representable in a size_t.
462635d489f9SAlex Elder 	 */
462735d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
462835d489f9SAlex Elder 				 / sizeof (u64)) {
462935d489f9SAlex Elder 		ret = -EINVAL;
463035d489f9SAlex Elder 		goto out;
463135d489f9SAlex Elder 	}
463235d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
463335d489f9SAlex Elder 		goto out;
4634468521c1SAlex Elder 	ret = 0;
463535d489f9SAlex Elder 
4636812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
463735d489f9SAlex Elder 	if (!snapc) {
463835d489f9SAlex Elder 		ret = -ENOMEM;
463935d489f9SAlex Elder 		goto out;
464035d489f9SAlex Elder 	}
464135d489f9SAlex Elder 	snapc->seq = seq;
464235d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
464335d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
464435d489f9SAlex Elder 
464549ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
464635d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
464735d489f9SAlex Elder 
464835d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
464935d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
465035d489f9SAlex Elder out:
465135d489f9SAlex Elder 	kfree(reply_buf);
465235d489f9SAlex Elder 
465357385b51SAlex Elder 	return ret;
465435d489f9SAlex Elder }
465535d489f9SAlex Elder 
465654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
465754cac61fSAlex Elder 					u64 snap_id)
4658b8b1e2dbSAlex Elder {
4659b8b1e2dbSAlex Elder 	size_t size;
4660b8b1e2dbSAlex Elder 	void *reply_buf;
466154cac61fSAlex Elder 	__le64 snapid;
4662b8b1e2dbSAlex Elder 	int ret;
4663b8b1e2dbSAlex Elder 	void *p;
4664b8b1e2dbSAlex Elder 	void *end;
4665b8b1e2dbSAlex Elder 	char *snap_name;
4666b8b1e2dbSAlex Elder 
4667b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4668b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4669b8b1e2dbSAlex Elder 	if (!reply_buf)
4670b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4671b8b1e2dbSAlex Elder 
467254cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
467336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4674b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
467554cac61fSAlex Elder 				&snapid, sizeof (snapid),
4676e2a58ee5SAlex Elder 				reply_buf, size);
467736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4678f40eb349SAlex Elder 	if (ret < 0) {
4679f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4680b8b1e2dbSAlex Elder 		goto out;
4681f40eb349SAlex Elder 	}
4682b8b1e2dbSAlex Elder 
4683b8b1e2dbSAlex Elder 	p = reply_buf;
4684f40eb349SAlex Elder 	end = reply_buf + ret;
4685e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4686f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4687b8b1e2dbSAlex Elder 		goto out;
4688f40eb349SAlex Elder 
4689b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
469054cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4691b8b1e2dbSAlex Elder out:
4692b8b1e2dbSAlex Elder 	kfree(reply_buf);
4693b8b1e2dbSAlex Elder 
4694f40eb349SAlex Elder 	return snap_name;
4695b8b1e2dbSAlex Elder }
4696b8b1e2dbSAlex Elder 
46972df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4698117973fbSAlex Elder {
46992df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4700117973fbSAlex Elder 	int ret;
4701117973fbSAlex Elder 
47021617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
47031617e40cSJosh Durgin 	if (ret)
4704cfbf6377SAlex Elder 		return ret;
47051617e40cSJosh Durgin 
47062df3fac7SAlex Elder 	if (first_time) {
47072df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
47082df3fac7SAlex Elder 		if (ret)
4709cfbf6377SAlex Elder 			return ret;
47102df3fac7SAlex Elder 	}
47112df3fac7SAlex Elder 
4712cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4713d194cd1dSIlya Dryomov 	if (ret && first_time) {
4714d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
4715d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
4716d194cd1dSIlya Dryomov 	}
4717117973fbSAlex Elder 
4718117973fbSAlex Elder 	return ret;
4719117973fbSAlex Elder }
4720117973fbSAlex Elder 
4721a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
4722a720ae09SIlya Dryomov {
4723a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4724a720ae09SIlya Dryomov 
4725a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
4726a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
4727a720ae09SIlya Dryomov 
4728a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
4729a720ae09SIlya Dryomov }
4730a720ae09SIlya Dryomov 
47311ddbe94eSAlex Elder /*
4732499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4733f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
47341ddbe94eSAlex Elder  */
4735f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4736b7f23c36SAlex Elder {
4737f8a22fc2SIlya Dryomov 	int new_dev_id;
4738f8a22fc2SIlya Dryomov 
47399b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
47409b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
47419b60e70bSIlya Dryomov 				    GFP_KERNEL);
4742f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4743f8a22fc2SIlya Dryomov 		return new_dev_id;
4744f8a22fc2SIlya Dryomov 
4745f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4746499afd5bSAlex Elder 
4747499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4748499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4749499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4750f8a22fc2SIlya Dryomov 
475170eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4752f8a22fc2SIlya Dryomov 
4753f8a22fc2SIlya Dryomov 	return 0;
4754b7f23c36SAlex Elder }
4755b7f23c36SAlex Elder 
47561ddbe94eSAlex Elder /*
4757499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4758499afd5bSAlex Elder  * identifier is no longer in use.
47591ddbe94eSAlex Elder  */
4760e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
47611ddbe94eSAlex Elder {
4762499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4763499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4764499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
47651ddbe94eSAlex Elder 
4766f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4767f8a22fc2SIlya Dryomov 
4768f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4769b7f23c36SAlex Elder }
4770b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token that starts there, i.e.
 * the run of non-white-space characters.  A return value of 0 means
 * no token was found.
 *
 * *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token = *buf + strspn(*buf, whitespace);

	*buf = token;

	return strcspn(token, whitespace);
}
4789e28fff26SAlex Elder 
4790e28fff26SAlex Elder /*
4791ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4792ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4793ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4794ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4795ea3352f4SAlex Elder  *
4796ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4797ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4798ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4799ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4800ea3352f4SAlex Elder  *
4801ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4802ea3352f4SAlex Elder  * the end of the found token.
4803ea3352f4SAlex Elder  *
4804ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4805ea3352f4SAlex Elder  */
4806ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4807ea3352f4SAlex Elder {
4808ea3352f4SAlex Elder 	char *dup;
4809ea3352f4SAlex Elder 	size_t len;
4810ea3352f4SAlex Elder 
4811ea3352f4SAlex Elder 	len = next_token(buf);
48124caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4813ea3352f4SAlex Elder 	if (!dup)
4814ea3352f4SAlex Elder 		return NULL;
4815ea3352f4SAlex Elder 	*(dup + len) = '\0';
4816ea3352f4SAlex Elder 	*buf += len;
4817ea3352f4SAlex Elder 
4818ea3352f4SAlex Elder 	if (lenp)
4819ea3352f4SAlex Elder 		*lenp = len;
4820ea3352f4SAlex Elder 
4821ea3352f4SAlex Elder 	return dup;
4822ea3352f4SAlex Elder }
4823ea3352f4SAlex Elder 
4824ea3352f4SAlex Elder /*
4825859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4826859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4827859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4828859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4829d22f76e7SAlex Elder  *
4830859c31dfSAlex Elder  * The information extracted from these options is recorded in
4831859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4832859c31dfSAlex Elder  * structures:
4833859c31dfSAlex Elder  *  ceph_opts
4834859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4835859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4836859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4837859c31dfSAlex Elder  *  rbd_opts
4838859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4839859c31dfSAlex Elder  *	this function; caller must release with kfree().
4840859c31dfSAlex Elder  *  spec
4841859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4842859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4843859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4844859c31dfSAlex Elder  *
4845859c31dfSAlex Elder  * The options passed take this form:
4846859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4847859c31dfSAlex Elder  * where:
4848859c31dfSAlex Elder  *  <mon_addrs>
4849859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4850859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4851859c31dfSAlex Elder  *      by a port number (separated by a colon).
4852859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4853859c31dfSAlex Elder  *  <options>
4854859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4855859c31dfSAlex Elder  *  <pool_name>
4856859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4857859c31dfSAlex Elder  *  <image_name>
4858859c31dfSAlex Elder  *      The name of the image in that pool to map.
4859859c31dfSAlex Elder  *  <snap_id>
4860859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4861859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4862859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4863859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4864a725f65eSAlex Elder  */
4865859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4866dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4867859c31dfSAlex Elder 				struct rbd_options **opts,
4868859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4869a725f65eSAlex Elder {
4870e28fff26SAlex Elder 	size_t len;
4871859c31dfSAlex Elder 	char *options;
48720ddebc0cSAlex Elder 	const char *mon_addrs;
4873ecb4dc22SAlex Elder 	char *snap_name;
48740ddebc0cSAlex Elder 	size_t mon_addrs_size;
4875859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48764e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4877859c31dfSAlex Elder 	struct ceph_options *copts;
4878dc79b113SAlex Elder 	int ret;
4879e28fff26SAlex Elder 
4880e28fff26SAlex Elder 	/* The first four tokens are required */
4881e28fff26SAlex Elder 
48827ef3214aSAlex Elder 	len = next_token(&buf);
48834fb5d671SAlex Elder 	if (!len) {
48844fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
48854fb5d671SAlex Elder 		return -EINVAL;
48864fb5d671SAlex Elder 	}
48870ddebc0cSAlex Elder 	mon_addrs = buf;
4888f28e565aSAlex Elder 	mon_addrs_size = len + 1;
48897ef3214aSAlex Elder 	buf += len;
4890a725f65eSAlex Elder 
4891dc79b113SAlex Elder 	ret = -EINVAL;
4892f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4893f28e565aSAlex Elder 	if (!options)
4894dc79b113SAlex Elder 		return -ENOMEM;
48954fb5d671SAlex Elder 	if (!*options) {
48964fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
48974fb5d671SAlex Elder 		goto out_err;
48984fb5d671SAlex Elder 	}
4899a725f65eSAlex Elder 
4900859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4901859c31dfSAlex Elder 	if (!spec)
4902f28e565aSAlex Elder 		goto out_mem;
4903859c31dfSAlex Elder 
4904859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4905859c31dfSAlex Elder 	if (!spec->pool_name)
4906859c31dfSAlex Elder 		goto out_mem;
49074fb5d671SAlex Elder 	if (!*spec->pool_name) {
49084fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
49094fb5d671SAlex Elder 		goto out_err;
49104fb5d671SAlex Elder 	}
4911e28fff26SAlex Elder 
491269e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4913859c31dfSAlex Elder 	if (!spec->image_name)
4914f28e565aSAlex Elder 		goto out_mem;
49154fb5d671SAlex Elder 	if (!*spec->image_name) {
49164fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
49174fb5d671SAlex Elder 		goto out_err;
49184fb5d671SAlex Elder 	}
4919e28fff26SAlex Elder 
4920f28e565aSAlex Elder 	/*
4921f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4922f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4923f28e565aSAlex Elder 	 */
49243feeb894SAlex Elder 	len = next_token(&buf);
4925820a5f3eSAlex Elder 	if (!len) {
49263feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
49273feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4928f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4929dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4930f28e565aSAlex Elder 		goto out_err;
4931849b4260SAlex Elder 	}
4932ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4933ecb4dc22SAlex Elder 	if (!snap_name)
4934f28e565aSAlex Elder 		goto out_mem;
4935ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4936ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4937e5c35534SAlex Elder 
49380ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4939e28fff26SAlex Elder 
49404e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
49414e9afebaSAlex Elder 	if (!rbd_opts)
49424e9afebaSAlex Elder 		goto out_mem;
49434e9afebaSAlex Elder 
49444e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4945b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
4946d22f76e7SAlex Elder 
4947859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
49480ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
49494e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4950859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4951859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4952dc79b113SAlex Elder 		goto out_err;
4953dc79b113SAlex Elder 	}
4954859c31dfSAlex Elder 	kfree(options);
4955859c31dfSAlex Elder 
4956859c31dfSAlex Elder 	*ceph_opts = copts;
49574e9afebaSAlex Elder 	*opts = rbd_opts;
4958859c31dfSAlex Elder 	*rbd_spec = spec;
49590ddebc0cSAlex Elder 
4960dc79b113SAlex Elder 	return 0;
4961f28e565aSAlex Elder out_mem:
4962dc79b113SAlex Elder 	ret = -ENOMEM;
4963d22f76e7SAlex Elder out_err:
4964859c31dfSAlex Elder 	kfree(rbd_opts);
4965859c31dfSAlex Elder 	rbd_spec_put(spec);
4966f28e565aSAlex Elder 	kfree(options);
4967d22f76e7SAlex Elder 
4968dc79b113SAlex Elder 	return ret;
4969a725f65eSAlex Elder }
4970a725f65eSAlex Elder 
4971589d30e0SAlex Elder /*
497230ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
497330ba1f02SIlya Dryomov  */
497430ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
497530ba1f02SIlya Dryomov {
4976a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
497730ba1f02SIlya Dryomov 	u64 newest_epoch;
497830ba1f02SIlya Dryomov 	int tries = 0;
497930ba1f02SIlya Dryomov 	int ret;
498030ba1f02SIlya Dryomov 
498130ba1f02SIlya Dryomov again:
498230ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
498330ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
498430ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
498530ba1f02SIlya Dryomov 					       &newest_epoch);
498630ba1f02SIlya Dryomov 		if (ret < 0)
498730ba1f02SIlya Dryomov 			return ret;
498830ba1f02SIlya Dryomov 
498930ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
499030ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
499130ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
4992a319bf56SIlya Dryomov 						     newest_epoch,
4993a319bf56SIlya Dryomov 						     opts->mount_timeout);
499430ba1f02SIlya Dryomov 			goto again;
499530ba1f02SIlya Dryomov 		} else {
499630ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
499730ba1f02SIlya Dryomov 			return -ENOENT;
499830ba1f02SIlya Dryomov 		}
499930ba1f02SIlya Dryomov 	}
500030ba1f02SIlya Dryomov 
500130ba1f02SIlya Dryomov 	return ret;
500230ba1f02SIlya Dryomov }
500330ba1f02SIlya Dryomov 
500430ba1f02SIlya Dryomov /*
5005589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5006589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5007589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5008589d30e0SAlex Elder  *
5009589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5010589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5011589d30e0SAlex Elder  * with the supplied name.
5012589d30e0SAlex Elder  *
5013589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5014589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5015589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5016589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5017589d30e0SAlex Elder  */
5018589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5019589d30e0SAlex Elder {
5020589d30e0SAlex Elder 	int ret;
5021589d30e0SAlex Elder 	size_t size;
5022589d30e0SAlex Elder 	char *object_name;
5023589d30e0SAlex Elder 	void *response;
5024c0fba368SAlex Elder 	char *image_id;
50252f82ee54SAlex Elder 
5026589d30e0SAlex Elder 	/*
50272c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
50282c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5029c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5030c0fba368SAlex Elder 	 * do still need to set the image format though.
50312c0d0a10SAlex Elder 	 */
5032c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5033c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5034c0fba368SAlex Elder 
50352c0d0a10SAlex Elder 		return 0;
5036c0fba368SAlex Elder 	}
50372c0d0a10SAlex Elder 
50382c0d0a10SAlex Elder 	/*
5039589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5040589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5041589d30e0SAlex Elder 	 */
504269e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
5043589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
5044589d30e0SAlex Elder 	if (!object_name)
5045589d30e0SAlex Elder 		return -ENOMEM;
50460d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
5047589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
5048589d30e0SAlex Elder 
5049589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5050589d30e0SAlex Elder 
5051589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5052589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5053589d30e0SAlex Elder 	if (!response) {
5054589d30e0SAlex Elder 		ret = -ENOMEM;
5055589d30e0SAlex Elder 		goto out;
5056589d30e0SAlex Elder 	}
5057589d30e0SAlex Elder 
5058c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5059c0fba368SAlex Elder 
506036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
50614157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
5062e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
506336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5064c0fba368SAlex Elder 	if (ret == -ENOENT) {
5065c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5066c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5067c0fba368SAlex Elder 		if (!ret)
5068c0fba368SAlex Elder 			rbd_dev->image_format = 1;
50697dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5070c0fba368SAlex Elder 		void *p = response;
5071589d30e0SAlex Elder 
5072c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5073979ed480SAlex Elder 						NULL, GFP_NOIO);
5074461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5075c0fba368SAlex Elder 		if (!ret)
5076c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5077c0fba368SAlex Elder 	}
5078c0fba368SAlex Elder 
5079c0fba368SAlex Elder 	if (!ret) {
5080c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5081c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5082589d30e0SAlex Elder 	}
5083589d30e0SAlex Elder out:
5084589d30e0SAlex Elder 	kfree(response);
5085589d30e0SAlex Elder 	kfree(object_name);
5086589d30e0SAlex Elder 
5087589d30e0SAlex Elder 	return ret;
5088589d30e0SAlex Elder }
5089589d30e0SAlex Elder 
50903abef3b3SAlex Elder /*
50913abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
50923abef3b3SAlex Elder  * call.
50933abef3b3SAlex Elder  */
50946fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
50956fd48b3bSAlex Elder {
50966fd48b3bSAlex Elder 	struct rbd_image_header	*header;
50976fd48b3bSAlex Elder 
5098a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
50996fd48b3bSAlex Elder 
51006fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
51016fd48b3bSAlex Elder 
51026fd48b3bSAlex Elder 	header = &rbd_dev->header;
5103812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
51046fd48b3bSAlex Elder 	kfree(header->snap_sizes);
51056fd48b3bSAlex Elder 	kfree(header->snap_names);
51066fd48b3bSAlex Elder 	kfree(header->object_prefix);
51076fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
51086fd48b3bSAlex Elder }
51096fd48b3bSAlex Elder 
51102df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5111a30b71b9SAlex Elder {
5112a30b71b9SAlex Elder 	int ret;
5113a30b71b9SAlex Elder 
51141e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
511557385b51SAlex Elder 	if (ret)
51161e130199SAlex Elder 		goto out_err;
5117b1b5402aSAlex Elder 
51182df3fac7SAlex Elder 	/*
51192df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
51202df3fac7SAlex Elder 	 * features are assumed to never change.
51212df3fac7SAlex Elder 	 */
5122b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
512357385b51SAlex Elder 	if (ret)
5124b1b5402aSAlex Elder 		goto out_err;
512535d489f9SAlex Elder 
5126cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5127cc070d59SAlex Elder 
5128cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5129cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5130cc070d59SAlex Elder 		if (ret < 0)
5131cc070d59SAlex Elder 			goto out_err;
5132cc070d59SAlex Elder 	}
51332df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5134a30b71b9SAlex Elder 
513535152979SAlex Elder 	return 0;
51369d475de5SAlex Elder out_err:
5137642a2537SAlex Elder 	rbd_dev->header.features = 0;
51381e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
51391e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
51409d475de5SAlex Elder 
51419d475de5SAlex Elder 	return ret;
5142a30b71b9SAlex Elder }
5143a30b71b9SAlex Elder 
51446d69bb53SIlya Dryomov /*
51456d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
51466d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
51476d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
51486d69bb53SIlya Dryomov  */
51496d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
515083a06263SAlex Elder {
51512f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5152124afba2SAlex Elder 	int ret;
5153124afba2SAlex Elder 
5154124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5155124afba2SAlex Elder 		return 0;
5156124afba2SAlex Elder 
51576d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
51586d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
51596d69bb53SIlya Dryomov 		ret = -EINVAL;
51606d69bb53SIlya Dryomov 		goto out_err;
51616d69bb53SIlya Dryomov 	}
51626d69bb53SIlya Dryomov 
51631f2c6651SIlya Dryomov 	parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec,
51641f2c6651SIlya Dryomov 				NULL);
51651f2c6651SIlya Dryomov 	if (!parent) {
5166124afba2SAlex Elder 		ret = -ENOMEM;
5167124afba2SAlex Elder 		goto out_err;
51681f2c6651SIlya Dryomov 	}
51691f2c6651SIlya Dryomov 
51701f2c6651SIlya Dryomov 	/*
51711f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
51721f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
51731f2c6651SIlya Dryomov 	 */
51741f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
51751f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5176124afba2SAlex Elder 
51776d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5178124afba2SAlex Elder 	if (ret < 0)
5179124afba2SAlex Elder 		goto out_err;
51801f2c6651SIlya Dryomov 
5181124afba2SAlex Elder 	rbd_dev->parent = parent;
5182a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5183124afba2SAlex Elder 	return 0;
5184124afba2SAlex Elder 
51851f2c6651SIlya Dryomov out_err:
51861f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
51871f2c6651SIlya Dryomov 	if (parent)
51881f2c6651SIlya Dryomov 		rbd_dev_destroy(parent);
5189124afba2SAlex Elder 	return ret;
5190124afba2SAlex Elder }
5191124afba2SAlex Elder 
5192200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5193124afba2SAlex Elder {
519483a06263SAlex Elder 	int ret;
519583a06263SAlex Elder 
5196f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
519783a06263SAlex Elder 
5198f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
5199f8a22fc2SIlya Dryomov 	if (ret)
5200f8a22fc2SIlya Dryomov 		return ret;
5201f8a22fc2SIlya Dryomov 
520283a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
520383a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
520483a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
520583a06263SAlex Elder 
52069b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
520783a06263SAlex Elder 
52089b60e70bSIlya Dryomov 	if (!single_major) {
520983a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
521083a06263SAlex Elder 		if (ret < 0)
521183a06263SAlex Elder 			goto err_out_id;
52129b60e70bSIlya Dryomov 
521383a06263SAlex Elder 		rbd_dev->major = ret;
5214dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
52159b60e70bSIlya Dryomov 	} else {
52169b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
52179b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
52189b60e70bSIlya Dryomov 	}
521983a06263SAlex Elder 
522083a06263SAlex Elder 	/* Set up the blkdev mapping. */
522183a06263SAlex Elder 
522283a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
522383a06263SAlex Elder 	if (ret)
522483a06263SAlex Elder 		goto err_out_blkdev;
522583a06263SAlex Elder 
5226f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
522783a06263SAlex Elder 	if (ret)
522883a06263SAlex Elder 		goto err_out_disk;
5229bc1ecc65SIlya Dryomov 
5230f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
523122001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5232f35a4deeSAlex Elder 
5233dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5234dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
5235f35a4deeSAlex Elder 	if (ret)
5236f5ee37bdSIlya Dryomov 		goto err_out_mapping;
523783a06263SAlex Elder 
523883a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
523983a06263SAlex Elder 
5240129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
524183a06263SAlex Elder 	add_disk(rbd_dev->disk);
524283a06263SAlex Elder 
524383a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
524483a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
524583a06263SAlex Elder 
524683a06263SAlex Elder 	return ret;
52472f82ee54SAlex Elder 
5248f35a4deeSAlex Elder err_out_mapping:
5249f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
525083a06263SAlex Elder err_out_disk:
525183a06263SAlex Elder 	rbd_free_disk(rbd_dev);
525283a06263SAlex Elder err_out_blkdev:
52539b60e70bSIlya Dryomov 	if (!single_major)
525483a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
525583a06263SAlex Elder err_out_id:
525683a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
5257d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
525883a06263SAlex Elder 
525983a06263SAlex Elder 	return ret;
526083a06263SAlex Elder }
526183a06263SAlex Elder 
5262332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5263332bb12dSAlex Elder {
5264332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5265332bb12dSAlex Elder 	size_t size;
5266332bb12dSAlex Elder 
5267332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5268332bb12dSAlex Elder 
5269332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5270332bb12dSAlex Elder 
5271332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5272332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5273332bb12dSAlex Elder 	else
5274332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5275332bb12dSAlex Elder 
5276332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5277332bb12dSAlex Elder 	if (!rbd_dev->header_name)
5278332bb12dSAlex Elder 		return -ENOMEM;
5279332bb12dSAlex Elder 
5280332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5281332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5282332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
5283332bb12dSAlex Elder 	else
5284332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5285332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
5286332bb12dSAlex Elder 	return 0;
5287332bb12dSAlex Elder }
5288332bb12dSAlex Elder 
5289200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5290200a6a8bSAlex Elder {
52916fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5292200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
52936fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
52946fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
52956fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
52966fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
52976fd48b3bSAlex Elder 
5298200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5299200a6a8bSAlex Elder }
5300200a6a8bSAlex Elder 
5301a30b71b9SAlex Elder /*
5302a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
53031f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
53041f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
53051f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5306a30b71b9SAlex Elder  */
53076d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5308a30b71b9SAlex Elder {
5309a30b71b9SAlex Elder 	int ret;
5310a30b71b9SAlex Elder 
5311a30b71b9SAlex Elder 	/*
53123abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
53133abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
53143abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
53153abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5316a30b71b9SAlex Elder 	 */
5317a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5318a30b71b9SAlex Elder 	if (ret)
5319c0fba368SAlex Elder 		return ret;
5320c0fba368SAlex Elder 
5321332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5322332bb12dSAlex Elder 	if (ret)
5323332bb12dSAlex Elder 		goto err_out_format;
5324332bb12dSAlex Elder 
53256d69bb53SIlya Dryomov 	if (!depth) {
5326fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
53271fe48023SIlya Dryomov 		if (ret) {
53281fe48023SIlya Dryomov 			if (ret == -ENOENT)
53291fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
53301fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
53311fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5332b644de2bSAlex Elder 			goto out_header_name;
53331f3ef788SAlex Elder 		}
53341fe48023SIlya Dryomov 	}
5335b644de2bSAlex Elder 
5336a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
53375655c4d9SAlex Elder 	if (ret)
5338b644de2bSAlex Elder 		goto err_out_watch;
5339a30b71b9SAlex Elder 
534004077599SIlya Dryomov 	/*
534104077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
534204077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
534304077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
534404077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
534504077599SIlya Dryomov 	 */
53466d69bb53SIlya Dryomov 	if (!depth)
534704077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
534804077599SIlya Dryomov 	else
534904077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
53501fe48023SIlya Dryomov 	if (ret) {
53511fe48023SIlya Dryomov 		if (ret == -ENOENT)
53521fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
53531fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
53541fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
53551fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
535633dca39fSAlex Elder 		goto err_out_probe;
53571fe48023SIlya Dryomov 	}
53589bb81c9bSAlex Elder 
5359e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5360e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5361e8f59b59SIlya Dryomov 		if (ret)
5362e8f59b59SIlya Dryomov 			goto err_out_probe;
5363e8f59b59SIlya Dryomov 
5364e8f59b59SIlya Dryomov 		/*
5365e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
5366e8f59b59SIlya Dryomov 		 * mapped and has a parent.
5367e8f59b59SIlya Dryomov 		 */
53686d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
5369e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
5370e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
5371e8f59b59SIlya Dryomov 	}
5372e8f59b59SIlya Dryomov 
53736d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
537430d60ba2SAlex Elder 	if (ret)
537530d60ba2SAlex Elder 		goto err_out_probe;
537683a06263SAlex Elder 
537730d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
537830d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
537930d60ba2SAlex Elder 	return 0;
5380e8f59b59SIlya Dryomov 
53816fd48b3bSAlex Elder err_out_probe:
53826fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5383b644de2bSAlex Elder err_out_watch:
53846d69bb53SIlya Dryomov 	if (!depth)
5385fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5386332bb12dSAlex Elder out_header_name:
5387332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5388332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5389332bb12dSAlex Elder err_out_format:
5390332bb12dSAlex Elder 	rbd_dev->image_format = 0;
53915655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
53925655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
53935655c4d9SAlex Elder 	return ret;
539483a06263SAlex Elder }
539583a06263SAlex Elder 
53969b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
539759c2be1eSYehuda Sadeh 			  const char *buf,
539859c2be1eSYehuda Sadeh 			  size_t count)
5399602adf40SYehuda Sadeh {
5400cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5401dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
54024e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5403859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
54049d3997fdSAlex Elder 	struct rbd_client *rbdc;
540551344a38SAlex Elder 	bool read_only;
5406b51c83c2SIlya Dryomov 	int rc;
5407602adf40SYehuda Sadeh 
5408602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5409602adf40SYehuda Sadeh 		return -ENODEV;
5410602adf40SYehuda Sadeh 
5411a725f65eSAlex Elder 	/* parse add command */
5412859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5413dc79b113SAlex Elder 	if (rc < 0)
5414dd5ac32dSIlya Dryomov 		goto out;
5415a725f65eSAlex Elder 
54169d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
54179d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
54189d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
54190ddebc0cSAlex Elder 		goto err_out_args;
54209d3997fdSAlex Elder 	}
5421602adf40SYehuda Sadeh 
5422602adf40SYehuda Sadeh 	/* pick the pool */
542330ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
54241fe48023SIlya Dryomov 	if (rc < 0) {
54251fe48023SIlya Dryomov 		if (rc == -ENOENT)
54261fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
5427602adf40SYehuda Sadeh 		goto err_out_client;
54281fe48023SIlya Dryomov 	}
5429859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5430859c31dfSAlex Elder 
54310903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
54320903e875SAlex Elder 
5433c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
54349584d508SIlya Dryomov 		rbd_warn(NULL, "pool id too large (%llu > %u)",
5435c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
54360903e875SAlex Elder 		rc = -EIO;
54370903e875SAlex Elder 		goto err_out_client;
54380903e875SAlex Elder 	}
54390903e875SAlex Elder 
5440d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
5441b51c83c2SIlya Dryomov 	if (!rbd_dev) {
5442b51c83c2SIlya Dryomov 		rc = -ENOMEM;
5443bd4ba655SAlex Elder 		goto err_out_client;
5444b51c83c2SIlya Dryomov 	}
5445c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5446c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5447d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
5448602adf40SYehuda Sadeh 
54496d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
5450a30b71b9SAlex Elder 	if (rc < 0)
5451c53d5893SAlex Elder 		goto err_out_rbd_dev;
545205fd6f6fSAlex Elder 
54537ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
54547ce4eef7SAlex Elder 
5455d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
54567ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
54577ce4eef7SAlex Elder 		read_only = true;
54587ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
54597ce4eef7SAlex Elder 
5460b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
54613abef3b3SAlex Elder 	if (rc) {
5462e37180c0SIlya Dryomov 		/*
5463e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5464e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5465e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5466e37180c0SIlya Dryomov 		 */
5467e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
54683abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
5469dd5ac32dSIlya Dryomov 		goto out;
54703abef3b3SAlex Elder 	}
54713abef3b3SAlex Elder 
5472dd5ac32dSIlya Dryomov 	rc = count;
5473dd5ac32dSIlya Dryomov out:
5474dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
5475dd5ac32dSIlya Dryomov 	return rc;
5476b536f69aSAlex Elder 
5477c53d5893SAlex Elder err_out_rbd_dev:
5478c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5479bd4ba655SAlex Elder err_out_client:
54809d3997fdSAlex Elder 	rbd_put_client(rbdc);
54810ddebc0cSAlex Elder err_out_args:
5482859c31dfSAlex Elder 	rbd_spec_put(spec);
5483d147543dSIlya Dryomov 	kfree(rbd_opts);
5484dd5ac32dSIlya Dryomov 	goto out;
5485602adf40SYehuda Sadeh }
5486602adf40SYehuda Sadeh 
54879b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
54889b60e70bSIlya Dryomov 		       const char *buf,
54899b60e70bSIlya Dryomov 		       size_t count)
54909b60e70bSIlya Dryomov {
54919b60e70bSIlya Dryomov 	if (single_major)
54929b60e70bSIlya Dryomov 		return -EINVAL;
54939b60e70bSIlya Dryomov 
54949b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
54959b60e70bSIlya Dryomov }
54969b60e70bSIlya Dryomov 
54979b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
54989b60e70bSIlya Dryomov 				    const char *buf,
54999b60e70bSIlya Dryomov 				    size_t count)
55009b60e70bSIlya Dryomov {
55019b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
55029b60e70bSIlya Dryomov }
55039b60e70bSIlya Dryomov 
5504dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5505602adf40SYehuda Sadeh {
5506602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5507200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5508dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
55096d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
55109b60e70bSIlya Dryomov 	if (!single_major)
5511602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5512e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5513d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5514602adf40SYehuda Sadeh }
5515602adf40SYehuda Sadeh 
551605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
551705a46afdSAlex Elder {
5518ad945fc1SAlex Elder 	while (rbd_dev->parent) {
551905a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
552005a46afdSAlex Elder 		struct rbd_device *second = first->parent;
552105a46afdSAlex Elder 		struct rbd_device *third;
552205a46afdSAlex Elder 
552305a46afdSAlex Elder 		/*
552405a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
552505a46afdSAlex Elder 		 * remove it.
552605a46afdSAlex Elder 		 */
552705a46afdSAlex Elder 		while (second && (third = second->parent)) {
552805a46afdSAlex Elder 			first = second;
552905a46afdSAlex Elder 			second = third;
553005a46afdSAlex Elder 		}
5531ad945fc1SAlex Elder 		rbd_assert(second);
55328ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5533ad945fc1SAlex Elder 		first->parent = NULL;
5534ad945fc1SAlex Elder 		first->parent_overlap = 0;
5535ad945fc1SAlex Elder 
5536ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
553705a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
553805a46afdSAlex Elder 		first->parent_spec = NULL;
553905a46afdSAlex Elder 	}
554005a46afdSAlex Elder }
554105a46afdSAlex Elder 
55429b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5543602adf40SYehuda Sadeh 			     const char *buf,
5544602adf40SYehuda Sadeh 			     size_t count)
5545602adf40SYehuda Sadeh {
5546602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5547751cc0e3SAlex Elder 	struct list_head *tmp;
5548751cc0e3SAlex Elder 	int dev_id;
5549602adf40SYehuda Sadeh 	unsigned long ul;
555082a442d2SAlex Elder 	bool already = false;
55510d8189e1SAlex Elder 	int ret;
5552602adf40SYehuda Sadeh 
5553bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
55540d8189e1SAlex Elder 	if (ret)
55550d8189e1SAlex Elder 		return ret;
5556602adf40SYehuda Sadeh 
5557602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5558751cc0e3SAlex Elder 	dev_id = (int)ul;
5559751cc0e3SAlex Elder 	if (dev_id != ul)
5560602adf40SYehuda Sadeh 		return -EINVAL;
5561602adf40SYehuda Sadeh 
5562602adf40SYehuda Sadeh 	ret = -ENOENT;
5563751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5564751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5565751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5566751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5567751cc0e3SAlex Elder 			ret = 0;
5568751cc0e3SAlex Elder 			break;
5569602adf40SYehuda Sadeh 		}
5570751cc0e3SAlex Elder 	}
5571751cc0e3SAlex Elder 	if (!ret) {
5572a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5573b82d167bSAlex Elder 		if (rbd_dev->open_count)
557442382b70SAlex Elder 			ret = -EBUSY;
5575b82d167bSAlex Elder 		else
557682a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
557782a442d2SAlex Elder 							&rbd_dev->flags);
5578a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5579751cc0e3SAlex Elder 	}
5580751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
558182a442d2SAlex Elder 	if (ret < 0 || already)
55821ba0f1e7SAlex Elder 		return ret;
5583751cc0e3SAlex Elder 
5584fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
55859abc5990SJosh Durgin 	/*
55869abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
55879abc5990SJosh Durgin 	 * before the osd_client is shutdown
55889abc5990SJosh Durgin 	 */
55899abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
55909abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5591fca27065SIlya Dryomov 
55929875201eSJosh Durgin 	/*
55939875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
55949875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
55959875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
55969875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
55979875201eSJosh Durgin 	 */
5598dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
55998ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
5600aafb230eSAlex Elder 
56011ba0f1e7SAlex Elder 	return count;
5602602adf40SYehuda Sadeh }
5603602adf40SYehuda Sadeh 
56049b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
56059b60e70bSIlya Dryomov 			  const char *buf,
56069b60e70bSIlya Dryomov 			  size_t count)
56079b60e70bSIlya Dryomov {
56089b60e70bSIlya Dryomov 	if (single_major)
56099b60e70bSIlya Dryomov 		return -EINVAL;
56109b60e70bSIlya Dryomov 
56119b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56129b60e70bSIlya Dryomov }
56139b60e70bSIlya Dryomov 
56149b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
56159b60e70bSIlya Dryomov 				       const char *buf,
56169b60e70bSIlya Dryomov 				       size_t count)
56179b60e70bSIlya Dryomov {
56189b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56199b60e70bSIlya Dryomov }
56209b60e70bSIlya Dryomov 
5621602adf40SYehuda Sadeh /*
5622602adf40SYehuda Sadeh  * create control files in sysfs
5623dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5624602adf40SYehuda Sadeh  */
5625602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5626602adf40SYehuda Sadeh {
5627dfc5606dSYehuda Sadeh 	int ret;
5628602adf40SYehuda Sadeh 
5629fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5630dfc5606dSYehuda Sadeh 	if (ret < 0)
5631dfc5606dSYehuda Sadeh 		return ret;
5632602adf40SYehuda Sadeh 
5633fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5634fed4c143SAlex Elder 	if (ret < 0)
5635fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5636602adf40SYehuda Sadeh 
5637602adf40SYehuda Sadeh 	return ret;
5638602adf40SYehuda Sadeh }
5639602adf40SYehuda Sadeh 
5640602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5641602adf40SYehuda Sadeh {
5642dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5643fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5644602adf40SYehuda Sadeh }
5645602adf40SYehuda Sadeh 
56461c2a9dfeSAlex Elder static int rbd_slab_init(void)
56471c2a9dfeSAlex Elder {
56481c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
56491c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
56501c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
56511c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
56521c2a9dfeSAlex Elder 					0, NULL);
5653868311b1SAlex Elder 	if (!rbd_img_request_cache)
5654868311b1SAlex Elder 		return -ENOMEM;
5655868311b1SAlex Elder 
5656868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5657868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5658868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5659868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5660868311b1SAlex Elder 					0, NULL);
566178c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
566278c2a44aSAlex Elder 		goto out_err;
566378c2a44aSAlex Elder 
566478c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
566578c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
56662d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
566778c2a44aSAlex Elder 	if (rbd_segment_name_cache)
56681c2a9dfeSAlex Elder 		return 0;
566978c2a44aSAlex Elder out_err:
567078c2a44aSAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
567178c2a44aSAlex Elder 	rbd_obj_request_cache = NULL;
56721c2a9dfeSAlex Elder 
5673868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5674868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5675868311b1SAlex Elder 
56761c2a9dfeSAlex Elder 	return -ENOMEM;
56771c2a9dfeSAlex Elder }
56781c2a9dfeSAlex Elder 
56791c2a9dfeSAlex Elder static void rbd_slab_exit(void)
56801c2a9dfeSAlex Elder {
568178c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
568278c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
568378c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
568478c2a44aSAlex Elder 
5685868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5686868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5687868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5688868311b1SAlex Elder 
56891c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
56901c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
56911c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
56921c2a9dfeSAlex Elder }
56931c2a9dfeSAlex Elder 
5694cc344fa1SAlex Elder static int __init rbd_init(void)
5695602adf40SYehuda Sadeh {
5696602adf40SYehuda Sadeh 	int rc;
5697602adf40SYehuda Sadeh 
56981e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
56991e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
57001e32d34cSAlex Elder 		return -EINVAL;
57011e32d34cSAlex Elder 	}
5702e1b4d96dSIlya Dryomov 
57031c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5704602adf40SYehuda Sadeh 	if (rc)
5705602adf40SYehuda Sadeh 		return rc;
5706e1b4d96dSIlya Dryomov 
5707f5ee37bdSIlya Dryomov 	/*
5708f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
5709f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
5710f5ee37bdSIlya Dryomov 	 */
5711f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
5712f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
5713f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
5714f5ee37bdSIlya Dryomov 		goto err_out_slab;
5715f5ee37bdSIlya Dryomov 	}
5716f5ee37bdSIlya Dryomov 
57179b60e70bSIlya Dryomov 	if (single_major) {
57189b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
57199b60e70bSIlya Dryomov 		if (rbd_major < 0) {
57209b60e70bSIlya Dryomov 			rc = rbd_major;
5721f5ee37bdSIlya Dryomov 			goto err_out_wq;
57229b60e70bSIlya Dryomov 		}
57239b60e70bSIlya Dryomov 	}
57249b60e70bSIlya Dryomov 
57251c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
57261c2a9dfeSAlex Elder 	if (rc)
57279b60e70bSIlya Dryomov 		goto err_out_blkdev;
57281c2a9dfeSAlex Elder 
57299b60e70bSIlya Dryomov 	if (single_major)
57309b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
57319b60e70bSIlya Dryomov 	else
5732e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
57339b60e70bSIlya Dryomov 
5734e1b4d96dSIlya Dryomov 	return 0;
5735e1b4d96dSIlya Dryomov 
57369b60e70bSIlya Dryomov err_out_blkdev:
57379b60e70bSIlya Dryomov 	if (single_major)
57389b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5739f5ee37bdSIlya Dryomov err_out_wq:
5740f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
5741e1b4d96dSIlya Dryomov err_out_slab:
5742e1b4d96dSIlya Dryomov 	rbd_slab_exit();
57431c2a9dfeSAlex Elder 	return rc;
5744602adf40SYehuda Sadeh }
5745602adf40SYehuda Sadeh 
5746cc344fa1SAlex Elder static void __exit rbd_exit(void)
5747602adf40SYehuda Sadeh {
5748ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
5749602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
57509b60e70bSIlya Dryomov 	if (single_major)
57519b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5752f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
57531c2a9dfeSAlex Elder 	rbd_slab_exit();
5754602adf40SYehuda Sadeh }
5755602adf40SYehuda Sadeh 
5756602adf40SYehuda Sadeh module_init(rbd_init);
5757602adf40SYehuda Sadeh module_exit(rbd_exit);
5758602adf40SYehuda Sadeh 
5759d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5760602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5761602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5762602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5763602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5764602adf40SYehuda Sadeh 
576590da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5766602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5767