xref: /openbmc/linux/drivers/block/rbd.c (revision bae818ee)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
417ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
42602adf40SYehuda Sadeh #include <linux/fs.h>
43602adf40SYehuda Sadeh #include <linux/blkdev.h>
441c2a9dfeSAlex Elder #include <linux/slab.h>
45f8a22fc2SIlya Dryomov #include <linux/idr.h>
46bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
47602adf40SYehuda Sadeh 
48602adf40SYehuda Sadeh #include "rbd_types.h"
49602adf40SYehuda Sadeh 
50aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
51aafb230eSAlex Elder 
52593a9e7bSAlex Elder /*
53593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
54593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
55593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
56593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
57593a9e7bSAlex Elder  */
58593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
59593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
60593a9e7bSAlex Elder 
61a2acd00eSAlex Elder /*
62a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
63a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
64a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
65a2acd00eSAlex Elder  * -EINVAL without updating it.
66a2acd00eSAlex Elder  */
67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
68a2acd00eSAlex Elder {
69a2acd00eSAlex Elder 	unsigned int counter;
70a2acd00eSAlex Elder 
71a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
72a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
73a2acd00eSAlex Elder 		return (int)counter;
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	atomic_dec(v);
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder 	return -EINVAL;
78a2acd00eSAlex Elder }
79a2acd00eSAlex Elder 
80a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
82a2acd00eSAlex Elder {
83a2acd00eSAlex Elder 	int counter;
84a2acd00eSAlex Elder 
85a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
86a2acd00eSAlex Elder 	if (counter >= 0)
87a2acd00eSAlex Elder 		return counter;
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	atomic_inc(v);
90a2acd00eSAlex Elder 
91a2acd00eSAlex Elder 	return -EINVAL;
92a2acd00eSAlex Elder }
93a2acd00eSAlex Elder 
94f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
95602adf40SYehuda Sadeh 
967e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR		256
977e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT	4
98602adf40SYehuda Sadeh 
996d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN	16
1006d69bb53SIlya Dryomov 
101d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
102d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
103d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
104d4b125e9SAlex Elder 
10535d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
106602adf40SYehuda Sadeh 
107602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
108602adf40SYehuda Sadeh 
1099682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1109682fc6dSAlex Elder 
1119e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1129e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
113589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1149e15b77dSAlex Elder 
1151e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
116589d30e0SAlex Elder 
117d889140cSAlex Elder /* Feature bits */
118d889140cSAlex Elder 
1195cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
1205cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
1215cbf6f12SAlex Elder #define RBD_FEATURES_ALL \
1225cbf6f12SAlex Elder 	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
123d889140cSAlex Elder 
124d889140cSAlex Elder /* Features supported by this (client software) implementation. */
125d889140cSAlex Elder 
126770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
127d889140cSAlex Elder 
12881a89793SAlex Elder /*
12981a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
13081a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
13181a89793SAlex Elder  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
13281a89793SAlex Elder  * enough to hold all possible device names.
13381a89793SAlex Elder  */
134602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
13581a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
136602adf40SYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 *
 * The fields are split into a group that is fixed for the lifetime of
 * an image and a group that may be refreshed.  rbd_device.header_rwsem
 * is documented as protecting updates of the header.
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;
	u64 stripe_unit;
	u64 stripe_count;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};
15659c2be1eSYehuda Sadeh 
1570d7dbfceSAlex Elder /*
1580d7dbfceSAlex Elder  * An rbd image specification.
1590d7dbfceSAlex Elder  *
1600d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
161c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
162c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
163c66c6e0cSAlex Elder  *
164c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
165c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
166c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
167c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
168c66c6e0cSAlex Elder  *
169c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
170c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
171c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
172c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
173c66c6e0cSAlex Elder  * is shared between the parent and child).
174c66c6e0cSAlex Elder  *
175c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
176c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
177c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
178c66c6e0cSAlex Elder  *
179c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
180c66c6e0cSAlex Elder  * could be a null pointer).
1810d7dbfceSAlex Elder  */
/*
 * Each id below is paired with the name it corresponds to.  The
 * structure is reference counted through its kref member.
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;	/* may be a null pointer */

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;		/* presumably dropped via rbd_spec_put() */
};
1940d7dbfceSAlex Elder 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 *
 * Sharing is tracked with the kref member.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;	/* presumably links into rbd_client_list */
};
203602adf40SYehuda Sadeh 
204bf0d5f50SAlex Elder struct rbd_img_request;
205bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
206bf0d5f50SAlex Elder 
207bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
208bf0d5f50SAlex Elder 
209bf0d5f50SAlex Elder struct rbd_obj_request;
210bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
211bf0d5f50SAlex Elder 
/*
 * Kind of data attached to an object request (stored in
 * rbd_obj_request.type): none, a bio list, or a page array.
 * Presumably selects which member of the bio_list/pages union in
 * struct rbd_obj_request is meaningful -- confirm against users.
 */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
215bf0d5f50SAlex Elder 
/* Kind of operation a request performs: write, read or discard. */
enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2216d2940c8SGuangliang Zhao 
/*
 * Flag values for an object request; presumably used as bit numbers
 * in rbd_obj_request.flags -- confirm against the accessors.
 */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
228926f9b3fSAlex Elder 
/*
 * A request against a single named object, covering "length" bytes
 * starting at byte "offset" within that object.  The structure is
 * reference counted (kref) and carries a completion plus an optional
 * callback of type rbd_obj_callback_t.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;		/* see enum obj_req_flags */

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	/* "type" names the kind of data; the union holds the data itself */
	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
283bf0d5f50SAlex Elder 
/*
 * Flag values for an image request; presumably used as bit numbers
 * in rbd_img_request.flags -- confirm against the accessors.
 */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};
2900c425248SAlex Elder 
/*
 * A request against an rbd image, covering "length" bytes starting
 * at image byte "offset".  It owns a list of object requests
 * (obj_requests, obj_request_count entries) and is reference counted
 * through kref.
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;	/* see enum img_req_flags */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
317bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list (through rbd_obj_request.links):
 *
 *   for_each_obj_request()       walks the whole list, front to back.
 *   for_each_obj_request_from()  continues forward from the current oreq.
 *   for_each_obj_request_safe()  walks the list in REVERSE order (it is
 *                                built on list_for_each_entry_safe_reverse())
 *                                using n as the lookahead cursor.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
324bf0d5f50SAlex Elder 
/*
 * Properties of a mapped image: its size, feature bits, and whether
 * the mapping is read-only (read_only is checked in rbd_open() and
 * changed by rbd_ioctl_set_ro()).
 */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
330f84344f3SAlex Elder 
/*
 * a single device
 *
 * Embeds the struct device used for sysfs, the in-memory image header
 * and the block layer tag set.  "lock" guards flags and open_count as
 * noted on those fields.
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* Layering: identity and state of the parent image, if any */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
379dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.  rbd_open() tests it
 * with rbd_dev->lock held before incrementing open_count.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3916d292906SAlex Elder 
392cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
393e124a82fSAlex Elder 
394602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
395e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
396e124a82fSAlex Elder 
397602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
398432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
399602adf40SYehuda Sadeh 
40078c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
40178c2a44aSAlex Elder 
4021c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
403868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
40478c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
4051c2a9dfeSAlex Elder 
4069b60e70bSIlya Dryomov static int rbd_major;
407f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
408f8a22fc2SIlya Dryomov 
409f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
410f5ee37bdSIlya Dryomov 
4119b60e70bSIlya Dryomov /*
4129b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4139b60e70bSIlya Dryomov  * userspace rbd utility.
4149b60e70bSIlya Dryomov  */
4159b60e70bSIlya Dryomov static bool single_major = false;
4169b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4179b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4189b60e70bSIlya Dryomov 
4193d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4203d7efd18SAlex Elder 
421200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
422dfc5606dSYehuda Sadeh 
423f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
424f0f8cef5SAlex Elder 		       size_t count);
425f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
426f0f8cef5SAlex Elder 			  size_t count);
4279b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4289b60e70bSIlya Dryomov 				    size_t count);
4299b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4309b60e70bSIlya Dryomov 				       size_t count);
4316d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
432a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
433f0f8cef5SAlex Elder 
4349b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4359b60e70bSIlya Dryomov {
4367e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4379b60e70bSIlya Dryomov }
4389b60e70bSIlya Dryomov 
4399b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4409b60e70bSIlya Dryomov {
4417e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4429b60e70bSIlya Dryomov }
4439b60e70bSIlya Dryomov 
/*
 * Bus attributes.  Each is declared with mode S_IWUSR, a NULL show
 * handler and a store handler, i.e. they only accept writes.
 */
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);

/* NULL-terminated list of the attributes above; see rbd_bus_is_visible() */
static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};
45692c76dc0SIlya Dryomov 
45792c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
45892c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
45992c76dc0SIlya Dryomov {
4609b60e70bSIlya Dryomov 	if (!single_major &&
4619b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4629b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4639b60e70bSIlya Dryomov 		return 0;
4649b60e70bSIlya Dryomov 
46592c76dc0SIlya Dryomov 	return attr->mode;
46692c76dc0SIlya Dryomov }
46792c76dc0SIlya Dryomov 
/*
 * Attribute group for the rbd bus: the attributes in rbd_bus_attrs,
 * filtered through rbd_bus_is_visible().  __ATTRIBUTE_GROUPS(rbd_bus)
 * provides the rbd_bus_groups symbol referenced by rbd_bus_type.
 */
static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);
473f0f8cef5SAlex Elder 
/* The "rbd" bus, carrying the bus-level attribute groups. */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};
478f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  The body is empty: rbd_root_dev
 * is a statically defined object in this file, so there is nothing
 * here to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
482f0f8cef5SAlex Elder 
/*
 * Statically allocated device named "rbd", with an empty release
 * callback.  Presumably serves as the parent of the per-image
 * devices -- confirm where it is registered.
 */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
487f0f8cef5SAlex Elder 
48806ecc6cbSAlex Elder static __printf(2, 3)
48906ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
49006ecc6cbSAlex Elder {
49106ecc6cbSAlex Elder 	struct va_format vaf;
49206ecc6cbSAlex Elder 	va_list args;
49306ecc6cbSAlex Elder 
49406ecc6cbSAlex Elder 	va_start(args, fmt);
49506ecc6cbSAlex Elder 	vaf.fmt = fmt;
49606ecc6cbSAlex Elder 	vaf.va = &args;
49706ecc6cbSAlex Elder 
49806ecc6cbSAlex Elder 	if (!rbd_dev)
49906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
50006ecc6cbSAlex Elder 	else if (rbd_dev->disk)
50106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
50206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
50306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
50406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
50506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
50606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
50706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
50806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
50906ecc6cbSAlex Elder 	else	/* punt */
51006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
51106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
51206ecc6cbSAlex Elder 	va_end(args);
51306ecc6cbSAlex Elder }
51406ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the expression text at KERN_ERR and then calls BUG().
 * Without RBD_DEBUG the macro expands to a no-op expression and expr
 * is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it acts as
 * a single statement: "if (x) rbd_assert(y); else ..." compiles and
 * the else binds to the outer if, exactly as with the non-debug
 * variant.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
527dfc5606dSYehuda Sadeh 
5282761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
529b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
53005a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
53105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5328b3e1a56SAlex Elder 
533cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
535a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
536e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
53754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
53854cac61fSAlex Elder 					u64 snap_id);
5392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5402ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5412ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5422ad3d716SAlex Elder 		u64 *snap_features);
5432ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
54459c2be1eSYehuda Sadeh 
545602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
546602adf40SYehuda Sadeh {
547f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
548b82d167bSAlex Elder 	bool removing = false;
549602adf40SYehuda Sadeh 
550f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
551602adf40SYehuda Sadeh 		return -EROFS;
552602adf40SYehuda Sadeh 
553a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
554b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
555b82d167bSAlex Elder 		removing = true;
556b82d167bSAlex Elder 	else
557b82d167bSAlex Elder 		rbd_dev->open_count++;
558a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
559b82d167bSAlex Elder 	if (removing)
560b82d167bSAlex Elder 		return -ENOENT;
561b82d167bSAlex Elder 
562c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
563340c7a2bSAlex Elder 
564602adf40SYehuda Sadeh 	return 0;
565602adf40SYehuda Sadeh }
566602adf40SYehuda Sadeh 
567db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
568dfc5606dSYehuda Sadeh {
569dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
570b82d167bSAlex Elder 	unsigned long open_count_before;
571b82d167bSAlex Elder 
572a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
573b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
574a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
575b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
576dfc5606dSYehuda Sadeh 
577c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
578dfc5606dSYehuda Sadeh }
579dfc5606dSYehuda Sadeh 
580131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
581131fd9f6SGuangliang Zhao {
58277f33c03SJosh Durgin 	int ret = 0;
583131fd9f6SGuangliang Zhao 	int val;
584131fd9f6SGuangliang Zhao 	bool ro;
58577f33c03SJosh Durgin 	bool ro_changed = false;
586131fd9f6SGuangliang Zhao 
58777f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
588131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
589131fd9f6SGuangliang Zhao 		return -EFAULT;
590131fd9f6SGuangliang Zhao 
591131fd9f6SGuangliang Zhao 	ro = val ? true : false;
592131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
593131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
594131fd9f6SGuangliang Zhao 		return -EROFS;
595131fd9f6SGuangliang Zhao 
59677f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
59777f33c03SJosh Durgin 	/* prevent others open this device */
59877f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
59977f33c03SJosh Durgin 		ret = -EBUSY;
60077f33c03SJosh Durgin 		goto out;
601131fd9f6SGuangliang Zhao 	}
602131fd9f6SGuangliang Zhao 
60377f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
60477f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
60577f33c03SJosh Durgin 		ro_changed = true;
60677f33c03SJosh Durgin 	}
60777f33c03SJosh Durgin 
60877f33c03SJosh Durgin out:
60977f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
61077f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
61177f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
61277f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
61377f33c03SJosh Durgin 
61477f33c03SJosh Durgin 	return ret;
615131fd9f6SGuangliang Zhao }
616131fd9f6SGuangliang Zhao 
617131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
618131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
619131fd9f6SGuangliang Zhao {
620131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
621131fd9f6SGuangliang Zhao 	int ret = 0;
622131fd9f6SGuangliang Zhao 
623131fd9f6SGuangliang Zhao 	switch (cmd) {
624131fd9f6SGuangliang Zhao 	case BLKROSET:
625131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
626131fd9f6SGuangliang Zhao 		break;
627131fd9f6SGuangliang Zhao 	default:
628131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
629131fd9f6SGuangliang Zhao 	}
630131fd9f6SGuangliang Zhao 
631131fd9f6SGuangliang Zhao 	return ret;
632131fd9f6SGuangliang Zhao }
633131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: hands every argument unchanged to
 * rbd_ioctl() and returns its result.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret;

	ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
641131fd9f6SGuangliang Zhao 
642602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
643602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
644602adf40SYehuda Sadeh 	.open			= rbd_open,
645dfc5606dSYehuda Sadeh 	.release		= rbd_release,
646131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
647131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
648131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
649131fd9f6SGuangliang Zhao #endif
650602adf40SYehuda Sadeh };
651602adf40SYehuda Sadeh 
652602adf40SYehuda Sadeh /*
6537262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
654cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
655602adf40SYehuda Sadeh  */
656f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
657602adf40SYehuda Sadeh {
658602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
659602adf40SYehuda Sadeh 	int ret = -ENOMEM;
660602adf40SYehuda Sadeh 
66137206ee5SAlex Elder 	dout("%s:\n", __func__);
662602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
663602adf40SYehuda Sadeh 	if (!rbdc)
664602adf40SYehuda Sadeh 		goto out_opt;
665602adf40SYehuda Sadeh 
666602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
667602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
668602adf40SYehuda Sadeh 
66943ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
670602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
67108f75463SAlex Elder 		goto out_rbdc;
67243ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
673602adf40SYehuda Sadeh 
674602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
675602adf40SYehuda Sadeh 	if (ret < 0)
67608f75463SAlex Elder 		goto out_client;
677602adf40SYehuda Sadeh 
678432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
679602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
680432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
681602adf40SYehuda Sadeh 
68237206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
683bc534d86SAlex Elder 
684602adf40SYehuda Sadeh 	return rbdc;
68508f75463SAlex Elder out_client:
686602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
68708f75463SAlex Elder out_rbdc:
688602adf40SYehuda Sadeh 	kfree(rbdc);
689602adf40SYehuda Sadeh out_opt:
69043ae4701SAlex Elder 	if (ceph_opts)
69143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
69237206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
69337206ee5SAlex Elder 
69428f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
695602adf40SYehuda Sadeh }
696602adf40SYehuda Sadeh 
6972f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6982f82ee54SAlex Elder {
6992f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7002f82ee54SAlex Elder 
7012f82ee54SAlex Elder 	return rbdc;
7022f82ee54SAlex Elder }
7032f82ee54SAlex Elder 
704602adf40SYehuda Sadeh /*
7051f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7061f7ba331SAlex Elder  * found, bump its reference count.
707602adf40SYehuda Sadeh  */
7081f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
709602adf40SYehuda Sadeh {
710602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7111f7ba331SAlex Elder 	bool found = false;
712602adf40SYehuda Sadeh 
71343ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
714602adf40SYehuda Sadeh 		return NULL;
715602adf40SYehuda Sadeh 
7161f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7171f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7181f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7192f82ee54SAlex Elder 			__rbd_get_client(client_node);
7202f82ee54SAlex Elder 
7211f7ba331SAlex Elder 			found = true;
7221f7ba331SAlex Elder 			break;
7231f7ba331SAlex Elder 		}
7241f7ba331SAlex Elder 	}
7251f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7261f7ba331SAlex Elder 
7271f7ba331SAlex Elder 	return found ? client_node : NULL;
728602adf40SYehuda Sadeh }
729602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Opt_last_int and Opt_last_string are range markers used by
 * parse_rbd_opts_token(): tokens below Opt_last_int take an int
 * argument, tokens between the two markers take a string argument.
 */
enum {
	/* int args */
	Opt_queue_depth,
	Opt_last_int,

	/* string args (none at present) */
	Opt_last_string,

	/* flag options, no argument */
	Opt_read_only,
	Opt_read_write,

	Opt_err
};
74359c2be1eSYehuda Sadeh 
74443ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
745b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
74659c2be1eSYehuda Sadeh 	/* int args above */
74759c2be1eSYehuda Sadeh 	/* string args above */
748be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
749cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
750cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
751cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
752210c104cSIlya Dryomov 	{Opt_err, NULL}
75359c2be1eSYehuda Sadeh };
75459c2be1eSYehuda Sadeh 
75598571b5aSAlex Elder struct rbd_options {
756b5584180SIlya Dryomov 	int	queue_depth;
75798571b5aSAlex Elder 	bool	read_only;
75898571b5aSAlex Elder };
75998571b5aSAlex Elder 
760b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
76198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
76298571b5aSAlex Elder 
76359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
76459c2be1eSYehuda Sadeh {
76543ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
76659c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
76759c2be1eSYehuda Sadeh 	int token, intval, ret;
76859c2be1eSYehuda Sadeh 
76943ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
77059c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
77159c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
77259c2be1eSYehuda Sadeh 		if (ret < 0) {
773210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
77459c2be1eSYehuda Sadeh 			return ret;
77559c2be1eSYehuda Sadeh 		}
77659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
77759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
778210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
77959c2be1eSYehuda Sadeh 	} else {
78059c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
78159c2be1eSYehuda Sadeh 	}
78259c2be1eSYehuda Sadeh 
78359c2be1eSYehuda Sadeh 	switch (token) {
784b5584180SIlya Dryomov 	case Opt_queue_depth:
785b5584180SIlya Dryomov 		if (intval < 1) {
786b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
787b5584180SIlya Dryomov 			return -EINVAL;
788b5584180SIlya Dryomov 		}
789b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
790b5584180SIlya Dryomov 		break;
791cc0538b6SAlex Elder 	case Opt_read_only:
792cc0538b6SAlex Elder 		rbd_opts->read_only = true;
793cc0538b6SAlex Elder 		break;
794cc0538b6SAlex Elder 	case Opt_read_write:
795cc0538b6SAlex Elder 		rbd_opts->read_only = false;
796cc0538b6SAlex Elder 		break;
79759c2be1eSYehuda Sadeh 	default:
798210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
799210c104cSIlya Dryomov 		return -EINVAL;
80059c2be1eSYehuda Sadeh 	}
801210c104cSIlya Dryomov 
80259c2be1eSYehuda Sadeh 	return 0;
80359c2be1eSYehuda Sadeh }
80459c2be1eSYehuda Sadeh 
8056d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8066d2940c8SGuangliang Zhao {
8076d2940c8SGuangliang Zhao 	switch (op_type) {
8086d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8096d2940c8SGuangliang Zhao 		return "read";
8106d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8116d2940c8SGuangliang Zhao 		return "write";
81290e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
81390e98c52SGuangliang Zhao 		return "discard";
8146d2940c8SGuangliang Zhao 	default:
8156d2940c8SGuangliang Zhao 		return "???";
8166d2940c8SGuangliang Zhao 	}
8176d2940c8SGuangliang Zhao }
8186d2940c8SGuangliang Zhao 
81959c2be1eSYehuda Sadeh /*
820602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8217262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8227262cfcaSAlex Elder  * function.
823602adf40SYehuda Sadeh  */
8249d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
825602adf40SYehuda Sadeh {
826f8c38929SAlex Elder 	struct rbd_client *rbdc;
82759c2be1eSYehuda Sadeh 
828cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8291f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8309d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
83143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8329d3997fdSAlex Elder 	else
833f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
834cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
835d720bcb0SAlex Elder 
8369d3997fdSAlex Elder 	return rbdc;
837602adf40SYehuda Sadeh }
838602adf40SYehuda Sadeh 
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself; the caller must not hold it.
 */
844602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
845602adf40SYehuda Sadeh {
846602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
847602adf40SYehuda Sadeh 
84837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
849cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
850602adf40SYehuda Sadeh 	list_del(&rbdc->node);
851cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
852602adf40SYehuda Sadeh 
853602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
854602adf40SYehuda Sadeh 	kfree(rbdc);
855602adf40SYehuda Sadeh }
856602adf40SYehuda Sadeh 
857602adf40SYehuda Sadeh /*
858602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
859602adf40SYehuda Sadeh  * it.
860602adf40SYehuda Sadeh  */
8619d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
862602adf40SYehuda Sadeh {
863c53d5893SAlex Elder 	if (rbdc)
8649d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
865602adf40SYehuda Sadeh }
866602adf40SYehuda Sadeh 
867a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
868a30b71b9SAlex Elder {
869a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
870a30b71b9SAlex Elder }
871a30b71b9SAlex Elder 
8728e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8738e94af8eSAlex Elder {
874103a150fSAlex Elder 	size_t size;
875103a150fSAlex Elder 	u32 snap_count;
876103a150fSAlex Elder 
877103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
878103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
879103a150fSAlex Elder 		return false;
880103a150fSAlex Elder 
881db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
882db2388b6SAlex Elder 
883db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
884db2388b6SAlex Elder 		return false;
885db2388b6SAlex Elder 
886db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
887db2388b6SAlex Elder 
888db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
889db2388b6SAlex Elder 		return false;
890db2388b6SAlex Elder 
891103a150fSAlex Elder 	/*
892103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
893103a150fSAlex Elder 	 * that limits the number of snapshots.
894103a150fSAlex Elder 	 */
895103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
896103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
897103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
898103a150fSAlex Elder 		return false;
899103a150fSAlex Elder 
900103a150fSAlex Elder 	/*
901103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
902103a150fSAlex Elder 	 * header must also be representable in a size_t.
903103a150fSAlex Elder 	 */
904103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
905103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
906103a150fSAlex Elder 		return false;
907103a150fSAlex Elder 
908103a150fSAlex Elder 	return true;
9098e94af8eSAlex Elder }
9108e94af8eSAlex Elder 
911602adf40SYehuda Sadeh /*
912bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
913bb23e37aSAlex Elder  * on-disk header.
914602adf40SYehuda Sadeh  */
915662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9164156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
917602adf40SYehuda Sadeh {
918662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
919bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
920bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
921bb23e37aSAlex Elder 	char *object_prefix = NULL;
922bb23e37aSAlex Elder 	char *snap_names = NULL;
923bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
924ccece235SAlex Elder 	u32 snap_count;
925d2bb24e5SAlex Elder 	size_t size;
926bb23e37aSAlex Elder 	int ret = -ENOMEM;
927621901d6SAlex Elder 	u32 i;
928602adf40SYehuda Sadeh 
929bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
930103a150fSAlex Elder 
931bb23e37aSAlex Elder 	if (first_time) {
932bb23e37aSAlex Elder 		size_t len;
933bb23e37aSAlex Elder 
934bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
935bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
936bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
937bb23e37aSAlex Elder 		if (!object_prefix)
938602adf40SYehuda Sadeh 			return -ENOMEM;
939bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
940bb23e37aSAlex Elder 		object_prefix[len] = '\0';
941bb23e37aSAlex Elder 	}
94200f1f36fSAlex Elder 
943bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
944d2bb24e5SAlex Elder 
945602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
946bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
947bb23e37aSAlex Elder 	if (!snapc)
948bb23e37aSAlex Elder 		goto out_err;
949bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
950602adf40SYehuda Sadeh 	if (snap_count) {
951bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
952f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
953f785cc1dSAlex Elder 
954bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
955621901d6SAlex Elder 
956f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
957bb23e37aSAlex Elder 			goto out_2big;
958bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
959bb23e37aSAlex Elder 		if (!snap_names)
960602adf40SYehuda Sadeh 			goto out_err;
961bb23e37aSAlex Elder 
962bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
963bb23e37aSAlex Elder 
964bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
965bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
966bb23e37aSAlex Elder 		if (!snap_sizes)
967bb23e37aSAlex Elder 			goto out_err;
968bb23e37aSAlex Elder 
969f785cc1dSAlex Elder 		/*
970bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
971bb23e37aSAlex Elder 		 * and size.
972bb23e37aSAlex Elder 		 *
97399a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
974bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
975f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
976f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
977f785cc1dSAlex Elder 		 */
978bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
979bb23e37aSAlex Elder 		snaps = ondisk->snaps;
980bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
981bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
982bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
983bb23e37aSAlex Elder 		}
984602adf40SYehuda Sadeh 	}
985849b4260SAlex Elder 
986bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
987bb23e37aSAlex Elder 
988bb23e37aSAlex Elder 	if (first_time) {
989bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
990602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
991602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
992602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
993bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
994bb23e37aSAlex Elder 		header->stripe_unit = 0;
995bb23e37aSAlex Elder 		header->stripe_count = 0;
996bb23e37aSAlex Elder 		header->features = 0;
997662518b1SAlex Elder 	} else {
998662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
999662518b1SAlex Elder 		kfree(header->snap_names);
1000662518b1SAlex Elder 		kfree(header->snap_sizes);
1001bb23e37aSAlex Elder 	}
10026a52325fSAlex Elder 
1003bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1004621901d6SAlex Elder 
1005f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1006bb23e37aSAlex Elder 	header->snapc = snapc;
1007bb23e37aSAlex Elder 	header->snap_names = snap_names;
1008bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1009468521c1SAlex Elder 
1010602adf40SYehuda Sadeh 	return 0;
1011bb23e37aSAlex Elder out_2big:
1012bb23e37aSAlex Elder 	ret = -EIO;
10136a52325fSAlex Elder out_err:
1014bb23e37aSAlex Elder 	kfree(snap_sizes);
1015bb23e37aSAlex Elder 	kfree(snap_names);
1016bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1017bb23e37aSAlex Elder 	kfree(object_prefix);
1018ccece235SAlex Elder 
1019bb23e37aSAlex Elder 	return ret;
1020602adf40SYehuda Sadeh }
1021602adf40SYehuda Sadeh 
10229682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10239682fc6dSAlex Elder {
10249682fc6dSAlex Elder 	const char *snap_name;
10259682fc6dSAlex Elder 
10269682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10279682fc6dSAlex Elder 
10289682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10299682fc6dSAlex Elder 
10309682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10319682fc6dSAlex Elder 	while (which--)
10329682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10339682fc6dSAlex Elder 
10349682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10359682fc6dSAlex Elder }
10369682fc6dSAlex Elder 
103730d1cff8SAlex Elder /*
103830d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
103930d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
104030d1cff8SAlex Elder  */
104130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
104230d1cff8SAlex Elder {
104330d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
104430d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
104530d1cff8SAlex Elder 
104630d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
104730d1cff8SAlex Elder 		return 1;
104830d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
104930d1cff8SAlex Elder }
105030d1cff8SAlex Elder 
105130d1cff8SAlex Elder /*
105230d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
105330d1cff8SAlex Elder  * present.
105430d1cff8SAlex Elder  *
105530d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
105630d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
105730d1cff8SAlex Elder  *
105830d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
105930d1cff8SAlex Elder  * reverse order, highest snapshot id first.
106030d1cff8SAlex Elder  */
10619682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10629682fc6dSAlex Elder {
10639682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
106430d1cff8SAlex Elder 	u64 *found;
10659682fc6dSAlex Elder 
106630d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
106730d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10689682fc6dSAlex Elder 
106930d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10709682fc6dSAlex Elder }
10719682fc6dSAlex Elder 
10722ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10732ad3d716SAlex Elder 					u64 snap_id)
107454cac61fSAlex Elder {
107554cac61fSAlex Elder 	u32 which;
1076da6a6b63SJosh Durgin 	const char *snap_name;
107754cac61fSAlex Elder 
107854cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
107954cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1080da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
108154cac61fSAlex Elder 
1082da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1083da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
108454cac61fSAlex Elder }
108554cac61fSAlex Elder 
10869e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10879e15b77dSAlex Elder {
10889e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10899e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10909e15b77dSAlex Elder 
109154cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
109254cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
109354cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10949e15b77dSAlex Elder 
109554cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10969e15b77dSAlex Elder }
10979e15b77dSAlex Elder 
10982ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10992ad3d716SAlex Elder 				u64 *snap_size)
1100602adf40SYehuda Sadeh {
11012ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11022ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11032ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11042ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11052ad3d716SAlex Elder 		u32 which;
110600f1f36fSAlex Elder 
11072ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11082ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11092ad3d716SAlex Elder 			return -ENOENT;
111000f1f36fSAlex Elder 
11112ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11122ad3d716SAlex Elder 	} else {
11132ad3d716SAlex Elder 		u64 size = 0;
11142ad3d716SAlex Elder 		int ret;
11152ad3d716SAlex Elder 
11162ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11172ad3d716SAlex Elder 		if (ret)
11182ad3d716SAlex Elder 			return ret;
11192ad3d716SAlex Elder 
11202ad3d716SAlex Elder 		*snap_size = size;
11212ad3d716SAlex Elder 	}
11222ad3d716SAlex Elder 	return 0;
11232ad3d716SAlex Elder }
11242ad3d716SAlex Elder 
11252ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11262ad3d716SAlex Elder 			u64 *snap_features)
11272ad3d716SAlex Elder {
11282ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11292ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11302ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11312ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11322ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11332ad3d716SAlex Elder 	} else {
11342ad3d716SAlex Elder 		u64 features = 0;
11352ad3d716SAlex Elder 		int ret;
11362ad3d716SAlex Elder 
11372ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11382ad3d716SAlex Elder 		if (ret)
11392ad3d716SAlex Elder 			return ret;
11402ad3d716SAlex Elder 
11412ad3d716SAlex Elder 		*snap_features = features;
11422ad3d716SAlex Elder 	}
11432ad3d716SAlex Elder 	return 0;
114400f1f36fSAlex Elder }
1145602adf40SYehuda Sadeh 
1146d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1147602adf40SYehuda Sadeh {
11488f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11492ad3d716SAlex Elder 	u64 size = 0;
11502ad3d716SAlex Elder 	u64 features = 0;
11512ad3d716SAlex Elder 	int ret;
11528b0241f8SAlex Elder 
11532ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11542ad3d716SAlex Elder 	if (ret)
11552ad3d716SAlex Elder 		return ret;
11562ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11572ad3d716SAlex Elder 	if (ret)
11582ad3d716SAlex Elder 		return ret;
11592ad3d716SAlex Elder 
11602ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11612ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11622ad3d716SAlex Elder 
11638b0241f8SAlex Elder 	return 0;
1164602adf40SYehuda Sadeh }
1165602adf40SYehuda Sadeh 
1166d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1167d1cf5788SAlex Elder {
1168d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1169d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1170200a6a8bSAlex Elder }
1171200a6a8bSAlex Elder 
11727d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
11737d5079aaSHimangi Saraogi {
11747d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
11757d5079aaSHimangi Saraogi 
11767d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
11777d5079aaSHimangi Saraogi }
11787d5079aaSHimangi Saraogi 
117998571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1180602adf40SYehuda Sadeh {
118165ccfe21SAlex Elder 	char *name;
118265ccfe21SAlex Elder 	u64 segment;
118365ccfe21SAlex Elder 	int ret;
11843a96d5cdSJosh Durgin 	char *name_format;
1185602adf40SYehuda Sadeh 
118678c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
118765ccfe21SAlex Elder 	if (!name)
118865ccfe21SAlex Elder 		return NULL;
118965ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11903a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11913a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11923a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11932d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
119465ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11952d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
119665ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
119765ccfe21SAlex Elder 			segment, ret);
11987d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
119965ccfe21SAlex Elder 		name = NULL;
120065ccfe21SAlex Elder 	}
1201602adf40SYehuda Sadeh 
120265ccfe21SAlex Elder 	return name;
120365ccfe21SAlex Elder }
1204602adf40SYehuda Sadeh 
120565ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
120665ccfe21SAlex Elder {
120765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1208602adf40SYehuda Sadeh 
120965ccfe21SAlex Elder 	return offset & (segment_size - 1);
121065ccfe21SAlex Elder }
121165ccfe21SAlex Elder 
121265ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
121365ccfe21SAlex Elder 				u64 offset, u64 length)
121465ccfe21SAlex Elder {
121565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
121665ccfe21SAlex Elder 
121765ccfe21SAlex Elder 	offset &= segment_size - 1;
121865ccfe21SAlex Elder 
1219aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
122065ccfe21SAlex Elder 	if (offset + length > segment_size)
122165ccfe21SAlex Elder 		length = segment_size - offset;
122265ccfe21SAlex Elder 
122365ccfe21SAlex Elder 	return length;
1224602adf40SYehuda Sadeh }
1225602adf40SYehuda Sadeh 
1226602adf40SYehuda Sadeh /*
1227029bcbd8SJosh Durgin  * returns the size of an object in the image
1228029bcbd8SJosh Durgin  */
1229029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1230029bcbd8SJosh Durgin {
1231029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1232029bcbd8SJosh Durgin }
1233029bcbd8SJosh Durgin 
1234029bcbd8SJosh Durgin /*
1235602adf40SYehuda Sadeh  * bio helpers
1236602adf40SYehuda Sadeh  */
1237602adf40SYehuda Sadeh 
1238602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1239602adf40SYehuda Sadeh {
1240602adf40SYehuda Sadeh 	struct bio *tmp;
1241602adf40SYehuda Sadeh 
1242602adf40SYehuda Sadeh 	while (chain) {
1243602adf40SYehuda Sadeh 		tmp = chain;
1244602adf40SYehuda Sadeh 		chain = chain->bi_next;
1245602adf40SYehuda Sadeh 		bio_put(tmp);
1246602adf40SYehuda Sadeh 	}
1247602adf40SYehuda Sadeh }
1248602adf40SYehuda Sadeh 
1249602adf40SYehuda Sadeh /*
1250602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1251602adf40SYehuda Sadeh  */
1252602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1253602adf40SYehuda Sadeh {
12547988613bSKent Overstreet 	struct bio_vec bv;
12557988613bSKent Overstreet 	struct bvec_iter iter;
1256602adf40SYehuda Sadeh 	unsigned long flags;
1257602adf40SYehuda Sadeh 	void *buf;
1258602adf40SYehuda Sadeh 	int pos = 0;
1259602adf40SYehuda Sadeh 
1260602adf40SYehuda Sadeh 	while (chain) {
12617988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12627988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1263602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12647988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1265602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12667988613bSKent Overstreet 				       bv.bv_len - remainder);
12677988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
126885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1269602adf40SYehuda Sadeh 			}
12707988613bSKent Overstreet 			pos += bv.bv_len;
1271602adf40SYehuda Sadeh 		}
1272602adf40SYehuda Sadeh 
1273602adf40SYehuda Sadeh 		chain = chain->bi_next;
1274602adf40SYehuda Sadeh 	}
1275602adf40SYehuda Sadeh }
1276602adf40SYehuda Sadeh 
1277602adf40SYehuda Sadeh /*
1278b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1279b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1280b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1281b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1282b9434c5bSAlex Elder  */
1283b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1284b9434c5bSAlex Elder {
1285b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1286b9434c5bSAlex Elder 
1287b9434c5bSAlex Elder 	rbd_assert(end > offset);
1288b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1289b9434c5bSAlex Elder 	while (offset < end) {
1290b9434c5bSAlex Elder 		size_t page_offset;
1291b9434c5bSAlex Elder 		size_t length;
1292b9434c5bSAlex Elder 		unsigned long flags;
1293b9434c5bSAlex Elder 		void *kaddr;
1294b9434c5bSAlex Elder 
1295491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1296491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1297b9434c5bSAlex Elder 		local_irq_save(flags);
1298b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1299b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1300e2156054SAlex Elder 		flush_dcache_page(*page);
1301b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1302b9434c5bSAlex Elder 		local_irq_restore(flags);
1303b9434c5bSAlex Elder 
1304b9434c5bSAlex Elder 		offset += length;
1305b9434c5bSAlex Elder 		page++;
1306b9434c5bSAlex Elder 	}
1307b9434c5bSAlex Elder }
1308b9434c5bSAlex Elder 
1309b9434c5bSAlex Elder /*
1310f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1311f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1312602adf40SYehuda Sadeh  */
1313f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1314f7760dadSAlex Elder 					unsigned int offset,
1315f7760dadSAlex Elder 					unsigned int len,
1316f7760dadSAlex Elder 					gfp_t gfpmask)
1317602adf40SYehuda Sadeh {
1318f7760dadSAlex Elder 	struct bio *bio;
1319602adf40SYehuda Sadeh 
13205341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1321f7760dadSAlex Elder 	if (!bio)
1322f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1323f7760dadSAlex Elder 
13245341a627SKent Overstreet 	bio_advance(bio, offset);
13254f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1326602adf40SYehuda Sadeh 
1327f7760dadSAlex Elder 	return bio;
1328602adf40SYehuda Sadeh }
1329602adf40SYehuda Sadeh 
1330f7760dadSAlex Elder /*
1331f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1332f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1333f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1334f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1335f7760dadSAlex Elder  *
1336f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1337f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1338f7760dadSAlex Elder  * the start of data to be cloned is located.
1339f7760dadSAlex Elder  *
1340f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1341f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1342f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1343f7760dadSAlex Elder  */
1344f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1345f7760dadSAlex Elder 					unsigned int *offset,
1346f7760dadSAlex Elder 					unsigned int len,
1347f7760dadSAlex Elder 					gfp_t gfpmask)
1348f7760dadSAlex Elder {
1349f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1350f7760dadSAlex Elder 	unsigned int off = *offset;
1351f7760dadSAlex Elder 	struct bio *chain = NULL;
1352f7760dadSAlex Elder 	struct bio **end;
1353602adf40SYehuda Sadeh 
1354f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1355602adf40SYehuda Sadeh 
13564f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1357f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1358602adf40SYehuda Sadeh 
1359f7760dadSAlex Elder 	end = &chain;
1360f7760dadSAlex Elder 	while (len) {
1361f7760dadSAlex Elder 		unsigned int bi_size;
1362f7760dadSAlex Elder 		struct bio *bio;
1363f7760dadSAlex Elder 
1364f5400b7aSAlex Elder 		if (!bi) {
1365f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1366f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1367f5400b7aSAlex Elder 		}
13684f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1369f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1370f7760dadSAlex Elder 		if (!bio)
1371f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1372f7760dadSAlex Elder 
1373f7760dadSAlex Elder 		*end = bio;
1374f7760dadSAlex Elder 		end = &bio->bi_next;
1375f7760dadSAlex Elder 
1376f7760dadSAlex Elder 		off += bi_size;
13774f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1378f7760dadSAlex Elder 			bi = bi->bi_next;
1379f7760dadSAlex Elder 			off = 0;
1380f7760dadSAlex Elder 		}
1381f7760dadSAlex Elder 		len -= bi_size;
1382f7760dadSAlex Elder 	}
1383f7760dadSAlex Elder 	*bio_src = bi;
1384f7760dadSAlex Elder 	*offset = off;
1385f7760dadSAlex Elder 
1386f7760dadSAlex Elder 	return chain;
1387f7760dadSAlex Elder out_err:
1388f7760dadSAlex Elder 	bio_chain_put(chain);
1389f7760dadSAlex Elder 
1390602adf40SYehuda Sadeh 	return NULL;
1391602adf40SYehuda Sadeh }
1392602adf40SYehuda Sadeh 
1393926f9b3fSAlex Elder /*
1394926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1395926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1396926f9b3fSAlex Elder  * again.
1397926f9b3fSAlex Elder  */
13986365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13996365d33aSAlex Elder {
14006365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14016365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14026365d33aSAlex Elder 
140357acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14049584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14056365d33aSAlex Elder 			obj_request);
14066365d33aSAlex Elder 	}
14076365d33aSAlex Elder }
14086365d33aSAlex Elder 
14096365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14106365d33aSAlex Elder {
14116365d33aSAlex Elder 	smp_mb();
14126365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14136365d33aSAlex Elder }
14146365d33aSAlex Elder 
141557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
141657acbaa7SAlex Elder {
141757acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
141857acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
141957acbaa7SAlex Elder 
142057acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
142157acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14229584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
142357acbaa7SAlex Elder 			obj_request);
142457acbaa7SAlex Elder 	}
142557acbaa7SAlex Elder }
142657acbaa7SAlex Elder 
142757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
142857acbaa7SAlex Elder {
142957acbaa7SAlex Elder 	smp_mb();
143057acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
143157acbaa7SAlex Elder }
143257acbaa7SAlex Elder 
14335679c59fSAlex Elder /*
14345679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14355679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14365679c59fSAlex Elder  *
14375679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14385679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14395679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14405679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14415679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14425679c59fSAlex Elder  */
14435679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14445679c59fSAlex Elder 				bool exists)
14455679c59fSAlex Elder {
14465679c59fSAlex Elder 	if (exists)
14475679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14485679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14495679c59fSAlex Elder 	smp_mb();
14505679c59fSAlex Elder }
14515679c59fSAlex Elder 
14525679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14535679c59fSAlex Elder {
14545679c59fSAlex Elder 	smp_mb();
14555679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14565679c59fSAlex Elder }
14575679c59fSAlex Elder 
14585679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14595679c59fSAlex Elder {
14605679c59fSAlex Elder 	smp_mb();
14615679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14625679c59fSAlex Elder }
14635679c59fSAlex Elder 
14649638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
14659638556aSIlya Dryomov {
14669638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
14679638556aSIlya Dryomov 
14689638556aSIlya Dryomov 	return obj_request->img_offset <
14699638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
14709638556aSIlya Dryomov }
14719638556aSIlya Dryomov 
1472bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1473bf0d5f50SAlex Elder {
147437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
147537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1476bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1477bf0d5f50SAlex Elder }
1478bf0d5f50SAlex Elder 
1479bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1480bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1481bf0d5f50SAlex Elder {
1482bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
148337206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
148437206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1485bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1486bf0d5f50SAlex Elder }
1487bf0d5f50SAlex Elder 
14880f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14890f2d5be7SAlex Elder {
14900f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14910f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14920f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14930f2d5be7SAlex Elder }
14940f2d5be7SAlex Elder 
1495e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1496e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1497bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1498bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1499bf0d5f50SAlex Elder {
1500bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
150137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
150237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1503e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1504e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1505e93f3152SAlex Elder 	else
1506bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1507bf0d5f50SAlex Elder }
1508bf0d5f50SAlex Elder 
1509bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1510bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1511bf0d5f50SAlex Elder {
151225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
151325dcf954SAlex Elder 
1514b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1515bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
151625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15176365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15186365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1519bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
152025dcf954SAlex Elder 	img_request->obj_request_count++;
152125dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
152237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
152337206ee5SAlex Elder 		obj_request->which);
1524bf0d5f50SAlex Elder }
1525bf0d5f50SAlex Elder 
1526bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1527bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1528bf0d5f50SAlex Elder {
1529bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
153025dcf954SAlex Elder 
153137206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
153237206ee5SAlex Elder 		obj_request->which);
1533bf0d5f50SAlex Elder 	list_del(&obj_request->links);
153425dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
153525dcf954SAlex Elder 	img_request->obj_request_count--;
153625dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
153725dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15386365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1539bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1540bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
154125dcf954SAlex Elder 	obj_request->callback = NULL;
1542bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1543bf0d5f50SAlex Elder }
1544bf0d5f50SAlex Elder 
1545bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1546bf0d5f50SAlex Elder {
1547bf0d5f50SAlex Elder 	switch (type) {
15489969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1549bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1550788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1551bf0d5f50SAlex Elder 		return true;
1552bf0d5f50SAlex Elder 	default:
1553bf0d5f50SAlex Elder 		return false;
1554bf0d5f50SAlex Elder 	}
1555bf0d5f50SAlex Elder }
1556bf0d5f50SAlex Elder 
1557bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1558bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1559bf0d5f50SAlex Elder {
156071c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1561bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1562bf0d5f50SAlex Elder }
1563bf0d5f50SAlex Elder 
156471c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
156571c20a06SIlya Dryomov {
156671c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
156771c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
156871c20a06SIlya Dryomov }
156971c20a06SIlya Dryomov 
157071c20a06SIlya Dryomov /*
157171c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
157271c20a06SIlya Dryomov  * underlying osd request.
15732894e1d7SIlya Dryomov  *
15742894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
157571c20a06SIlya Dryomov  */
15762894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
15772894e1d7SIlya Dryomov 				  unsigned long timeout)
157871c20a06SIlya Dryomov {
15792894e1d7SIlya Dryomov 	long ret;
158071c20a06SIlya Dryomov 
158171c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
15822894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
15832894e1d7SIlya Dryomov 					&obj_request->completion,
15842894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
15852894e1d7SIlya Dryomov 	if (ret <= 0) {
15862894e1d7SIlya Dryomov 		if (ret == 0)
15872894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
158871c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
15892894e1d7SIlya Dryomov 	} else {
15902894e1d7SIlya Dryomov 		ret = 0;
15912894e1d7SIlya Dryomov 	}
15922894e1d7SIlya Dryomov 
15932894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
159471c20a06SIlya Dryomov 	return ret;
159571c20a06SIlya Dryomov }
159671c20a06SIlya Dryomov 
/*
 * Wait for an object request to complete with no time limit
 * (passes a timeout of 0 to __rbd_obj_request_wait()).
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	const unsigned long no_timeout = 0;

	return __rbd_obj_request_wait(obj_request, no_timeout);
}
16012894e1d7SIlya Dryomov 
/*
 * Wait for an object request to complete, giving up after @timeout
 * jiffies.  Thin wrapper around __rbd_obj_request_wait(); its return
 * value is passed through unchanged.
 */
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
					unsigned long timeout)
{
	int ret = __rbd_obj_request_wait(obj_request, timeout);

	return ret;
}
160771c20a06SIlya Dryomov 
1608bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1609bf0d5f50SAlex Elder {
161055f27e09SAlex Elder 
161137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
161255f27e09SAlex Elder 
161355f27e09SAlex Elder 	/*
161455f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
161555f27e09SAlex Elder 	 * count for the image request.  We could instead use
161655f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
161755f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
161855f27e09SAlex Elder 	 */
161955f27e09SAlex Elder 	if (!img_request->result) {
162055f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
162155f27e09SAlex Elder 		u64 xferred = 0;
162255f27e09SAlex Elder 
162355f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
162455f27e09SAlex Elder 			xferred += obj_request->xferred;
162555f27e09SAlex Elder 		img_request->xferred = xferred;
162655f27e09SAlex Elder 	}
162755f27e09SAlex Elder 
1628bf0d5f50SAlex Elder 	if (img_request->callback)
1629bf0d5f50SAlex Elder 		img_request->callback(img_request);
1630bf0d5f50SAlex Elder 	else
1631bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1632bf0d5f50SAlex Elder }
1633bf0d5f50SAlex Elder 
16340c425248SAlex Elder /*
16350c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16360c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16370c425248SAlex Elder  * and currently never change thereafter.
16380c425248SAlex Elder  */
16390c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16400c425248SAlex Elder {
16410c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16420c425248SAlex Elder 	smp_mb();
16430c425248SAlex Elder }
16440c425248SAlex Elder 
16450c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16460c425248SAlex Elder {
16470c425248SAlex Elder 	smp_mb();
16480c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16490c425248SAlex Elder }
16500c425248SAlex Elder 
165190e98c52SGuangliang Zhao /*
165290e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
165390e98c52SGuangliang Zhao  */
165490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
165590e98c52SGuangliang Zhao {
165690e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
165790e98c52SGuangliang Zhao 	smp_mb();
165890e98c52SGuangliang Zhao }
165990e98c52SGuangliang Zhao 
166090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
166190e98c52SGuangliang Zhao {
166290e98c52SGuangliang Zhao 	smp_mb();
166390e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
166490e98c52SGuangliang Zhao }
166590e98c52SGuangliang Zhao 
16669849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16679849e986SAlex Elder {
16689849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16699849e986SAlex Elder 	smp_mb();
16709849e986SAlex Elder }
16719849e986SAlex Elder 
1672e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1673e93f3152SAlex Elder {
1674e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1675e93f3152SAlex Elder 	smp_mb();
1676e93f3152SAlex Elder }
1677e93f3152SAlex Elder 
16789849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16799849e986SAlex Elder {
16809849e986SAlex Elder 	smp_mb();
16819849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16829849e986SAlex Elder }
16839849e986SAlex Elder 
1684d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1685d0b2e944SAlex Elder {
1686d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1687d0b2e944SAlex Elder 	smp_mb();
1688d0b2e944SAlex Elder }
1689d0b2e944SAlex Elder 
1690a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1691a2acd00eSAlex Elder {
1692a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1693a2acd00eSAlex Elder 	smp_mb();
1694a2acd00eSAlex Elder }
1695a2acd00eSAlex Elder 
1696d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1697d0b2e944SAlex Elder {
1698d0b2e944SAlex Elder 	smp_mb();
1699d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1700d0b2e944SAlex Elder }
1701d0b2e944SAlex Elder 
17023b434a2aSJosh Durgin static enum obj_operation_type
17033b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17043b434a2aSJosh Durgin {
17053b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17063b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17073b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17083b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17093b434a2aSJosh Durgin 	else
17103b434a2aSJosh Durgin 		return OBJ_OP_READ;
17113b434a2aSJosh Durgin }
17123b434a2aSJosh Durgin 
17136e2a4505SAlex Elder static void
17146e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17156e2a4505SAlex Elder {
1716b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1717b9434c5bSAlex Elder 	u64 length = obj_request->length;
1718b9434c5bSAlex Elder 
17196e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17206e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1721b9434c5bSAlex Elder 		xferred, length);
17226e2a4505SAlex Elder 	/*
172317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
172417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
172517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
172617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
172717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
172817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17296e2a4505SAlex Elder 	 */
1730b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17316e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1732b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17336e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1734b9434c5bSAlex Elder 		else
1735b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17366e2a4505SAlex Elder 		obj_request->result = 0;
1737b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1738b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1739b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1740b9434c5bSAlex Elder 		else
1741b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17426e2a4505SAlex Elder 	}
174317c1cc1dSJosh Durgin 	obj_request->xferred = length;
17446e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17456e2a4505SAlex Elder }
17466e2a4505SAlex Elder 
1747bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1748bf0d5f50SAlex Elder {
174937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
175037206ee5SAlex Elder 		obj_request->callback);
1751bf0d5f50SAlex Elder 	if (obj_request->callback)
1752bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1753788e2df3SAlex Elder 	else
1754788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1755bf0d5f50SAlex Elder }
1756bf0d5f50SAlex Elder 
/*
 * Completion handler for ops that need no post-processing: just
 * mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
176239bf2c5dSAlex Elder 
1763c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1764bf0d5f50SAlex Elder {
176557acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1766a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
176757acbaa7SAlex Elder 	bool layered = false;
176857acbaa7SAlex Elder 
176957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
177057acbaa7SAlex Elder 		img_request = obj_request->img_request;
177157acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1772a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
177357acbaa7SAlex Elder 	}
17748b3e1a56SAlex Elder 
17758b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17768b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17778b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1778a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1779a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17808b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17818b3e1a56SAlex Elder 	else if (img_request)
17826e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17836e2a4505SAlex Elder 	else
178407741308SAlex Elder 		obj_request_done_set(obj_request);
1785bf0d5f50SAlex Elder }
1786bf0d5f50SAlex Elder 
1787c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1788bf0d5f50SAlex Elder {
17891b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
17901b83bef2SSage Weil 		obj_request->result, obj_request->length);
17911b83bef2SSage Weil 	/*
17928b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
17938b3e1a56SAlex Elder 	 * it to our originally-requested length.
17941b83bef2SSage Weil 	 */
17951b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
179607741308SAlex Elder 	obj_request_done_set(obj_request);
1797bf0d5f50SAlex Elder }
1798bf0d5f50SAlex Elder 
179990e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
180090e98c52SGuangliang Zhao {
180190e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
180290e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
180390e98c52SGuangliang Zhao 	/*
180490e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
180590e98c52SGuangliang Zhao 	 * it to our originally-requested length.
180690e98c52SGuangliang Zhao 	 */
180790e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1808d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1809d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1810d0265de7SJosh Durgin 		obj_request->result = 0;
181190e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
181290e98c52SGuangliang Zhao }
181390e98c52SGuangliang Zhao 
/*
 * Completion handler for an osd stat op.  For a simple stat call
 * there's nothing to do beyond marking the request done.  We'll do
 * more if this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1823fbfab539SAlex Elder 
/*
 * Completion handler for an osd "call" op.  A request flagged as
 * image data is passed to rbd_osd_copyup_callback(); any other
 * request is simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18332761713dSIlya Dryomov 
1834bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1835bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1836bf0d5f50SAlex Elder {
1837bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1838bf0d5f50SAlex Elder 	u16 opcode;
1839bf0d5f50SAlex Elder 
184037206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1841bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
184257acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
184357acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
184457acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
184557acbaa7SAlex Elder 	} else {
184657acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
184757acbaa7SAlex Elder 	}
1848bf0d5f50SAlex Elder 
18491b83bef2SSage Weil 	if (osd_req->r_result < 0)
18501b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1851bf0d5f50SAlex Elder 
18527cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1853bf0d5f50SAlex Elder 
1854c47f9371SAlex Elder 	/*
1855c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18567ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18577ad18afaSChristoph Hellwig 	 * length field.
1858c47f9371SAlex Elder 	 */
18591b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1860c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
18610ccd5926SIlya Dryomov 
186279528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1863bf0d5f50SAlex Elder 	switch (opcode) {
1864bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1865c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1866bf0d5f50SAlex Elder 		break;
18670ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1868e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1869e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
18700ccd5926SIlya Dryomov 		/* fall through */
1871bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1872e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1873c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1874bf0d5f50SAlex Elder 		break;
1875fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1876c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1877fbfab539SAlex Elder 		break;
187890e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
187990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
188090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
188190e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
188290e98c52SGuangliang Zhao 		break;
188336be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
18842761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
18852761713dSIlya Dryomov 		break;
1886b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
18879969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1888c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
18899969ebc5SAlex Elder 		break;
1890bf0d5f50SAlex Elder 	default:
18919584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1892bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1893bf0d5f50SAlex Elder 		break;
1894bf0d5f50SAlex Elder 	}
1895bf0d5f50SAlex Elder 
189607741308SAlex Elder 	if (obj_request_done_test(obj_request))
1897bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1898bf0d5f50SAlex Elder }
1899bf0d5f50SAlex Elder 
19009d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1901430c28c3SAlex Elder {
1902430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
19038c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19049d4df01fSAlex Elder 	u64 snap_id;
1905430c28c3SAlex Elder 
19068c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1907430c28c3SAlex Elder 
19089d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
19098c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
19109d4df01fSAlex Elder 			NULL, snap_id, NULL);
19119d4df01fSAlex Elder }
19129d4df01fSAlex Elder 
19139d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19149d4df01fSAlex Elder {
19159d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
19169d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19179d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
19189d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
19199d4df01fSAlex Elder 
19209d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
19219d4df01fSAlex Elder 
19229d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
19239d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
19249d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1925430c28c3SAlex Elder }
1926430c28c3SAlex Elder 
19270ccd5926SIlya Dryomov /*
19280ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19290ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19300ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19310ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19320ccd5926SIlya Dryomov  */
1933bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1934bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19356d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1936deb236b3SIlya Dryomov 					unsigned int num_ops,
1937430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1938bf0d5f50SAlex Elder {
1939bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1940bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1941bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1942bf0d5f50SAlex Elder 
194390e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
194490e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19456365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
194690e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19476d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
194890e98c52SGuangliang Zhao 		} else {
194990e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
195090e98c52SGuangliang Zhao 		}
1951bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1952bf0d5f50SAlex Elder 	}
1953bf0d5f50SAlex Elder 
19546d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1955deb236b3SIlya Dryomov 
1956deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1957bf0d5f50SAlex Elder 
1958bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1959deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1960deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1961bf0d5f50SAlex Elder 	if (!osd_req)
1962bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1963bf0d5f50SAlex Elder 
196490e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1965bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1966430c28c3SAlex Elder 	else
1967bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1968bf0d5f50SAlex Elder 
1969bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1970bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1971bf0d5f50SAlex Elder 
19723c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
19733c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1974bf0d5f50SAlex Elder 
1975bf0d5f50SAlex Elder 	return osd_req;
1976bf0d5f50SAlex Elder }
1977bf0d5f50SAlex Elder 
19780eefd470SAlex Elder /*
1979d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
1980d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
1981d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
1982d3246fb0SJosh Durgin  * or zero op.
19830eefd470SAlex Elder  */
19840eefd470SAlex Elder static struct ceph_osd_request *
19850eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
19860eefd470SAlex Elder {
19870eefd470SAlex Elder 	struct rbd_img_request *img_request;
19880eefd470SAlex Elder 	struct ceph_snap_context *snapc;
19890eefd470SAlex Elder 	struct rbd_device *rbd_dev;
19900eefd470SAlex Elder 	struct ceph_osd_client *osdc;
19910eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
1992d3246fb0SJosh Durgin 	int num_osd_ops = 3;
19930eefd470SAlex Elder 
19940eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19950eefd470SAlex Elder 	img_request = obj_request->img_request;
19960eefd470SAlex Elder 	rbd_assert(img_request);
1997d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
1998d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
19990eefd470SAlex Elder 
2000d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2001d3246fb0SJosh Durgin 		num_osd_ops = 2;
2002d3246fb0SJosh Durgin 
2003d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
20040eefd470SAlex Elder 
20050eefd470SAlex Elder 	snapc = img_request->snapc;
20060eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20070eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2008d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
2009d3246fb0SJosh Durgin 						false, GFP_ATOMIC);
20100eefd470SAlex Elder 	if (!osd_req)
20110eefd470SAlex Elder 		return NULL;	/* ENOMEM */
20120eefd470SAlex Elder 
20130eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
20140eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
20150eefd470SAlex Elder 	osd_req->r_priv = obj_request;
20160eefd470SAlex Elder 
20173c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
20183c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
20190eefd470SAlex Elder 
20200eefd470SAlex Elder 	return osd_req;
20210eefd470SAlex Elder }
20220eefd470SAlex Elder 
20230eefd470SAlex Elder 
/* Release an osd request by dropping a reference on it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2028bf0d5f50SAlex Elder 
2029bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2030bf0d5f50SAlex Elder 
2031bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2032bf0d5f50SAlex Elder 						u64 offset, u64 length,
2033bf0d5f50SAlex Elder 						enum obj_request_type type)
2034bf0d5f50SAlex Elder {
2035bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2036bf0d5f50SAlex Elder 	size_t size;
2037bf0d5f50SAlex Elder 	char *name;
2038bf0d5f50SAlex Elder 
2039bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2040bf0d5f50SAlex Elder 
2041bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
20425a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2043f907ad55SAlex Elder 	if (!name)
2044bf0d5f50SAlex Elder 		return NULL;
2045bf0d5f50SAlex Elder 
20465a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2047f907ad55SAlex Elder 	if (!obj_request) {
2048f907ad55SAlex Elder 		kfree(name);
2049f907ad55SAlex Elder 		return NULL;
2050f907ad55SAlex Elder 	}
2051f907ad55SAlex Elder 
2052bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2053bf0d5f50SAlex Elder 	obj_request->offset = offset;
2054bf0d5f50SAlex Elder 	obj_request->length = length;
2055926f9b3fSAlex Elder 	obj_request->flags = 0;
2056bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2057bf0d5f50SAlex Elder 	obj_request->type = type;
2058bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2059788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2060bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2061bf0d5f50SAlex Elder 
206237206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
206337206ee5SAlex Elder 		offset, length, (int)type, obj_request);
206437206ee5SAlex Elder 
2065bf0d5f50SAlex Elder 	return obj_request;
2066bf0d5f50SAlex Elder }
2067bf0d5f50SAlex Elder 
2068bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2069bf0d5f50SAlex Elder {
2070bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2071bf0d5f50SAlex Elder 
2072bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2073bf0d5f50SAlex Elder 
207437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
207537206ee5SAlex Elder 
2076bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2077bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2078bf0d5f50SAlex Elder 
2079bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2080bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2081bf0d5f50SAlex Elder 
2082bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2083bf0d5f50SAlex Elder 	switch (obj_request->type) {
20849969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
20859969ebc5SAlex Elder 		break;		/* Nothing to do */
2086bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2087bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2088bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2089bf0d5f50SAlex Elder 		break;
2090788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
2091788e2df3SAlex Elder 		if (obj_request->pages)
2092788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2093788e2df3SAlex Elder 						obj_request->page_count);
2094788e2df3SAlex Elder 		break;
2095bf0d5f50SAlex Elder 	}
2096bf0d5f50SAlex Elder 
2097f907ad55SAlex Elder 	kfree(obj_request->object_name);
2098868311b1SAlex Elder 	obj_request->object_name = NULL;
2099868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2100bf0d5f50SAlex Elder }
2101bf0d5f50SAlex Elder 
2102fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2103fb65d228SAlex Elder 
2104fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2105fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2106fb65d228SAlex Elder {
2107fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2108fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2109fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2110fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2111fb65d228SAlex Elder }
2112fb65d228SAlex Elder 
2113bf0d5f50SAlex Elder /*
2114a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2115a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2116a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2117a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2118a2acd00eSAlex Elder  */
2119a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2120a2acd00eSAlex Elder {
2121a2acd00eSAlex Elder 	int counter;
2122a2acd00eSAlex Elder 
2123a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2124a2acd00eSAlex Elder 		return;
2125a2acd00eSAlex Elder 
2126a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2127a2acd00eSAlex Elder 	if (counter > 0)
2128a2acd00eSAlex Elder 		return;
2129a2acd00eSAlex Elder 
2130a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2131a2acd00eSAlex Elder 
2132a2acd00eSAlex Elder 	if (!counter)
2133a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2134a2acd00eSAlex Elder 	else
21359584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2136a2acd00eSAlex Elder }
2137a2acd00eSAlex Elder 
2138a2acd00eSAlex Elder /*
2139a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2140a2acd00eSAlex Elder  * parent.
2141a2acd00eSAlex Elder  *
2142a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2143a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2144a2acd00eSAlex Elder  * false otherwise.
2145a2acd00eSAlex Elder  */
2146a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2147a2acd00eSAlex Elder {
2148ae43e9d0SIlya Dryomov 	int counter = 0;
2149a2acd00eSAlex Elder 
2150a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2151a2acd00eSAlex Elder 		return false;
2152a2acd00eSAlex Elder 
2153ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2154ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2155a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2156ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2157a2acd00eSAlex Elder 
2158a2acd00eSAlex Elder 	if (counter < 0)
21599584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2160a2acd00eSAlex Elder 
2161ae43e9d0SIlya Dryomov 	return counter > 0;
2162a2acd00eSAlex Elder }
2163a2acd00eSAlex Elder 
2164bf0d5f50SAlex Elder /*
2165bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2166bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2167bf0d5f50SAlex Elder  * (if there is one).
2168bf0d5f50SAlex Elder  */
2169cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2170cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2171bf0d5f50SAlex Elder 					u64 offset, u64 length,
21726d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
21734e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2174bf0d5f50SAlex Elder {
2175bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2176bf0d5f50SAlex Elder 
21777a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2178bf0d5f50SAlex Elder 	if (!img_request)
2179bf0d5f50SAlex Elder 		return NULL;
2180bf0d5f50SAlex Elder 
2181bf0d5f50SAlex Elder 	img_request->rq = NULL;
2182bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2183bf0d5f50SAlex Elder 	img_request->offset = offset;
2184bf0d5f50SAlex Elder 	img_request->length = length;
21850c425248SAlex Elder 	img_request->flags = 0;
218690e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
218790e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
218890e98c52SGuangliang Zhao 		img_request->snapc = snapc;
218990e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
21900c425248SAlex Elder 		img_request_write_set(img_request);
21914e752f0aSJosh Durgin 		img_request->snapc = snapc;
21920c425248SAlex Elder 	} else {
2193bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
21940c425248SAlex Elder 	}
2195a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2196d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2197bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2198bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2199bf0d5f50SAlex Elder 	img_request->callback = NULL;
2200a5a337d4SAlex Elder 	img_request->result = 0;
2201bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2202bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2203bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2204bf0d5f50SAlex Elder 
220537206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
22066d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
220737206ee5SAlex Elder 
2208bf0d5f50SAlex Elder 	return img_request;
2209bf0d5f50SAlex Elder }
2210bf0d5f50SAlex Elder 
2211bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2212bf0d5f50SAlex Elder {
2213bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2214bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2215bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2216bf0d5f50SAlex Elder 
2217bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2218bf0d5f50SAlex Elder 
221937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
222037206ee5SAlex Elder 
2221bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2222bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
222325dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2224bf0d5f50SAlex Elder 
2225a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2226a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2227a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2228a2acd00eSAlex Elder 	}
2229a2acd00eSAlex Elder 
2230bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2231bef95455SJosh Durgin 		img_request_discard_test(img_request))
2232812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2233bf0d5f50SAlex Elder 
22341c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2235bf0d5f50SAlex Elder }
2236bf0d5f50SAlex Elder 
2237e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2238e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2239e93f3152SAlex Elder 					u64 img_offset, u64 length)
2240e93f3152SAlex Elder {
2241e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2242e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2243e93f3152SAlex Elder 
2244e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2245e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2246e93f3152SAlex Elder 
22474e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22486d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2249e93f3152SAlex Elder 	if (!parent_request)
2250e93f3152SAlex Elder 		return NULL;
2251e93f3152SAlex Elder 
2252e93f3152SAlex Elder 	img_request_child_set(parent_request);
2253e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2254e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2255e93f3152SAlex Elder 
2256e93f3152SAlex Elder 	return parent_request;
2257e93f3152SAlex Elder }
2258e93f3152SAlex Elder 
2259e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2260e93f3152SAlex Elder {
2261e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2262e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2263e93f3152SAlex Elder 
2264e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2265e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2266e93f3152SAlex Elder 
2267e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2268e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2269e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2270e93f3152SAlex Elder 
2271e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2272e93f3152SAlex Elder }
2273e93f3152SAlex Elder 
22741217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
22751217857fSAlex Elder {
22766365d33aSAlex Elder 	struct rbd_img_request *img_request;
22771217857fSAlex Elder 	unsigned int xferred;
22781217857fSAlex Elder 	int result;
22798b3e1a56SAlex Elder 	bool more;
22801217857fSAlex Elder 
22816365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22826365d33aSAlex Elder 	img_request = obj_request->img_request;
22836365d33aSAlex Elder 
22841217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
22851217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
22861217857fSAlex Elder 	result = obj_request->result;
22871217857fSAlex Elder 	if (result) {
22881217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
22896d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
22906d2940c8SGuangliang Zhao 
229190e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
229290e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
229390e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
229490e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
229590e98c52SGuangliang Zhao 		else
229690e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
22971217857fSAlex Elder 
22989584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
22996d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
23006d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
23019584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
23021217857fSAlex Elder 			result, xferred);
23031217857fSAlex Elder 		if (!img_request->result)
23041217857fSAlex Elder 			img_request->result = result;
2305082a75daSIlya Dryomov 		/*
2306082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2307082a75daSIlya Dryomov 		 * bytes in case of error.
2308082a75daSIlya Dryomov 		 */
2309082a75daSIlya Dryomov 		xferred = obj_request->length;
23101217857fSAlex Elder 	}
23111217857fSAlex Elder 
2312f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2313f1a4739fSAlex Elder 
2314f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2315f1a4739fSAlex Elder 		obj_request->pages = NULL;
2316f1a4739fSAlex Elder 		obj_request->page_count = 0;
2317f1a4739fSAlex Elder 	}
2318f1a4739fSAlex Elder 
23198b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23208b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23218b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23228b3e1a56SAlex Elder 	} else {
23238b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23247ad18afaSChristoph Hellwig 
23257ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23267ad18afaSChristoph Hellwig 		if (!more)
23277ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23288b3e1a56SAlex Elder 	}
23298b3e1a56SAlex Elder 
23308b3e1a56SAlex Elder 	return more;
23311217857fSAlex Elder }
23321217857fSAlex Elder 
23332169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23342169238dSAlex Elder {
23352169238dSAlex Elder 	struct rbd_img_request *img_request;
23362169238dSAlex Elder 	u32 which = obj_request->which;
23372169238dSAlex Elder 	bool more = true;
23382169238dSAlex Elder 
23396365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23402169238dSAlex Elder 	img_request = obj_request->img_request;
23412169238dSAlex Elder 
23422169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23432169238dSAlex Elder 	rbd_assert(img_request != NULL);
23442169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23452169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23462169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23472169238dSAlex Elder 
23482169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23492169238dSAlex Elder 	if (which != img_request->next_completion)
23502169238dSAlex Elder 		goto out;
23512169238dSAlex Elder 
23522169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23532169238dSAlex Elder 		rbd_assert(more);
23542169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
23552169238dSAlex Elder 
23562169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
23572169238dSAlex Elder 			break;
23581217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
23592169238dSAlex Elder 		which++;
23602169238dSAlex Elder 	}
23612169238dSAlex Elder 
23622169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
23632169238dSAlex Elder 	img_request->next_completion = which;
23642169238dSAlex Elder out:
23652169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
23660f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
23672169238dSAlex Elder 
23682169238dSAlex Elder 	if (!more)
23692169238dSAlex Elder 		rbd_img_request_complete(img_request);
23702169238dSAlex Elder }
23712169238dSAlex Elder 
2372f1a4739fSAlex Elder /*
23733b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
23743b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
23753b434a2aSJosh Durgin  * osd operations already to the object request.
23763b434a2aSJosh Durgin  */
23773b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
23783b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
23793b434a2aSJosh Durgin 				enum obj_operation_type op_type,
23803b434a2aSJosh Durgin 				unsigned int num_ops)
23813b434a2aSJosh Durgin {
23823b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
23833b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
23843b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
23853b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
23863b434a2aSJosh Durgin 	u64 length = obj_request->length;
23873b434a2aSJosh Durgin 	u64 img_end;
23883b434a2aSJosh Durgin 	u16 opcode;
23893b434a2aSJosh Durgin 
23903b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2391d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2392d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2393d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
23943b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
23953b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
23963b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
23973b434a2aSJosh Durgin 		} else {
23983b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
23993b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
24003b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
24013b434a2aSJosh Durgin 
24023b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
24033b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
24043b434a2aSJosh Durgin 			else
24053b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
24063b434a2aSJosh Durgin 		}
24073b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2408e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2409e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2410e30b7577SIlya Dryomov 		else
24113b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
24123b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
24133b434a2aSJosh Durgin 					object_size, object_size);
24143b434a2aSJosh Durgin 		num_ops++;
24153b434a2aSJosh Durgin 	} else {
24163b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24173b434a2aSJosh Durgin 	}
24183b434a2aSJosh Durgin 
24197e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2420144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24217e868b6eSIlya Dryomov 	else
24227e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24237e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24247e868b6eSIlya Dryomov 
24253b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24263b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24273b434a2aSJosh Durgin 					obj_request->bio_list, length);
24283b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24293b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24303b434a2aSJosh Durgin 					obj_request->pages, length,
24313b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24323b434a2aSJosh Durgin 
24333b434a2aSJosh Durgin 	/* Discards are also writes */
24343b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24353b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24363b434a2aSJosh Durgin 	else
24373b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24383b434a2aSJosh Durgin }
24393b434a2aSJosh Durgin 
24403b434a2aSJosh Durgin /*
2441f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2442f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2443f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2444f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2445f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2446f1a4739fSAlex Elder  * all data described by the image request.
2447f1a4739fSAlex Elder  */
2448f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2449f1a4739fSAlex Elder 					enum obj_request_type type,
2450f1a4739fSAlex Elder 					void *data_desc)
2451bf0d5f50SAlex Elder {
2452bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2453bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2454bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2455a158073cSJingoo Han 	struct bio *bio_list = NULL;
2456f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2457a158073cSJingoo Han 	struct page **pages = NULL;
24586d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
24597da22d29SAlex Elder 	u64 img_offset;
2460bf0d5f50SAlex Elder 	u64 resid;
2461bf0d5f50SAlex Elder 
2462f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2463f1a4739fSAlex Elder 		(int)type, data_desc);
246437206ee5SAlex Elder 
24657da22d29SAlex Elder 	img_offset = img_request->offset;
2466bf0d5f50SAlex Elder 	resid = img_request->length;
24674dda41d3SAlex Elder 	rbd_assert(resid > 0);
24683b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2469f1a4739fSAlex Elder 
2470f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2471f1a4739fSAlex Elder 		bio_list = data_desc;
24724f024f37SKent Overstreet 		rbd_assert(img_offset ==
24734f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
247490e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2475f1a4739fSAlex Elder 		pages = data_desc;
2476f1a4739fSAlex Elder 	}
2477f1a4739fSAlex Elder 
2478bf0d5f50SAlex Elder 	while (resid) {
24792fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2480bf0d5f50SAlex Elder 		const char *object_name;
2481bf0d5f50SAlex Elder 		u64 offset;
2482bf0d5f50SAlex Elder 		u64 length;
2483bf0d5f50SAlex Elder 
24847da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2485bf0d5f50SAlex Elder 		if (!object_name)
2486bf0d5f50SAlex Elder 			goto out_unwind;
24877da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
24887da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2489bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2490f1a4739fSAlex Elder 						offset, length, type);
249178c2a44aSAlex Elder 		/* object request has its own copy of the object name */
249278c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2493bf0d5f50SAlex Elder 		if (!obj_request)
2494bf0d5f50SAlex Elder 			goto out_unwind;
249562054da6SIlya Dryomov 
249603507db6SJosh Durgin 		/*
249703507db6SJosh Durgin 		 * set obj_request->img_request before creating the
249803507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
249903507db6SJosh Durgin 		 */
250003507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2501bf0d5f50SAlex Elder 
2502f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2503f1a4739fSAlex Elder 			unsigned int clone_size;
2504f1a4739fSAlex Elder 
2505bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2506bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2507f1a4739fSAlex Elder 			obj_request->bio_list =
2508f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2509f1a4739fSAlex Elder 								&bio_offset,
2510f1a4739fSAlex Elder 								clone_size,
2511bf0d5f50SAlex Elder 								GFP_ATOMIC);
2512bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
251362054da6SIlya Dryomov 				goto out_unwind;
251490e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2515f1a4739fSAlex Elder 			unsigned int page_count;
2516f1a4739fSAlex Elder 
2517f1a4739fSAlex Elder 			obj_request->pages = pages;
2518f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2519f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2520f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2521f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2522f1a4739fSAlex Elder 			pages += page_count;
2523f1a4739fSAlex Elder 		}
2524bf0d5f50SAlex Elder 
25256d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25266d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25272fa12320SAlex Elder 					obj_request);
25282fa12320SAlex Elder 		if (!osd_req)
252962054da6SIlya Dryomov 			goto out_unwind;
25303b434a2aSJosh Durgin 
25312fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25322169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25337da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2534bf0d5f50SAlex Elder 
25353b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25363b434a2aSJosh Durgin 
25373b434a2aSJosh Durgin 		rbd_img_request_get(img_request);
25383b434a2aSJosh Durgin 
25397da22d29SAlex Elder 		img_offset += length;
2540bf0d5f50SAlex Elder 		resid -= length;
2541bf0d5f50SAlex Elder 	}
2542bf0d5f50SAlex Elder 
2543bf0d5f50SAlex Elder 	return 0;
2544bf0d5f50SAlex Elder 
2545bf0d5f50SAlex Elder out_unwind:
2546bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
254742dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2548bf0d5f50SAlex Elder 
2549bf0d5f50SAlex Elder 	return -ENOMEM;
2550bf0d5f50SAlex Elder }
2551bf0d5f50SAlex Elder 
25523d7efd18SAlex Elder static void
25532761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25540eefd470SAlex Elder {
25550eefd470SAlex Elder 	struct rbd_img_request *img_request;
25560eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2557ebda6408SAlex Elder 	struct page **pages;
25580eefd470SAlex Elder 	u32 page_count;
25590eefd470SAlex Elder 
25602761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
25612761713dSIlya Dryomov 
2562d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2563d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
25640eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25650eefd470SAlex Elder 	img_request = obj_request->img_request;
25660eefd470SAlex Elder 	rbd_assert(img_request);
25670eefd470SAlex Elder 
25680eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
25690eefd470SAlex Elder 	rbd_assert(rbd_dev);
25700eefd470SAlex Elder 
2571ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2572ebda6408SAlex Elder 	rbd_assert(pages != NULL);
25730eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2574ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2575ebda6408SAlex Elder 	rbd_assert(page_count);
2576ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2577ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
25780eefd470SAlex Elder 
25790eefd470SAlex Elder 	/*
25800eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
25810eefd470SAlex Elder 	 * original write request.  There is no such thing as a
25820eefd470SAlex Elder 	 * successful short write, so if the request was successful
25830eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
25840eefd470SAlex Elder 	 */
25850eefd470SAlex Elder 	if (!obj_request->result)
25860eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
25870eefd470SAlex Elder 
25882761713dSIlya Dryomov 	obj_request_done_set(obj_request);
25890eefd470SAlex Elder }
25900eefd470SAlex Elder 
25910eefd470SAlex Elder static void
25923d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
25933d7efd18SAlex Elder {
25943d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
25950eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
25960eefd470SAlex Elder 	struct ceph_osd_client *osdc;
25970eefd470SAlex Elder 	struct rbd_device *rbd_dev;
25983d7efd18SAlex Elder 	struct page **pages;
2599d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2600ebda6408SAlex Elder 	u32 page_count;
2601bbea1c1aSAlex Elder 	int img_result;
2602ebda6408SAlex Elder 	u64 parent_length;
26033d7efd18SAlex Elder 
26043d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
26053d7efd18SAlex Elder 
26063d7efd18SAlex Elder 	/* First get what we need from the image request */
26073d7efd18SAlex Elder 
26083d7efd18SAlex Elder 	pages = img_request->copyup_pages;
26093d7efd18SAlex Elder 	rbd_assert(pages != NULL);
26103d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2611ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2612ebda6408SAlex Elder 	rbd_assert(page_count);
2613ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
26143d7efd18SAlex Elder 
26153d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26163d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2617b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2618bbea1c1aSAlex Elder 	img_result = img_request->result;
2619ebda6408SAlex Elder 	parent_length = img_request->length;
2620ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
26213d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26223d7efd18SAlex Elder 
262391c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
262491c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26253d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26263d7efd18SAlex Elder 
2627bbea1c1aSAlex Elder 	/*
2628bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2629bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2630bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2631bbea1c1aSAlex Elder 	 */
2632bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2633bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2634bbea1c1aSAlex Elder 
2635bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2636bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2637bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2638bbea1c1aSAlex Elder 		if (!img_result)
2639bbea1c1aSAlex Elder 			return;
2640bbea1c1aSAlex Elder 	}
2641bbea1c1aSAlex Elder 
2642bbea1c1aSAlex Elder 	if (img_result)
26430eefd470SAlex Elder 		goto out_err;
26443d7efd18SAlex Elder 
26458785b1d4SAlex Elder 	/*
26468785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26470ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26488785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26498785b1d4SAlex Elder 	 * original request, and release the old one.
26508785b1d4SAlex Elder 	 */
2651bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26520eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26530eefd470SAlex Elder 	if (!osd_req)
26540eefd470SAlex Elder 		goto out_err;
26558785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
26560eefd470SAlex Elder 	orig_request->osd_req = osd_req;
26570eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2658ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
26593d7efd18SAlex Elder 
26600eefd470SAlex Elder 	/* Initialize the copyup op */
26610eefd470SAlex Elder 
26620eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2663ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
26640eefd470SAlex Elder 						false, false);
26650eefd470SAlex Elder 
2666d3246fb0SJosh Durgin 	/* Add the other op(s) */
26670ccd5926SIlya Dryomov 
2668d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2669d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
26700eefd470SAlex Elder 
26710eefd470SAlex Elder 	/* All set, send it off. */
26720eefd470SAlex Elder 
26730eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2674bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2675bbea1c1aSAlex Elder 	if (!img_result)
26760eefd470SAlex Elder 		return;
26770eefd470SAlex Elder out_err:
26780eefd470SAlex Elder 	/* Record the error code and complete the request */
26790eefd470SAlex Elder 
2680bbea1c1aSAlex Elder 	orig_request->result = img_result;
26810eefd470SAlex Elder 	orig_request->xferred = 0;
26823d7efd18SAlex Elder 	obj_request_done_set(orig_request);
26833d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
26843d7efd18SAlex Elder }
26853d7efd18SAlex Elder 
26863d7efd18SAlex Elder /*
26873d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
26883d7efd18SAlex Elder  * entire target of the given object request.  This is used for
26893d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
26903d7efd18SAlex Elder  * object request from the image request does not exist.
26913d7efd18SAlex Elder  *
26923d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
26933d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
26943d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
26953d7efd18SAlex Elder  * the original object request for the copyup operation.
26963d7efd18SAlex Elder  *
26973d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
26983d7efd18SAlex Elder  * object request and mark it done so it gets completed.
26993d7efd18SAlex Elder  */
27003d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
27013d7efd18SAlex Elder {
27023d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
27033d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
27043d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
27053d7efd18SAlex Elder 	u64 img_offset;
27063d7efd18SAlex Elder 	u64 length;
27073d7efd18SAlex Elder 	struct page **pages = NULL;
27083d7efd18SAlex Elder 	u32 page_count;
27093d7efd18SAlex Elder 	int result;
27103d7efd18SAlex Elder 
27113d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2712b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27133d7efd18SAlex Elder 
27143d7efd18SAlex Elder 	img_request = obj_request->img_request;
27153d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
27163d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
27173d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27183d7efd18SAlex Elder 
27193d7efd18SAlex Elder 	/*
27203d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27213d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27223d7efd18SAlex Elder 	 */
27233d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27243d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27253d7efd18SAlex Elder 
27263d7efd18SAlex Elder 	/*
2727a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2728a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2729a9e8ba2cSAlex Elder 	 * necessary.
2730a9e8ba2cSAlex Elder 	 */
2731a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2732a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2733a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2734a9e8ba2cSAlex Elder 	}
2735a9e8ba2cSAlex Elder 
2736a9e8ba2cSAlex Elder 	/*
27373d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27383d7efd18SAlex Elder 	 * from the parent.
27393d7efd18SAlex Elder 	 */
27403d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27413d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27423d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27433d7efd18SAlex Elder 		result = PTR_ERR(pages);
27443d7efd18SAlex Elder 		pages = NULL;
27453d7efd18SAlex Elder 		goto out_err;
27463d7efd18SAlex Elder 	}
27473d7efd18SAlex Elder 
27483d7efd18SAlex Elder 	result = -ENOMEM;
2749e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2750e93f3152SAlex Elder 						img_offset, length);
27513d7efd18SAlex Elder 	if (!parent_request)
27523d7efd18SAlex Elder 		goto out_err;
27533d7efd18SAlex Elder 
27543d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
27553d7efd18SAlex Elder 	if (result)
27563d7efd18SAlex Elder 		goto out_err;
27573d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2758ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
27593d7efd18SAlex Elder 
27603d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
27613d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
27623d7efd18SAlex Elder 	if (!result)
27633d7efd18SAlex Elder 		return 0;
27643d7efd18SAlex Elder 
27653d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2766ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
27673d7efd18SAlex Elder 	parent_request->obj_request = NULL;
27683d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
27693d7efd18SAlex Elder out_err:
27703d7efd18SAlex Elder 	if (pages)
27713d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
27723d7efd18SAlex Elder 	if (parent_request)
27733d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
27743d7efd18SAlex Elder 	obj_request->result = result;
27753d7efd18SAlex Elder 	obj_request->xferred = 0;
27763d7efd18SAlex Elder 	obj_request_done_set(obj_request);
27773d7efd18SAlex Elder 
27783d7efd18SAlex Elder 	return result;
27793d7efd18SAlex Elder }
27803d7efd18SAlex Elder 
2781c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2782c5b5ef6cSAlex Elder {
2783c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2784638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2785c5b5ef6cSAlex Elder 	int result;
2786c5b5ef6cSAlex Elder 
2787c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2788c5b5ef6cSAlex Elder 
2789c5b5ef6cSAlex Elder 	/*
2790c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2791c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2792c5b5ef6cSAlex Elder 	 * we're done with the request.
2793c5b5ef6cSAlex Elder 	 */
2794c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2795c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2796912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2797c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2798c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2799c5b5ef6cSAlex Elder 
2800c5b5ef6cSAlex Elder 	result = obj_request->result;
2801c5b5ef6cSAlex Elder 	obj_request->result = 0;
2802c5b5ef6cSAlex Elder 
2803c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2804c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2805c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2806c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2807c5b5ef6cSAlex Elder 
2808638f5abeSAlex Elder 	/*
2809638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2810638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2811638f5abeSAlex Elder 	 * and re-submit the original write request.
2812638f5abeSAlex Elder 	 */
2813638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2814638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2815638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2816638f5abeSAlex Elder 
2817638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2818638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2819638f5abeSAlex Elder 		if (!result)
2820638f5abeSAlex Elder 			return;
2821638f5abeSAlex Elder 	}
2822c5b5ef6cSAlex Elder 
2823c5b5ef6cSAlex Elder 	/*
2824c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2825c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2826c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2827c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2828c5b5ef6cSAlex Elder 	 */
2829c5b5ef6cSAlex Elder 	if (!result) {
2830c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2831c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2832c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2833c5b5ef6cSAlex Elder 	} else if (result) {
2834c5b5ef6cSAlex Elder 		orig_request->result = result;
28353d7efd18SAlex Elder 		goto out;
2836c5b5ef6cSAlex Elder 	}
2837c5b5ef6cSAlex Elder 
2838c5b5ef6cSAlex Elder 	/*
2839c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2840c5b5ef6cSAlex Elder 	 * whether the target object exists.
2841c5b5ef6cSAlex Elder 	 */
2842b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
28433d7efd18SAlex Elder out:
2844c5b5ef6cSAlex Elder 	if (orig_request->result)
2845c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2846c5b5ef6cSAlex Elder }
2847c5b5ef6cSAlex Elder 
2848c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2849c5b5ef6cSAlex Elder {
2850c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2851c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2852c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2853c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2854c5b5ef6cSAlex Elder 	u32 page_count;
2855c5b5ef6cSAlex Elder 	size_t size;
2856c5b5ef6cSAlex Elder 	int ret;
2857c5b5ef6cSAlex Elder 
2858c5b5ef6cSAlex Elder 	/*
2859c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2860c5b5ef6cSAlex Elder 	 *     le64 length;
2861c5b5ef6cSAlex Elder 	 *     struct {
2862c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2863c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2864c5b5ef6cSAlex Elder 	 *     } mtime;
2865c5b5ef6cSAlex Elder 	 */
2866c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2867c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2868c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2869c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2870c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2871c5b5ef6cSAlex Elder 
2872c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2873c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2874c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2875c5b5ef6cSAlex Elder 	if (!stat_request)
2876c5b5ef6cSAlex Elder 		goto out;
2877c5b5ef6cSAlex Elder 
2878c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2879c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2880c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2881c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2882c5b5ef6cSAlex Elder 
2883c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2884c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
28856d2940c8SGuangliang Zhao 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2886c5b5ef6cSAlex Elder 						   stat_request);
2887c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2888c5b5ef6cSAlex Elder 		goto out;
2889c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2890c5b5ef6cSAlex Elder 
2891144cba14SYan, Zheng 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2892c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2893c5b5ef6cSAlex Elder 					false, false);
28949d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2895c5b5ef6cSAlex Elder 
2896c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2897c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2898c5b5ef6cSAlex Elder out:
2899c5b5ef6cSAlex Elder 	if (ret)
2900c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2901c5b5ef6cSAlex Elder 
2902c5b5ef6cSAlex Elder 	return ret;
2903c5b5ef6cSAlex Elder }
2904c5b5ef6cSAlex Elder 
290570d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2906b454e36dSAlex Elder {
2907b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2908a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2909b454e36dSAlex Elder 
2910b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2911b454e36dSAlex Elder 
2912b454e36dSAlex Elder 	img_request = obj_request->img_request;
2913b454e36dSAlex Elder 	rbd_assert(img_request);
2914a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2915b454e36dSAlex Elder 
291670d045f6SIlya Dryomov 	/* Reads */
29171c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29181c220881SJosh Durgin 	    !img_request_discard_test(img_request))
291970d045f6SIlya Dryomov 		return true;
2920b454e36dSAlex Elder 
292170d045f6SIlya Dryomov 	/* Non-layered writes */
292270d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
292370d045f6SIlya Dryomov 		return true;
292470d045f6SIlya Dryomov 
292570d045f6SIlya Dryomov 	/*
292670d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
292770d045f6SIlya Dryomov 	 * share any data with the parent.
292870d045f6SIlya Dryomov 	 */
292970d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
293070d045f6SIlya Dryomov 		return true;
293170d045f6SIlya Dryomov 
293270d045f6SIlya Dryomov 	/*
2933c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2934c622d226SGuangliang Zhao 	 * parent data there is anyway.
2935c622d226SGuangliang Zhao 	 */
2936c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2937c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2938c622d226SGuangliang Zhao 		return true;
2939c622d226SGuangliang Zhao 
2940c622d226SGuangliang Zhao 	/*
294170d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
294270d045f6SIlya Dryomov 	 * already been copied.
294370d045f6SIlya Dryomov 	 */
294470d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
294570d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
294670d045f6SIlya Dryomov 		return true;
294770d045f6SIlya Dryomov 
294870d045f6SIlya Dryomov 	return false;
294970d045f6SIlya Dryomov }
295070d045f6SIlya Dryomov 
295170d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
295270d045f6SIlya Dryomov {
295370d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2954b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2955b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2956b454e36dSAlex Elder 
2957b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2958b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2959b454e36dSAlex Elder 
2960b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2961b454e36dSAlex Elder 	}
2962b454e36dSAlex Elder 
2963b454e36dSAlex Elder 	/*
29643d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
29653d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
29663d7efd18SAlex Elder 	 * start by reading the data for the full target object from
29673d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2968b454e36dSAlex Elder 	 */
296970d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
29703d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
29713d7efd18SAlex Elder 
29723d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2973b454e36dSAlex Elder 
2974b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2975b454e36dSAlex Elder }
2976b454e36dSAlex Elder 
2977bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2978bf0d5f50SAlex Elder {
2979bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
298046faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2981bf0d5f50SAlex Elder 
298237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
298346faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2984bf0d5f50SAlex Elder 		int ret;
2985bf0d5f50SAlex Elder 
2986b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2987bf0d5f50SAlex Elder 		if (ret)
2988bf0d5f50SAlex Elder 			return ret;
2989bf0d5f50SAlex Elder 	}
2990bf0d5f50SAlex Elder 
2991bf0d5f50SAlex Elder 	return 0;
2992bf0d5f50SAlex Elder }
2993bf0d5f50SAlex Elder 
29948b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
29958b3e1a56SAlex Elder {
29968b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2997a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2998a9e8ba2cSAlex Elder 	u64 obj_end;
299902c74fbaSAlex Elder 	u64 img_xferred;
300002c74fbaSAlex Elder 	int img_result;
30018b3e1a56SAlex Elder 
30028b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
30038b3e1a56SAlex Elder 
300402c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
300502c74fbaSAlex Elder 
30068b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
300702c74fbaSAlex Elder 	img_xferred = img_request->xferred;
300802c74fbaSAlex Elder 	img_result = img_request->result;
300902c74fbaSAlex Elder 	rbd_img_request_put(img_request);
301002c74fbaSAlex Elder 
301102c74fbaSAlex Elder 	/*
301202c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
301302c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
301402c74fbaSAlex Elder 	 * original request.
301502c74fbaSAlex Elder 	 */
3016a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3017a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
301802c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
301902c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
302002c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
30218b3e1a56SAlex Elder 
302202c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
302302c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
302402c74fbaSAlex Elder 		if (!img_result)
302502c74fbaSAlex Elder 			return;
302602c74fbaSAlex Elder 	}
302702c74fbaSAlex Elder 
302802c74fbaSAlex Elder 	obj_request->result = img_result;
3029a9e8ba2cSAlex Elder 	if (obj_request->result)
3030a9e8ba2cSAlex Elder 		goto out;
3031a9e8ba2cSAlex Elder 
3032a9e8ba2cSAlex Elder 	/*
3033a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3034a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3035a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3036a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3037a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3038a9e8ba2cSAlex Elder 	 */
3039a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3040a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3041a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3042a9e8ba2cSAlex Elder 		u64 xferred = 0;
3043a9e8ba2cSAlex Elder 
3044a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3045a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3046a9e8ba2cSAlex Elder 					obj_request->img_offset;
3047a9e8ba2cSAlex Elder 
304802c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3049a9e8ba2cSAlex Elder 	} else {
305002c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3051a9e8ba2cSAlex Elder 	}
3052a9e8ba2cSAlex Elder out:
30538b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30548b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30558b3e1a56SAlex Elder }
30568b3e1a56SAlex Elder 
30578b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30588b3e1a56SAlex Elder {
30598b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30608b3e1a56SAlex Elder 	int result;
30618b3e1a56SAlex Elder 
30628b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30638b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
30648b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
30655b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
30668b3e1a56SAlex Elder 
30678b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3068e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
30698b3e1a56SAlex Elder 						obj_request->img_offset,
3070e93f3152SAlex Elder 						obj_request->length);
30718b3e1a56SAlex Elder 	result = -ENOMEM;
30728b3e1a56SAlex Elder 	if (!img_request)
30738b3e1a56SAlex Elder 		goto out_err;
30748b3e1a56SAlex Elder 
30755b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3076f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3077f1a4739fSAlex Elder 						obj_request->bio_list);
30785b2ab72dSAlex Elder 	else
30795b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
30805b2ab72dSAlex Elder 						obj_request->pages);
30818b3e1a56SAlex Elder 	if (result)
30828b3e1a56SAlex Elder 		goto out_err;
30838b3e1a56SAlex Elder 
30848b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
30858b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
30868b3e1a56SAlex Elder 	if (result)
30878b3e1a56SAlex Elder 		goto out_err;
30888b3e1a56SAlex Elder 
30898b3e1a56SAlex Elder 	return;
30908b3e1a56SAlex Elder out_err:
30918b3e1a56SAlex Elder 	if (img_request)
30928b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
30938b3e1a56SAlex Elder 	obj_request->result = result;
30948b3e1a56SAlex Elder 	obj_request->xferred = 0;
30958b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
30968b3e1a56SAlex Elder }
30978b3e1a56SAlex Elder 
309820e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
3099b8d70035SAlex Elder {
3100b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
31012169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3102b8d70035SAlex Elder 	int ret;
3103b8d70035SAlex Elder 
3104b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3105b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
3106b8d70035SAlex Elder 	if (!obj_request)
3107b8d70035SAlex Elder 		return -ENOMEM;
3108b8d70035SAlex Elder 
3109b8d70035SAlex Elder 	ret = -ENOMEM;
31106d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3111deb236b3SIlya Dryomov 						  obj_request);
3112b8d70035SAlex Elder 	if (!obj_request->osd_req)
3113b8d70035SAlex Elder 		goto out;
3114b8d70035SAlex Elder 
3115c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
3116cc4a38bdSAlex Elder 					notify_id, 0, 0);
31179d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3118430c28c3SAlex Elder 
3119b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3120cf81b60eSAlex Elder 	if (ret)
312120e0af67SJosh Durgin 		goto out;
312220e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
312320e0af67SJosh Durgin out:
3124b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
3125b8d70035SAlex Elder 
3126b8d70035SAlex Elder 	return ret;
3127b8d70035SAlex Elder }
3128b8d70035SAlex Elder 
3129b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
3130b8d70035SAlex Elder {
3131b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
3132e627db08SAlex Elder 	int ret;
3133b8d70035SAlex Elder 
3134b8d70035SAlex Elder 	if (!rbd_dev)
3135b8d70035SAlex Elder 		return;
3136b8d70035SAlex Elder 
313737206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
3138b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
3139b8d70035SAlex Elder 		(unsigned int)opcode);
314052bb1f9bSIlya Dryomov 
314152bb1f9bSIlya Dryomov 	/*
314252bb1f9bSIlya Dryomov 	 * Until adequate refresh error handling is in place, there is
314352bb1f9bSIlya Dryomov 	 * not much we can do here, except warn.
314452bb1f9bSIlya Dryomov 	 *
314552bb1f9bSIlya Dryomov 	 * See http://tracker.ceph.com/issues/5040
314652bb1f9bSIlya Dryomov 	 */
3147e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3148e627db08SAlex Elder 	if (ret)
31499584d508SIlya Dryomov 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
3150b8d70035SAlex Elder 
315152bb1f9bSIlya Dryomov 	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
315252bb1f9bSIlya Dryomov 	if (ret)
31539584d508SIlya Dryomov 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
3154b8d70035SAlex Elder }
3155b8d70035SAlex Elder 
31569969ebc5SAlex Elder /*
3157bb040aa0SIlya Dryomov  * Send a (un)watch request and wait for the ack.  Return a request
3158bb040aa0SIlya Dryomov  * with a ref held on success or error.
3159bb040aa0SIlya Dryomov  */
3160bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper(
3161bb040aa0SIlya Dryomov 						struct rbd_device *rbd_dev,
3162bb040aa0SIlya Dryomov 						bool watch)
3163bb040aa0SIlya Dryomov {
3164bb040aa0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
31652894e1d7SIlya Dryomov 	struct ceph_options *opts = osdc->client->options;
3166bb040aa0SIlya Dryomov 	struct rbd_obj_request *obj_request;
3167bb040aa0SIlya Dryomov 	int ret;
3168bb040aa0SIlya Dryomov 
3169bb040aa0SIlya Dryomov 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3170bb040aa0SIlya Dryomov 					     OBJ_REQUEST_NODATA);
3171bb040aa0SIlya Dryomov 	if (!obj_request)
3172bb040aa0SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3173bb040aa0SIlya Dryomov 
31746d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
3175bb040aa0SIlya Dryomov 						  obj_request);
3176bb040aa0SIlya Dryomov 	if (!obj_request->osd_req) {
3177bb040aa0SIlya Dryomov 		ret = -ENOMEM;
3178bb040aa0SIlya Dryomov 		goto out;
3179bb040aa0SIlya Dryomov 	}
3180bb040aa0SIlya Dryomov 
3181bb040aa0SIlya Dryomov 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
3182bb040aa0SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, watch);
3183bb040aa0SIlya Dryomov 	rbd_osd_req_format_write(obj_request);
3184bb040aa0SIlya Dryomov 
3185bb040aa0SIlya Dryomov 	if (watch)
3186bb040aa0SIlya Dryomov 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
3187bb040aa0SIlya Dryomov 
3188bb040aa0SIlya Dryomov 	ret = rbd_obj_request_submit(osdc, obj_request);
3189bb040aa0SIlya Dryomov 	if (ret)
3190bb040aa0SIlya Dryomov 		goto out;
3191bb040aa0SIlya Dryomov 
31922894e1d7SIlya Dryomov 	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
3193bb040aa0SIlya Dryomov 	if (ret)
3194bb040aa0SIlya Dryomov 		goto out;
3195bb040aa0SIlya Dryomov 
3196bb040aa0SIlya Dryomov 	ret = obj_request->result;
3197bb040aa0SIlya Dryomov 	if (ret) {
3198bb040aa0SIlya Dryomov 		if (watch)
3199bb040aa0SIlya Dryomov 			rbd_obj_request_end(obj_request);
3200bb040aa0SIlya Dryomov 		goto out;
3201bb040aa0SIlya Dryomov 	}
3202bb040aa0SIlya Dryomov 
3203bb040aa0SIlya Dryomov 	return obj_request;
3204bb040aa0SIlya Dryomov 
3205bb040aa0SIlya Dryomov out:
3206bb040aa0SIlya Dryomov 	rbd_obj_request_put(obj_request);
3207bb040aa0SIlya Dryomov 	return ERR_PTR(ret);
3208bb040aa0SIlya Dryomov }
3209bb040aa0SIlya Dryomov 
3210bb040aa0SIlya Dryomov /*
3211b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
32129969ebc5SAlex Elder  */
3213b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
32149969ebc5SAlex Elder {
32159969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
32169969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
32179969ebc5SAlex Elder 	int ret;
32189969ebc5SAlex Elder 
3219b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_event);
3220b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_request);
32219969ebc5SAlex Elder 
32223c663bbdSAlex Elder 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
32239969ebc5SAlex Elder 				     &rbd_dev->watch_event);
32249969ebc5SAlex Elder 	if (ret < 0)
32259969ebc5SAlex Elder 		return ret;
32269969ebc5SAlex Elder 
322776756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
322876756a51SIlya Dryomov 	if (IS_ERR(obj_request)) {
322976756a51SIlya Dryomov 		ceph_osdc_cancel_event(rbd_dev->watch_event);
323076756a51SIlya Dryomov 		rbd_dev->watch_event = NULL;
323176756a51SIlya Dryomov 		return PTR_ERR(obj_request);
3232b30a01f2SIlya Dryomov 	}
32339969ebc5SAlex Elder 
32348eb87565SAlex Elder 	/*
32358eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
32368eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
32378eb87565SAlex Elder 	 * a pointer to the object request during that time (in
323876756a51SIlya Dryomov 	 * rbd_dev->watch_request), so we'll keep a reference to it.
323976756a51SIlya Dryomov 	 * We'll drop that reference after we've unregistered it in
324076756a51SIlya Dryomov 	 * rbd_dev_header_unwatch_sync().
32418eb87565SAlex Elder 	 */
32428eb87565SAlex Elder 	rbd_dev->watch_request = obj_request;
32438eb87565SAlex Elder 
32448eb87565SAlex Elder 	return 0;
32459969ebc5SAlex Elder }
32469969ebc5SAlex Elder 
3247b30a01f2SIlya Dryomov /*
3248b30a01f2SIlya Dryomov  * Tear down a watch request, synchronously.
3249b30a01f2SIlya Dryomov  */
325076756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3251fca27065SIlya Dryomov {
3252b30a01f2SIlya Dryomov 	struct rbd_obj_request *obj_request;
3253b30a01f2SIlya Dryomov 
3254b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
3255b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_request);
3256b30a01f2SIlya Dryomov 
325776756a51SIlya Dryomov 	rbd_obj_request_end(rbd_dev->watch_request);
3258b30a01f2SIlya Dryomov 	rbd_obj_request_put(rbd_dev->watch_request);
3259b30a01f2SIlya Dryomov 	rbd_dev->watch_request = NULL;
3260b30a01f2SIlya Dryomov 
326176756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
326276756a51SIlya Dryomov 	if (!IS_ERR(obj_request))
3263b30a01f2SIlya Dryomov 		rbd_obj_request_put(obj_request);
326476756a51SIlya Dryomov 	else
326576756a51SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
326676756a51SIlya Dryomov 			 PTR_ERR(obj_request));
326776756a51SIlya Dryomov 
3268b30a01f2SIlya Dryomov 	ceph_osdc_cancel_event(rbd_dev->watch_event);
3269b30a01f2SIlya Dryomov 	rbd_dev->watch_event = NULL;
3270fca27065SIlya Dryomov }
3271fca27065SIlya Dryomov 
327236be9a76SAlex Elder /*
3273f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3274f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
327536be9a76SAlex Elder  */
327636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
327736be9a76SAlex Elder 			     const char *object_name,
327836be9a76SAlex Elder 			     const char *class_name,
327936be9a76SAlex Elder 			     const char *method_name,
32804157976bSAlex Elder 			     const void *outbound,
328136be9a76SAlex Elder 			     size_t outbound_size,
32824157976bSAlex Elder 			     void *inbound,
3283e2a58ee5SAlex Elder 			     size_t inbound_size)
328436be9a76SAlex Elder {
32852169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
328636be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
328736be9a76SAlex Elder 	struct page **pages;
328836be9a76SAlex Elder 	u32 page_count;
328936be9a76SAlex Elder 	int ret;
329036be9a76SAlex Elder 
329136be9a76SAlex Elder 	/*
32926010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
32936010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
32946010a451SAlex Elder 	 * also supply outbound data--parameters for the object
32956010a451SAlex Elder 	 * method.  Currently if this is present it will be a
32966010a451SAlex Elder 	 * snapshot id.
329736be9a76SAlex Elder 	 */
329836be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
329936be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
330036be9a76SAlex Elder 	if (IS_ERR(pages))
330136be9a76SAlex Elder 		return PTR_ERR(pages);
330236be9a76SAlex Elder 
330336be9a76SAlex Elder 	ret = -ENOMEM;
33046010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
330536be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
330636be9a76SAlex Elder 	if (!obj_request)
330736be9a76SAlex Elder 		goto out;
330836be9a76SAlex Elder 
330936be9a76SAlex Elder 	obj_request->pages = pages;
331036be9a76SAlex Elder 	obj_request->page_count = page_count;
331136be9a76SAlex Elder 
33126d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3313deb236b3SIlya Dryomov 						  obj_request);
331436be9a76SAlex Elder 	if (!obj_request->osd_req)
331536be9a76SAlex Elder 		goto out;
331636be9a76SAlex Elder 
3317c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
331804017e29SAlex Elder 					class_name, method_name);
331904017e29SAlex Elder 	if (outbound_size) {
332004017e29SAlex Elder 		struct ceph_pagelist *pagelist;
332104017e29SAlex Elder 
332204017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
332304017e29SAlex Elder 		if (!pagelist)
332404017e29SAlex Elder 			goto out;
332504017e29SAlex Elder 
332604017e29SAlex Elder 		ceph_pagelist_init(pagelist);
332704017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
332804017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
332904017e29SAlex Elder 						pagelist);
333004017e29SAlex Elder 	}
3331a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3332a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
333344cd188dSAlex Elder 					0, false, false);
33349d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3335430c28c3SAlex Elder 
333636be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
333736be9a76SAlex Elder 	if (ret)
333836be9a76SAlex Elder 		goto out;
333936be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
334036be9a76SAlex Elder 	if (ret)
334136be9a76SAlex Elder 		goto out;
334236be9a76SAlex Elder 
334336be9a76SAlex Elder 	ret = obj_request->result;
334436be9a76SAlex Elder 	if (ret < 0)
334536be9a76SAlex Elder 		goto out;
334657385b51SAlex Elder 
334757385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
334857385b51SAlex Elder 	ret = (int)obj_request->xferred;
3349903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
335036be9a76SAlex Elder out:
335136be9a76SAlex Elder 	if (obj_request)
335236be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
335336be9a76SAlex Elder 	else
335436be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
335536be9a76SAlex Elder 
335636be9a76SAlex Elder 	return ret;
335736be9a76SAlex Elder }
335836be9a76SAlex Elder 
33597ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3360bc1ecc65SIlya Dryomov {
33617ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
33627ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3363bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
33644e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3365bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3366bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
33676d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
33684e752f0aSJosh Durgin 	u64 mapping_size;
3369bc1ecc65SIlya Dryomov 	int result;
3370bc1ecc65SIlya Dryomov 
33717ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
33727ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
33737ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
33747ad18afaSChristoph Hellwig 		result = -EIO;
33757ad18afaSChristoph Hellwig 		goto err;
33767ad18afaSChristoph Hellwig 	}
33777ad18afaSChristoph Hellwig 
337890e98c52SGuangliang Zhao 	if (rq->cmd_flags & REQ_DISCARD)
337990e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
338090e98c52SGuangliang Zhao 	else if (rq->cmd_flags & REQ_WRITE)
33816d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
33826d2940c8SGuangliang Zhao 	else
33836d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
33846d2940c8SGuangliang Zhao 
3385bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3386bc1ecc65SIlya Dryomov 
3387bc1ecc65SIlya Dryomov 	if (!length) {
3388bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3389bc1ecc65SIlya Dryomov 		result = 0;
3390bc1ecc65SIlya Dryomov 		goto err_rq;
3391bc1ecc65SIlya Dryomov 	}
3392bc1ecc65SIlya Dryomov 
33936d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
3394bc1ecc65SIlya Dryomov 
33956d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
3396bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
3397bc1ecc65SIlya Dryomov 			result = -EROFS;
3398bc1ecc65SIlya Dryomov 			goto err_rq;
3399bc1ecc65SIlya Dryomov 		}
3400bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3401bc1ecc65SIlya Dryomov 	}
3402bc1ecc65SIlya Dryomov 
3403bc1ecc65SIlya Dryomov 	/*
3404bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3405bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3406bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3407bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3408bc1ecc65SIlya Dryomov 	 */
3409bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3410bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3411bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3412bc1ecc65SIlya Dryomov 		result = -ENXIO;
3413bc1ecc65SIlya Dryomov 		goto err_rq;
3414bc1ecc65SIlya Dryomov 	}
3415bc1ecc65SIlya Dryomov 
3416bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3417bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3418bc1ecc65SIlya Dryomov 			 length);
3419bc1ecc65SIlya Dryomov 		result = -EINVAL;
3420bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3421bc1ecc65SIlya Dryomov 	}
3422bc1ecc65SIlya Dryomov 
34237ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
34247ad18afaSChristoph Hellwig 
34254e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
34264e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
34276d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
34284e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
34294e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
34304e752f0aSJosh Durgin 	}
34314e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
34324e752f0aSJosh Durgin 
34334e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3434bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
34354e752f0aSJosh Durgin 			 length, mapping_size);
3436bc1ecc65SIlya Dryomov 		result = -EIO;
3437bc1ecc65SIlya Dryomov 		goto err_rq;
3438bc1ecc65SIlya Dryomov 	}
3439bc1ecc65SIlya Dryomov 
34406d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
34414e752f0aSJosh Durgin 					     snapc);
3442bc1ecc65SIlya Dryomov 	if (!img_request) {
3443bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3444bc1ecc65SIlya Dryomov 		goto err_rq;
3445bc1ecc65SIlya Dryomov 	}
3446bc1ecc65SIlya Dryomov 	img_request->rq = rq;
3447bc1ecc65SIlya Dryomov 
344890e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
344990e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
345090e98c52SGuangliang Zhao 					      NULL);
345190e98c52SGuangliang Zhao 	else
345290e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
345390e98c52SGuangliang Zhao 					      rq->bio);
3454bc1ecc65SIlya Dryomov 	if (result)
3455bc1ecc65SIlya Dryomov 		goto err_img_request;
3456bc1ecc65SIlya Dryomov 
3457bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
3458bc1ecc65SIlya Dryomov 	if (result)
3459bc1ecc65SIlya Dryomov 		goto err_img_request;
3460bc1ecc65SIlya Dryomov 
3461bc1ecc65SIlya Dryomov 	return;
3462bc1ecc65SIlya Dryomov 
3463bc1ecc65SIlya Dryomov err_img_request:
3464bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3465bc1ecc65SIlya Dryomov err_rq:
3466bc1ecc65SIlya Dryomov 	if (result)
3467bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
34686d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
34694e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
34707ad18afaSChristoph Hellwig err:
34717ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
3472bc1ecc65SIlya Dryomov }
3473bc1ecc65SIlya Dryomov 
34747ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
34757ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3476bc1ecc65SIlya Dryomov {
34777ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
34787ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3479bc1ecc65SIlya Dryomov 
34807ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
34817ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
3482bf0d5f50SAlex Elder }
3483bf0d5f50SAlex Elder 
3484602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3485602adf40SYehuda Sadeh {
3486602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3487602adf40SYehuda Sadeh 
3488602adf40SYehuda Sadeh 	if (!disk)
3489602adf40SYehuda Sadeh 		return;
3490602adf40SYehuda Sadeh 
3491a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3492a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3493602adf40SYehuda Sadeh 		del_gendisk(disk);
3494602adf40SYehuda Sadeh 		if (disk->queue)
3495602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
34967ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
3497a0cab924SAlex Elder 	}
3498602adf40SYehuda Sadeh 	put_disk(disk);
3499602adf40SYehuda Sadeh }
3500602adf40SYehuda Sadeh 
3501788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3502788e2df3SAlex Elder 				const char *object_name,
35037097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3504788e2df3SAlex Elder 
3505788e2df3SAlex Elder {
35062169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3507788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3508788e2df3SAlex Elder 	struct page **pages = NULL;
3509788e2df3SAlex Elder 	u32 page_count;
35101ceae7efSAlex Elder 	size_t size;
3511788e2df3SAlex Elder 	int ret;
3512788e2df3SAlex Elder 
3513788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3514788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3515788e2df3SAlex Elder 	if (IS_ERR(pages))
3516a8d42056SJan Kara 		return PTR_ERR(pages);
3517788e2df3SAlex Elder 
3518788e2df3SAlex Elder 	ret = -ENOMEM;
3519788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3520788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3521788e2df3SAlex Elder 	if (!obj_request)
3522788e2df3SAlex Elder 		goto out;
3523788e2df3SAlex Elder 
3524788e2df3SAlex Elder 	obj_request->pages = pages;
3525788e2df3SAlex Elder 	obj_request->page_count = page_count;
3526788e2df3SAlex Elder 
35276d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3528deb236b3SIlya Dryomov 						  obj_request);
3529788e2df3SAlex Elder 	if (!obj_request->osd_req)
3530788e2df3SAlex Elder 		goto out;
3531788e2df3SAlex Elder 
3532c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3533c99d2d4aSAlex Elder 					offset, length, 0, 0);
3534406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3535a4ce40a9SAlex Elder 					obj_request->pages,
353644cd188dSAlex Elder 					obj_request->length,
353744cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
353844cd188dSAlex Elder 					false, false);
35399d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3540430c28c3SAlex Elder 
3541788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3542788e2df3SAlex Elder 	if (ret)
3543788e2df3SAlex Elder 		goto out;
3544788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3545788e2df3SAlex Elder 	if (ret)
3546788e2df3SAlex Elder 		goto out;
3547788e2df3SAlex Elder 
3548788e2df3SAlex Elder 	ret = obj_request->result;
3549788e2df3SAlex Elder 	if (ret < 0)
3550788e2df3SAlex Elder 		goto out;
35511ceae7efSAlex Elder 
35521ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
35531ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3554903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
355523ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
355623ed6e13SAlex Elder 	ret = (int)size;
3557788e2df3SAlex Elder out:
3558788e2df3SAlex Elder 	if (obj_request)
3559788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3560788e2df3SAlex Elder 	else
3561788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3562788e2df3SAlex Elder 
3563788e2df3SAlex Elder 	return ret;
3564788e2df3SAlex Elder }
3565788e2df3SAlex Elder 
3566602adf40SYehuda Sadeh /*
3567662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3568662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3569662518b1SAlex Elder  * information about the image.
35704156d998SAlex Elder  */
357199a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
35724156d998SAlex Elder {
35734156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
35744156d998SAlex Elder 	u32 snap_count = 0;
35754156d998SAlex Elder 	u64 names_size = 0;
35764156d998SAlex Elder 	u32 want_count;
35774156d998SAlex Elder 	int ret;
35784156d998SAlex Elder 
35794156d998SAlex Elder 	/*
35804156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
35814156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
35824156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
35834156d998SAlex Elder 	 * the number of snapshots could change by the time we read
35844156d998SAlex Elder 	 * it in, in which case we re-read it.
35854156d998SAlex Elder 	 */
35864156d998SAlex Elder 	do {
35874156d998SAlex Elder 		size_t size;
35884156d998SAlex Elder 
35894156d998SAlex Elder 		kfree(ondisk);
35904156d998SAlex Elder 
35914156d998SAlex Elder 		size = sizeof (*ondisk);
35924156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
35934156d998SAlex Elder 		size += names_size;
35944156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
35954156d998SAlex Elder 		if (!ondisk)
3596662518b1SAlex Elder 			return -ENOMEM;
35974156d998SAlex Elder 
3598788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
35997097f8dfSAlex Elder 				       0, size, ondisk);
36004156d998SAlex Elder 		if (ret < 0)
3601662518b1SAlex Elder 			goto out;
3602c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
36034156d998SAlex Elder 			ret = -ENXIO;
360406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
360506ecc6cbSAlex Elder 				size, ret);
3606662518b1SAlex Elder 			goto out;
36074156d998SAlex Elder 		}
36084156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
36094156d998SAlex Elder 			ret = -ENXIO;
361006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3611662518b1SAlex Elder 			goto out;
36124156d998SAlex Elder 		}
36134156d998SAlex Elder 
36144156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
36154156d998SAlex Elder 		want_count = snap_count;
36164156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
36174156d998SAlex Elder 	} while (snap_count != want_count);
36184156d998SAlex Elder 
3619662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3620662518b1SAlex Elder out:
36214156d998SAlex Elder 	kfree(ondisk);
36224156d998SAlex Elder 
3623dfc5606dSYehuda Sadeh 	return ret;
3624602adf40SYehuda Sadeh }
3625602adf40SYehuda Sadeh 
362615228edeSAlex Elder /*
362715228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
362815228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
362915228edeSAlex Elder  */
363015228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
363115228edeSAlex Elder {
363215228edeSAlex Elder 	u64 snap_id;
363315228edeSAlex Elder 
363415228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
363515228edeSAlex Elder 		return;
363615228edeSAlex Elder 
363715228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
363815228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
363915228edeSAlex Elder 		return;
364015228edeSAlex Elder 
364115228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
364215228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
364315228edeSAlex Elder }
364415228edeSAlex Elder 
36459875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
36469875201eSJosh Durgin {
36479875201eSJosh Durgin 	sector_t size;
36489875201eSJosh Durgin 	bool removing;
36499875201eSJosh Durgin 
36509875201eSJosh Durgin 	/*
36519875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
36529875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
36539875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
36549875201eSJosh Durgin 	 */
36559875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
36569875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
36579875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
36589875201eSJosh Durgin 	/*
36599875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
36609875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
36619875201eSJosh Durgin 	 */
36629875201eSJosh Durgin 	if (!removing) {
36639875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
36649875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
36659875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
36669875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
36679875201eSJosh Durgin 	}
36689875201eSJosh Durgin }
36699875201eSJosh Durgin 
3670cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
36711fe5e993SAlex Elder {
3672e627db08SAlex Elder 	u64 mapping_size;
36731fe5e993SAlex Elder 	int ret;
36741fe5e993SAlex Elder 
3675cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
36763b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3677a720ae09SIlya Dryomov 
3678a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
367952bb1f9bSIlya Dryomov 	if (ret)
368073e39e4dSIlya Dryomov 		goto out;
368115228edeSAlex Elder 
3682e8f59b59SIlya Dryomov 	/*
3683e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
3684e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
3685e8f59b59SIlya Dryomov 	 */
3686e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
3687e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
3688e8f59b59SIlya Dryomov 		if (ret)
368973e39e4dSIlya Dryomov 			goto out;
3690e8f59b59SIlya Dryomov 	}
3691e8f59b59SIlya Dryomov 
36925ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
36935ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
36945ff1108cSIlya Dryomov 	} else {
36955ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
369615228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
36975ff1108cSIlya Dryomov 	}
36985ff1108cSIlya Dryomov 
369973e39e4dSIlya Dryomov out:
3700cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
370173e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
37029875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
37031fe5e993SAlex Elder 
370473e39e4dSIlya Dryomov 	return ret;
37051fe5e993SAlex Elder }
37061fe5e993SAlex Elder 
37077ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
37087ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
37097ad18afaSChristoph Hellwig 		unsigned int numa_node)
37107ad18afaSChristoph Hellwig {
37117ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
37127ad18afaSChristoph Hellwig 
37137ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
37147ad18afaSChristoph Hellwig 	return 0;
37157ad18afaSChristoph Hellwig }
37167ad18afaSChristoph Hellwig 
37177ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
37187ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
37197ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
37207ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
37217ad18afaSChristoph Hellwig };
37227ad18afaSChristoph Hellwig 
3723602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3724602adf40SYehuda Sadeh {
3725602adf40SYehuda Sadeh 	struct gendisk *disk;
3726602adf40SYehuda Sadeh 	struct request_queue *q;
3727593a9e7bSAlex Elder 	u64 segment_size;
37287ad18afaSChristoph Hellwig 	int err;
3729602adf40SYehuda Sadeh 
3730602adf40SYehuda Sadeh 	/* create gendisk info */
37317e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
37327e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
37337e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3734602adf40SYehuda Sadeh 	if (!disk)
37351fcdb8aaSAlex Elder 		return -ENOMEM;
3736602adf40SYehuda Sadeh 
3737f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3738de71a297SAlex Elder 		 rbd_dev->dev_id);
3739602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3740dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
37417e513d43SIlya Dryomov 	if (single_major)
37427e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3743602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3744602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3745602adf40SYehuda Sadeh 
37467ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
37477ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
3748b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
37497ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
3750b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
37517ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
37527ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
37537ad18afaSChristoph Hellwig 
37547ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
37557ad18afaSChristoph Hellwig 	if (err)
3756602adf40SYehuda Sadeh 		goto out_disk;
3757029bcbd8SJosh Durgin 
37587ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
37597ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
37607ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
37617ad18afaSChristoph Hellwig 		goto out_tag_set;
37627ad18afaSChristoph Hellwig 	}
37637ad18afaSChristoph Hellwig 
3764d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3765d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
3766593a9e7bSAlex Elder 
3767029bcbd8SJosh Durgin 	/* set io sizes to object size */
3768593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3769593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
37700d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
3771d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
3772593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3773593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3774593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3775029bcbd8SJosh Durgin 
377690e98c52SGuangliang Zhao 	/* enable the discard support */
377790e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
377890e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
377990e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
37802bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
3781b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
378290e98c52SGuangliang Zhao 
3783bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
3784bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
3785bae818eeSRonny Hegewald 
3786602adf40SYehuda Sadeh 	disk->queue = q;
3787602adf40SYehuda Sadeh 
3788602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3789602adf40SYehuda Sadeh 
3790602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3791602adf40SYehuda Sadeh 
3792602adf40SYehuda Sadeh 	return 0;
37937ad18afaSChristoph Hellwig out_tag_set:
37947ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
3795602adf40SYehuda Sadeh out_disk:
3796602adf40SYehuda Sadeh 	put_disk(disk);
37977ad18afaSChristoph Hellwig 	return err;
3798602adf40SYehuda Sadeh }
3799602adf40SYehuda Sadeh 
3800dfc5606dSYehuda Sadeh /*
3801dfc5606dSYehuda Sadeh   sysfs
3802dfc5606dSYehuda Sadeh */
3803602adf40SYehuda Sadeh 
3804593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3805593a9e7bSAlex Elder {
3806593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3807593a9e7bSAlex Elder }
3808593a9e7bSAlex Elder 
3809dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3810dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3811602adf40SYehuda Sadeh {
3812593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3813dfc5606dSYehuda Sadeh 
3814fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3815fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3816602adf40SYehuda Sadeh }
3817602adf40SYehuda Sadeh 
381834b13184SAlex Elder /*
381934b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
382034b13184SAlex Elder  * necessarily the base image.
382134b13184SAlex Elder  */
382234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
382334b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
382434b13184SAlex Elder {
382534b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
382634b13184SAlex Elder 
382734b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
382834b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
382934b13184SAlex Elder }
383034b13184SAlex Elder 
3831dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3832dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3833602adf40SYehuda Sadeh {
3834593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3835dfc5606dSYehuda Sadeh 
3836fc71d833SAlex Elder 	if (rbd_dev->major)
3837dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3838fc71d833SAlex Elder 
3839fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3840dd82fff1SIlya Dryomov }
3841fc71d833SAlex Elder 
3842dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3843dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3844dd82fff1SIlya Dryomov {
3845dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3846dd82fff1SIlya Dryomov 
3847dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3848dfc5606dSYehuda Sadeh }
3849dfc5606dSYehuda Sadeh 
3850dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3851dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3852dfc5606dSYehuda Sadeh {
3853593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3854dfc5606dSYehuda Sadeh 
38551dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
38561dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3857dfc5606dSYehuda Sadeh }
3858dfc5606dSYehuda Sadeh 
3859dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3860dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3861dfc5606dSYehuda Sadeh {
3862593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3863dfc5606dSYehuda Sadeh 
38640d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3865dfc5606dSYehuda Sadeh }
3866dfc5606dSYehuda Sadeh 
38679bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
38689bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
38699bb2f334SAlex Elder {
38709bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
38719bb2f334SAlex Elder 
38720d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
38730d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
38749bb2f334SAlex Elder }
38759bb2f334SAlex Elder 
3876dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3877dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3878dfc5606dSYehuda Sadeh {
3879593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3880dfc5606dSYehuda Sadeh 
3881a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
38820d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3883a92ffdf8SAlex Elder 
3884a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3885dfc5606dSYehuda Sadeh }
3886dfc5606dSYehuda Sadeh 
3887589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3888589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3889589d30e0SAlex Elder {
3890589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3891589d30e0SAlex Elder 
38920d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3893589d30e0SAlex Elder }
3894589d30e0SAlex Elder 
389534b13184SAlex Elder /*
389634b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
389734b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
389834b13184SAlex Elder  */
3899dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3900dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3901dfc5606dSYehuda Sadeh 			     char *buf)
3902dfc5606dSYehuda Sadeh {
3903593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3904dfc5606dSYehuda Sadeh 
39050d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3906dfc5606dSYehuda Sadeh }
3907dfc5606dSYehuda Sadeh 
390886b00e0dSAlex Elder /*
3909ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
3910ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
3911ff96128fSIlya Dryomov  * image)".
391286b00e0dSAlex Elder  */
391386b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
391486b00e0dSAlex Elder 			       struct device_attribute *attr,
391586b00e0dSAlex Elder 			       char *buf)
391686b00e0dSAlex Elder {
391786b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3918ff96128fSIlya Dryomov 	ssize_t count = 0;
391986b00e0dSAlex Elder 
3920ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
392186b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
392286b00e0dSAlex Elder 
3923ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3924ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
392586b00e0dSAlex Elder 
3926ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
3927ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
3928ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
3929ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
3930ff96128fSIlya Dryomov 			    "overlap %llu\n",
3931ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
3932ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
3933ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
3934ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
3935ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
3936ff96128fSIlya Dryomov 	}
393786b00e0dSAlex Elder 
393886b00e0dSAlex Elder 	return count;
393986b00e0dSAlex Elder }
394086b00e0dSAlex Elder 
3941dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3942dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3943dfc5606dSYehuda Sadeh 				 const char *buf,
3944dfc5606dSYehuda Sadeh 				 size_t size)
3945dfc5606dSYehuda Sadeh {
3946593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3947b813623aSAlex Elder 	int ret;
3948602adf40SYehuda Sadeh 
3949cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3950e627db08SAlex Elder 	if (ret)
395152bb1f9bSIlya Dryomov 		return ret;
3952b813623aSAlex Elder 
395352bb1f9bSIlya Dryomov 	return size;
3954dfc5606dSYehuda Sadeh }
3955602adf40SYehuda Sadeh 
3956dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
395734b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3958dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3959dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
3960dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3961dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
39629bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3963dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3964589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3965dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3966dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
396786b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3968dfc5606dSYehuda Sadeh 
3969dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3970dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
397134b13184SAlex Elder 	&dev_attr_features.attr,
3972dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3973dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
3974dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3975dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
39769bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3977dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3978589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3979dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
398086b00e0dSAlex Elder 	&dev_attr_parent.attr,
3981dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3982dfc5606dSYehuda Sadeh 	NULL
3983dfc5606dSYehuda Sadeh };
3984dfc5606dSYehuda Sadeh 
3985dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3986dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3987dfc5606dSYehuda Sadeh };
3988dfc5606dSYehuda Sadeh 
3989dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3990dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3991dfc5606dSYehuda Sadeh 	NULL
3992dfc5606dSYehuda Sadeh };
3993dfc5606dSYehuda Sadeh 
/* Release callback installed in rbd_device_type; it does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* intentionally empty */
}
3997dfc5606dSYehuda Sadeh 
3998dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3999dfc5606dSYehuda Sadeh 	.name		= "rbd",
4000dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
4001dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
4002dfc5606dSYehuda Sadeh };
4003dfc5606dSYehuda Sadeh 
40048b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
40058b8fb99cSAlex Elder {
40068b8fb99cSAlex Elder 	kref_get(&spec->kref);
40078b8fb99cSAlex Elder 
40088b8fb99cSAlex Elder 	return spec;
40098b8fb99cSAlex Elder }
40108b8fb99cSAlex Elder 
40118b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
40128b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
40138b8fb99cSAlex Elder {
40148b8fb99cSAlex Elder 	if (spec)
40158b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
40168b8fb99cSAlex Elder }
40178b8fb99cSAlex Elder 
40188b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
40198b8fb99cSAlex Elder {
40208b8fb99cSAlex Elder 	struct rbd_spec *spec;
40218b8fb99cSAlex Elder 
40228b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
40238b8fb99cSAlex Elder 	if (!spec)
40248b8fb99cSAlex Elder 		return NULL;
402504077599SIlya Dryomov 
402604077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
402704077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
40288b8fb99cSAlex Elder 	kref_init(&spec->kref);
40298b8fb99cSAlex Elder 
40308b8fb99cSAlex Elder 	return spec;
40318b8fb99cSAlex Elder }
40328b8fb99cSAlex Elder 
40338b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
40348b8fb99cSAlex Elder {
40358b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
40368b8fb99cSAlex Elder 
40378b8fb99cSAlex Elder 	kfree(spec->pool_name);
40388b8fb99cSAlex Elder 	kfree(spec->image_id);
40398b8fb99cSAlex Elder 	kfree(spec->image_name);
40408b8fb99cSAlex Elder 	kfree(spec->snap_name);
40418b8fb99cSAlex Elder 	kfree(spec);
40428b8fb99cSAlex Elder }
40438b8fb99cSAlex Elder 
4044cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4045d147543dSIlya Dryomov 					 struct rbd_spec *spec,
4046d147543dSIlya Dryomov 					 struct rbd_options *opts)
4047c53d5893SAlex Elder {
4048c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4049c53d5893SAlex Elder 
4050c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
4051c53d5893SAlex Elder 	if (!rbd_dev)
4052c53d5893SAlex Elder 		return NULL;
4053c53d5893SAlex Elder 
4054c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
40556d292906SAlex Elder 	rbd_dev->flags = 0;
4056a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
4057c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4058c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4059c53d5893SAlex Elder 
4060c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4061d147543dSIlya Dryomov 	rbd_dev->spec = spec;
4062d147543dSIlya Dryomov 	rbd_dev->opts = opts;
4063c53d5893SAlex Elder 
40640903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
40650903e875SAlex Elder 
40660903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40670903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
40680903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
40690903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
40700903e875SAlex Elder 
4071c53d5893SAlex Elder 	return rbd_dev;
4072c53d5893SAlex Elder }
4073c53d5893SAlex Elder 
4074c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4075c53d5893SAlex Elder {
4076c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
4077c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
4078d147543dSIlya Dryomov 	kfree(rbd_dev->opts);
4079c53d5893SAlex Elder 	kfree(rbd_dev);
4080c53d5893SAlex Elder }
4081c53d5893SAlex Elder 
4082dfc5606dSYehuda Sadeh /*
40839d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
40849d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
40859d475de5SAlex Elder  * image.
40869d475de5SAlex Elder  */
40879d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
40889d475de5SAlex Elder 				u8 *order, u64 *snap_size)
40899d475de5SAlex Elder {
40909d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
40919d475de5SAlex Elder 	int ret;
40929d475de5SAlex Elder 	struct {
40939d475de5SAlex Elder 		u8 order;
40949d475de5SAlex Elder 		__le64 size;
40959d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
40969d475de5SAlex Elder 
409736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
40989d475de5SAlex Elder 				"rbd", "get_size",
40994157976bSAlex Elder 				&snapid, sizeof (snapid),
4100e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
410136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41029d475de5SAlex Elder 	if (ret < 0)
41039d475de5SAlex Elder 		return ret;
410457385b51SAlex Elder 	if (ret < sizeof (size_buf))
410557385b51SAlex Elder 		return -ERANGE;
41069d475de5SAlex Elder 
4107c3545579SJosh Durgin 	if (order) {
41089d475de5SAlex Elder 		*order = size_buf.order;
4109c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4110c3545579SJosh Durgin 	}
41119d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
41129d475de5SAlex Elder 
4113c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4114c3545579SJosh Durgin 		(unsigned long long)snap_id,
41159d475de5SAlex Elder 		(unsigned long long)*snap_size);
41169d475de5SAlex Elder 
41179d475de5SAlex Elder 	return 0;
41189d475de5SAlex Elder }
41199d475de5SAlex Elder 
41209d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
41219d475de5SAlex Elder {
41229d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
41239d475de5SAlex Elder 					&rbd_dev->header.obj_order,
41249d475de5SAlex Elder 					&rbd_dev->header.image_size);
41259d475de5SAlex Elder }
41269d475de5SAlex Elder 
41271e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
41281e130199SAlex Elder {
41291e130199SAlex Elder 	void *reply_buf;
41301e130199SAlex Elder 	int ret;
41311e130199SAlex Elder 	void *p;
41321e130199SAlex Elder 
41331e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
41341e130199SAlex Elder 	if (!reply_buf)
41351e130199SAlex Elder 		return -ENOMEM;
41361e130199SAlex Elder 
413736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41384157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4139e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
414036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
41411e130199SAlex Elder 	if (ret < 0)
41421e130199SAlex Elder 		goto out;
41431e130199SAlex Elder 
41441e130199SAlex Elder 	p = reply_buf;
41451e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
414657385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
414757385b51SAlex Elder 	ret = 0;
41481e130199SAlex Elder 
41491e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
41501e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
41511e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
41521e130199SAlex Elder 	} else {
41531e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
41541e130199SAlex Elder 	}
41551e130199SAlex Elder out:
41561e130199SAlex Elder 	kfree(reply_buf);
41571e130199SAlex Elder 
41581e130199SAlex Elder 	return ret;
41591e130199SAlex Elder }
41601e130199SAlex Elder 
4161b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4162b1b5402aSAlex Elder 		u64 *snap_features)
4163b1b5402aSAlex Elder {
4164b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4165b1b5402aSAlex Elder 	struct {
4166b1b5402aSAlex Elder 		__le64 features;
4167b1b5402aSAlex Elder 		__le64 incompat;
41684157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4169d889140cSAlex Elder 	u64 incompat;
4170b1b5402aSAlex Elder 	int ret;
4171b1b5402aSAlex Elder 
417236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4173b1b5402aSAlex Elder 				"rbd", "get_features",
41744157976bSAlex Elder 				&snapid, sizeof (snapid),
4175e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
417636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4177b1b5402aSAlex Elder 	if (ret < 0)
4178b1b5402aSAlex Elder 		return ret;
417957385b51SAlex Elder 	if (ret < sizeof (features_buf))
418057385b51SAlex Elder 		return -ERANGE;
4181d889140cSAlex Elder 
4182d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
41835cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
4184b8f5c6edSAlex Elder 		return -ENXIO;
4185d889140cSAlex Elder 
4186b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4187b1b5402aSAlex Elder 
4188b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4189b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4190b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4191b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4192b1b5402aSAlex Elder 
4193b1b5402aSAlex Elder 	return 0;
4194b1b5402aSAlex Elder }
4195b1b5402aSAlex Elder 
4196b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4197b1b5402aSAlex Elder {
4198b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4199b1b5402aSAlex Elder 						&rbd_dev->header.features);
4200b1b5402aSAlex Elder }
4201b1b5402aSAlex Elder 
420286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
420386b00e0dSAlex Elder {
420486b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
420586b00e0dSAlex Elder 	size_t size;
420686b00e0dSAlex Elder 	void *reply_buf = NULL;
420786b00e0dSAlex Elder 	__le64 snapid;
420886b00e0dSAlex Elder 	void *p;
420986b00e0dSAlex Elder 	void *end;
4210642a2537SAlex Elder 	u64 pool_id;
421186b00e0dSAlex Elder 	char *image_id;
42123b5cf2a2SAlex Elder 	u64 snap_id;
421386b00e0dSAlex Elder 	u64 overlap;
421486b00e0dSAlex Elder 	int ret;
421586b00e0dSAlex Elder 
421686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
421786b00e0dSAlex Elder 	if (!parent_spec)
421886b00e0dSAlex Elder 		return -ENOMEM;
421986b00e0dSAlex Elder 
422086b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
422186b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
422286b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
422386b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
422486b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
422586b00e0dSAlex Elder 	if (!reply_buf) {
422686b00e0dSAlex Elder 		ret = -ENOMEM;
422786b00e0dSAlex Elder 		goto out_err;
422886b00e0dSAlex Elder 	}
422986b00e0dSAlex Elder 
42304d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
423136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
423286b00e0dSAlex Elder 				"rbd", "get_parent",
42334157976bSAlex Elder 				&snapid, sizeof (snapid),
4234e2a58ee5SAlex Elder 				reply_buf, size);
423536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
423686b00e0dSAlex Elder 	if (ret < 0)
423786b00e0dSAlex Elder 		goto out_err;
423886b00e0dSAlex Elder 
423986b00e0dSAlex Elder 	p = reply_buf;
424057385b51SAlex Elder 	end = reply_buf + ret;
424157385b51SAlex Elder 	ret = -ERANGE;
4242642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4243392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4244392a9dadSAlex Elder 		/*
4245392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4246392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4247392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4248392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4249392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4250392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4251392a9dadSAlex Elder 		 * parent.
4252392a9dadSAlex Elder 		 */
4253392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4254392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4255392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4256392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4257392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4258392a9dadSAlex Elder 		}
4259392a9dadSAlex Elder 
426086b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4261392a9dadSAlex Elder 	}
426286b00e0dSAlex Elder 
42630903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42640903e875SAlex Elder 
42650903e875SAlex Elder 	ret = -EIO;
4266642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
42679584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4268642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
426957385b51SAlex Elder 		goto out_err;
4270c0cd10dbSAlex Elder 	}
42710903e875SAlex Elder 
4272979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
427386b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
427486b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
427586b00e0dSAlex Elder 		goto out_err;
427686b00e0dSAlex Elder 	}
42773b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
427886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
427986b00e0dSAlex Elder 
42803b5cf2a2SAlex Elder 	/*
42813b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
42823b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
42833b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
42843b5cf2a2SAlex Elder 	 */
42853b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
42863b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
42873b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
42883b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
428986b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
429086b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
4291fbba11b3SIlya Dryomov 	} else {
4292fbba11b3SIlya Dryomov 		kfree(image_id);
42933b5cf2a2SAlex Elder 	}
42943b5cf2a2SAlex Elder 
42953b5cf2a2SAlex Elder 	/*
4296cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4297cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
42983b5cf2a2SAlex Elder 	 */
42993b5cf2a2SAlex Elder 	if (!overlap) {
43003b5cf2a2SAlex Elder 		if (parent_spec) {
4301cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
4302cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
4303cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
4304cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
430570cf49cfSAlex Elder 		} else {
4306cf32bd9cSIlya Dryomov 			/* initial probe */
4307cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
43083b5cf2a2SAlex Elder 		}
430970cf49cfSAlex Elder 	}
4310cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
4311cf32bd9cSIlya Dryomov 
431286b00e0dSAlex Elder out:
431386b00e0dSAlex Elder 	ret = 0;
431486b00e0dSAlex Elder out_err:
431586b00e0dSAlex Elder 	kfree(reply_buf);
431686b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
431786b00e0dSAlex Elder 
431886b00e0dSAlex Elder 	return ret;
431986b00e0dSAlex Elder }
432086b00e0dSAlex Elder 
4321cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4322cc070d59SAlex Elder {
4323cc070d59SAlex Elder 	struct {
4324cc070d59SAlex Elder 		__le64 stripe_unit;
4325cc070d59SAlex Elder 		__le64 stripe_count;
4326cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4327cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4328cc070d59SAlex Elder 	void *p;
4329cc070d59SAlex Elder 	u64 obj_size;
4330cc070d59SAlex Elder 	u64 stripe_unit;
4331cc070d59SAlex Elder 	u64 stripe_count;
4332cc070d59SAlex Elder 	int ret;
4333cc070d59SAlex Elder 
4334cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4335cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4336e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4337cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4338cc070d59SAlex Elder 	if (ret < 0)
4339cc070d59SAlex Elder 		return ret;
4340cc070d59SAlex Elder 	if (ret < size)
4341cc070d59SAlex Elder 		return -ERANGE;
4342cc070d59SAlex Elder 
4343cc070d59SAlex Elder 	/*
4344cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4345cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4346cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4347cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4348cc070d59SAlex Elder 	 */
4349cc070d59SAlex Elder 	ret = -EINVAL;
4350cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4351cc070d59SAlex Elder 	p = &striping_info_buf;
4352cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4353cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4354cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4355cc070d59SAlex Elder 				"(got %llu want %llu)",
4356cc070d59SAlex Elder 				stripe_unit, obj_size);
4357cc070d59SAlex Elder 		return -EINVAL;
4358cc070d59SAlex Elder 	}
4359cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4360cc070d59SAlex Elder 	if (stripe_count != 1) {
4361cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4362cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4363cc070d59SAlex Elder 		return -EINVAL;
4364cc070d59SAlex Elder 	}
4365500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4366500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4367cc070d59SAlex Elder 
4368cc070d59SAlex Elder 	return 0;
4369cc070d59SAlex Elder }
4370cc070d59SAlex Elder 
43719e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
43729e15b77dSAlex Elder {
43739e15b77dSAlex Elder 	size_t image_id_size;
43749e15b77dSAlex Elder 	char *image_id;
43759e15b77dSAlex Elder 	void *p;
43769e15b77dSAlex Elder 	void *end;
43779e15b77dSAlex Elder 	size_t size;
43789e15b77dSAlex Elder 	void *reply_buf = NULL;
43799e15b77dSAlex Elder 	size_t len = 0;
43809e15b77dSAlex Elder 	char *image_name = NULL;
43819e15b77dSAlex Elder 	int ret;
43829e15b77dSAlex Elder 
43839e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
43849e15b77dSAlex Elder 
438569e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
438669e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
43879e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
43889e15b77dSAlex Elder 	if (!image_id)
43899e15b77dSAlex Elder 		return NULL;
43909e15b77dSAlex Elder 
43919e15b77dSAlex Elder 	p = image_id;
43924157976bSAlex Elder 	end = image_id + image_id_size;
439369e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
43949e15b77dSAlex Elder 
43959e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
43969e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
43979e15b77dSAlex Elder 	if (!reply_buf)
43989e15b77dSAlex Elder 		goto out;
43999e15b77dSAlex Elder 
440036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
44019e15b77dSAlex Elder 				"rbd", "dir_get_name",
44029e15b77dSAlex Elder 				image_id, image_id_size,
4403e2a58ee5SAlex Elder 				reply_buf, size);
44049e15b77dSAlex Elder 	if (ret < 0)
44059e15b77dSAlex Elder 		goto out;
44069e15b77dSAlex Elder 	p = reply_buf;
4407f40eb349SAlex Elder 	end = reply_buf + ret;
4408f40eb349SAlex Elder 
44099e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
44109e15b77dSAlex Elder 	if (IS_ERR(image_name))
44119e15b77dSAlex Elder 		image_name = NULL;
44129e15b77dSAlex Elder 	else
44139e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
44149e15b77dSAlex Elder out:
44159e15b77dSAlex Elder 	kfree(reply_buf);
44169e15b77dSAlex Elder 	kfree(image_id);
44179e15b77dSAlex Elder 
44189e15b77dSAlex Elder 	return image_name;
44199e15b77dSAlex Elder }
44209e15b77dSAlex Elder 
44212ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44222ad3d716SAlex Elder {
44232ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44242ad3d716SAlex Elder 	const char *snap_name;
44252ad3d716SAlex Elder 	u32 which = 0;
44262ad3d716SAlex Elder 
44272ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
44282ad3d716SAlex Elder 
44292ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
44302ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
44312ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
44322ad3d716SAlex Elder 			return snapc->snaps[which];
44332ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
44342ad3d716SAlex Elder 		which++;
44352ad3d716SAlex Elder 	}
44362ad3d716SAlex Elder 	return CEPH_NOSNAP;
44372ad3d716SAlex Elder }
44382ad3d716SAlex Elder 
44392ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44402ad3d716SAlex Elder {
44412ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
44422ad3d716SAlex Elder 	u32 which;
44432ad3d716SAlex Elder 	bool found = false;
44442ad3d716SAlex Elder 	u64 snap_id;
44452ad3d716SAlex Elder 
44462ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
44472ad3d716SAlex Elder 		const char *snap_name;
44482ad3d716SAlex Elder 
44492ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
44502ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4451efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4452efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4453efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4454efadc98aSJosh Durgin 				continue;
4455efadc98aSJosh Durgin 			else
44562ad3d716SAlex Elder 				break;
4457efadc98aSJosh Durgin 		}
44582ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
44592ad3d716SAlex Elder 		kfree(snap_name);
44602ad3d716SAlex Elder 	}
44612ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
44622ad3d716SAlex Elder }
44632ad3d716SAlex Elder 
44642ad3d716SAlex Elder /*
44652ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
44662ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
44672ad3d716SAlex Elder  */
44682ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
44692ad3d716SAlex Elder {
44702ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
44712ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
44722ad3d716SAlex Elder 
44732ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
44742ad3d716SAlex Elder }
44752ad3d716SAlex Elder 
44769e15b77dSAlex Elder /*
447704077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
44789e15b77dSAlex Elder  */
447904077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
448004077599SIlya Dryomov {
448104077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
448204077599SIlya Dryomov 
448304077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
448404077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
448504077599SIlya Dryomov 	rbd_assert(spec->snap_name);
448604077599SIlya Dryomov 
448704077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
448804077599SIlya Dryomov 		u64 snap_id;
448904077599SIlya Dryomov 
449004077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
449104077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
449204077599SIlya Dryomov 			return -ENOENT;
449304077599SIlya Dryomov 
449404077599SIlya Dryomov 		spec->snap_id = snap_id;
449504077599SIlya Dryomov 	} else {
449604077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
449704077599SIlya Dryomov 	}
449804077599SIlya Dryomov 
449904077599SIlya Dryomov 	return 0;
450004077599SIlya Dryomov }
450104077599SIlya Dryomov 
450204077599SIlya Dryomov /*
450304077599SIlya Dryomov  * A parent image will have all ids but none of the names.
450404077599SIlya Dryomov  *
450504077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
450604077599SIlya Dryomov  * can't figure out the name for an image id.
450704077599SIlya Dryomov  */
450804077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
45099e15b77dSAlex Elder {
45102e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
45112e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
45122e9f7f1cSAlex Elder 	const char *pool_name;
45132e9f7f1cSAlex Elder 	const char *image_name;
45142e9f7f1cSAlex Elder 	const char *snap_name;
45159e15b77dSAlex Elder 	int ret;
45169e15b77dSAlex Elder 
451704077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
451804077599SIlya Dryomov 	rbd_assert(spec->image_id);
451904077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
45209e15b77dSAlex Elder 
45212e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
45229e15b77dSAlex Elder 
45232e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
45242e9f7f1cSAlex Elder 	if (!pool_name) {
45252e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4526935dc89fSAlex Elder 		return -EIO;
4527935dc89fSAlex Elder 	}
45282e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
45292e9f7f1cSAlex Elder 	if (!pool_name)
45309e15b77dSAlex Elder 		return -ENOMEM;
45319e15b77dSAlex Elder 
45329e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
45339e15b77dSAlex Elder 
45342e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
45352e9f7f1cSAlex Elder 	if (!image_name)
453606ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
45379e15b77dSAlex Elder 
453804077599SIlya Dryomov 	/* Fetch the snapshot name */
45399e15b77dSAlex Elder 
45402e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4541da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4542da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
45439e15b77dSAlex Elder 		goto out_err;
45442e9f7f1cSAlex Elder 	}
45452e9f7f1cSAlex Elder 
45462e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
45472e9f7f1cSAlex Elder 	spec->image_name = image_name;
45482e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
45499e15b77dSAlex Elder 
45509e15b77dSAlex Elder 	return 0;
455104077599SIlya Dryomov 
45529e15b77dSAlex Elder out_err:
45532e9f7f1cSAlex Elder 	kfree(image_name);
45542e9f7f1cSAlex Elder 	kfree(pool_name);
45559e15b77dSAlex Elder 	return ret;
45569e15b77dSAlex Elder }
45579e15b77dSAlex Elder 
4558cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
455935d489f9SAlex Elder {
456035d489f9SAlex Elder 	size_t size;
456135d489f9SAlex Elder 	int ret;
456235d489f9SAlex Elder 	void *reply_buf;
456335d489f9SAlex Elder 	void *p;
456435d489f9SAlex Elder 	void *end;
456535d489f9SAlex Elder 	u64 seq;
456635d489f9SAlex Elder 	u32 snap_count;
456735d489f9SAlex Elder 	struct ceph_snap_context *snapc;
456835d489f9SAlex Elder 	u32 i;
456935d489f9SAlex Elder 
457035d489f9SAlex Elder 	/*
457135d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
457235d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
457335d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
457435d489f9SAlex Elder 	 * prepared to receive.
457535d489f9SAlex Elder 	 */
457635d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
457735d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
457835d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
457935d489f9SAlex Elder 	if (!reply_buf)
458035d489f9SAlex Elder 		return -ENOMEM;
458135d489f9SAlex Elder 
458236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
45834157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4584e2a58ee5SAlex Elder 				reply_buf, size);
458536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
458635d489f9SAlex Elder 	if (ret < 0)
458735d489f9SAlex Elder 		goto out;
458835d489f9SAlex Elder 
458935d489f9SAlex Elder 	p = reply_buf;
459057385b51SAlex Elder 	end = reply_buf + ret;
459157385b51SAlex Elder 	ret = -ERANGE;
459235d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
459335d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
459435d489f9SAlex Elder 
459535d489f9SAlex Elder 	/*
459635d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
459735d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
459835d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
459935d489f9SAlex Elder 	 * allocate is representable in a size_t.
460035d489f9SAlex Elder 	 */
460135d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
460235d489f9SAlex Elder 				 / sizeof (u64)) {
460335d489f9SAlex Elder 		ret = -EINVAL;
460435d489f9SAlex Elder 		goto out;
460535d489f9SAlex Elder 	}
460635d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
460735d489f9SAlex Elder 		goto out;
4608468521c1SAlex Elder 	ret = 0;
460935d489f9SAlex Elder 
4610812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
461135d489f9SAlex Elder 	if (!snapc) {
461235d489f9SAlex Elder 		ret = -ENOMEM;
461335d489f9SAlex Elder 		goto out;
461435d489f9SAlex Elder 	}
461535d489f9SAlex Elder 	snapc->seq = seq;
461635d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
461735d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
461835d489f9SAlex Elder 
461949ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
462035d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
462135d489f9SAlex Elder 
462235d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
462335d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
462435d489f9SAlex Elder out:
462535d489f9SAlex Elder 	kfree(reply_buf);
462635d489f9SAlex Elder 
462757385b51SAlex Elder 	return ret;
462835d489f9SAlex Elder }
462935d489f9SAlex Elder 
463054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
463154cac61fSAlex Elder 					u64 snap_id)
4632b8b1e2dbSAlex Elder {
4633b8b1e2dbSAlex Elder 	size_t size;
4634b8b1e2dbSAlex Elder 	void *reply_buf;
463554cac61fSAlex Elder 	__le64 snapid;
4636b8b1e2dbSAlex Elder 	int ret;
4637b8b1e2dbSAlex Elder 	void *p;
4638b8b1e2dbSAlex Elder 	void *end;
4639b8b1e2dbSAlex Elder 	char *snap_name;
4640b8b1e2dbSAlex Elder 
4641b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4642b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4643b8b1e2dbSAlex Elder 	if (!reply_buf)
4644b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4645b8b1e2dbSAlex Elder 
464654cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
464736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4648b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
464954cac61fSAlex Elder 				&snapid, sizeof (snapid),
4650e2a58ee5SAlex Elder 				reply_buf, size);
465136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4652f40eb349SAlex Elder 	if (ret < 0) {
4653f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4654b8b1e2dbSAlex Elder 		goto out;
4655f40eb349SAlex Elder 	}
4656b8b1e2dbSAlex Elder 
4657b8b1e2dbSAlex Elder 	p = reply_buf;
4658f40eb349SAlex Elder 	end = reply_buf + ret;
4659e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4660f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4661b8b1e2dbSAlex Elder 		goto out;
4662f40eb349SAlex Elder 
4663b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
466454cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4665b8b1e2dbSAlex Elder out:
4666b8b1e2dbSAlex Elder 	kfree(reply_buf);
4667b8b1e2dbSAlex Elder 
4668f40eb349SAlex Elder 	return snap_name;
4669b8b1e2dbSAlex Elder }
4670b8b1e2dbSAlex Elder 
46712df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4672117973fbSAlex Elder {
46732df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4674117973fbSAlex Elder 	int ret;
4675117973fbSAlex Elder 
46761617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
46771617e40cSJosh Durgin 	if (ret)
4678cfbf6377SAlex Elder 		return ret;
46791617e40cSJosh Durgin 
46802df3fac7SAlex Elder 	if (first_time) {
46812df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
46822df3fac7SAlex Elder 		if (ret)
4683cfbf6377SAlex Elder 			return ret;
46842df3fac7SAlex Elder 	}
46852df3fac7SAlex Elder 
4686cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4687d194cd1dSIlya Dryomov 	if (ret && first_time) {
4688d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
4689d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
4690d194cd1dSIlya Dryomov 	}
4691117973fbSAlex Elder 
4692117973fbSAlex Elder 	return ret;
4693117973fbSAlex Elder }
4694117973fbSAlex Elder 
4695a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
4696a720ae09SIlya Dryomov {
4697a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4698a720ae09SIlya Dryomov 
4699a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
4700a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
4701a720ae09SIlya Dryomov 
4702a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
4703a720ae09SIlya Dryomov }
4704a720ae09SIlya Dryomov 
4705dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4706dfc5606dSYehuda Sadeh {
4707dfc5606dSYehuda Sadeh 	struct device *dev;
4708cd789ab9SAlex Elder 	int ret;
4709dfc5606dSYehuda Sadeh 
4710cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4711dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4712dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4713dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4714200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4715de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4716dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4717dfc5606dSYehuda Sadeh 
4718dfc5606dSYehuda Sadeh 	return ret;
4719602adf40SYehuda Sadeh }
4720602adf40SYehuda Sadeh 
4721dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4722dfc5606dSYehuda Sadeh {
4723dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4724dfc5606dSYehuda Sadeh }
4725dfc5606dSYehuda Sadeh 
47261ddbe94eSAlex Elder /*
4727499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4728f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
47291ddbe94eSAlex Elder  */
4730f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4731b7f23c36SAlex Elder {
4732f8a22fc2SIlya Dryomov 	int new_dev_id;
4733f8a22fc2SIlya Dryomov 
47349b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
47359b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
47369b60e70bSIlya Dryomov 				    GFP_KERNEL);
4737f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4738f8a22fc2SIlya Dryomov 		return new_dev_id;
4739f8a22fc2SIlya Dryomov 
4740f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4741499afd5bSAlex Elder 
4742499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4743499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4744499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4745f8a22fc2SIlya Dryomov 
474670eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4747f8a22fc2SIlya Dryomov 
4748f8a22fc2SIlya Dryomov 	return 0;
4749b7f23c36SAlex Elder }
4750b7f23c36SAlex Elder 
47511ddbe94eSAlex Elder /*
4752499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4753499afd5bSAlex Elder  * identifier is no longer in use.
47541ddbe94eSAlex Elder  */
4755e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
47561ddbe94eSAlex Elder {
4757499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4758499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4759499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
47601ddbe94eSAlex Elder 
4761f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4762f8a22fc2SIlya Dryomov 
4763f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4764b7f23c36SAlex Elder }
4765b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space, leaving it pointing at
 * the first non-space character (or at the terminating '\0' if there
 * is none).  Returns the length of the token found there, i.e. the
 * number of consecutive non-white-space characters; 0 means there is
 * no further token.  *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The set of characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	const char *const whitespace = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;				/* start of token */

	return strcspn(start, whitespace);	/* token length */
}
4784e28fff26SAlex Elder 
4785e28fff26SAlex Elder /*
4786ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4787ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4788ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4789ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4790ea3352f4SAlex Elder  *
4791ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4792ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4793ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4794ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4795ea3352f4SAlex Elder  *
4796ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4797ea3352f4SAlex Elder  * the end of the found token.
4798ea3352f4SAlex Elder  *
4799ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4800ea3352f4SAlex Elder  */
4801ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4802ea3352f4SAlex Elder {
4803ea3352f4SAlex Elder 	char *dup;
4804ea3352f4SAlex Elder 	size_t len;
4805ea3352f4SAlex Elder 
4806ea3352f4SAlex Elder 	len = next_token(buf);
48074caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4808ea3352f4SAlex Elder 	if (!dup)
4809ea3352f4SAlex Elder 		return NULL;
4810ea3352f4SAlex Elder 	*(dup + len) = '\0';
4811ea3352f4SAlex Elder 	*buf += len;
4812ea3352f4SAlex Elder 
4813ea3352f4SAlex Elder 	if (lenp)
4814ea3352f4SAlex Elder 		*lenp = len;
4815ea3352f4SAlex Elder 
4816ea3352f4SAlex Elder 	return dup;
4817ea3352f4SAlex Elder }
4818ea3352f4SAlex Elder 
4819ea3352f4SAlex Elder /*
4820859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4821859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4822859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4823859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4824d22f76e7SAlex Elder  *
4825859c31dfSAlex Elder  * The information extracted from these options is recorded in
4826859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4827859c31dfSAlex Elder  * structures:
4828859c31dfSAlex Elder  *  ceph_opts
4829859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4830859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4831859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4832859c31dfSAlex Elder  *  rbd_opts
4833859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4834859c31dfSAlex Elder  *	this function; caller must release with kfree().
4835859c31dfSAlex Elder  *  spec
4836859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4837859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4838859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4839859c31dfSAlex Elder  *
4840859c31dfSAlex Elder  * The options passed take this form:
4841859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4842859c31dfSAlex Elder  * where:
4843859c31dfSAlex Elder  *  <mon_addrs>
4844859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4845859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4846859c31dfSAlex Elder  *      by a port number (separated by a colon).
4847859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4848859c31dfSAlex Elder  *  <options>
4849859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4850859c31dfSAlex Elder  *  <pool_name>
4851859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4852859c31dfSAlex Elder  *  <image_name>
4853859c31dfSAlex Elder  *      The name of the image in that pool to map.
4854859c31dfSAlex Elder  *  <snap_id>
4855859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4856859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4857859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4858859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4859a725f65eSAlex Elder  */
4860859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4861dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4862859c31dfSAlex Elder 				struct rbd_options **opts,
4863859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4864a725f65eSAlex Elder {
4865e28fff26SAlex Elder 	size_t len;
4866859c31dfSAlex Elder 	char *options;
48670ddebc0cSAlex Elder 	const char *mon_addrs;
4868ecb4dc22SAlex Elder 	char *snap_name;
48690ddebc0cSAlex Elder 	size_t mon_addrs_size;
4870859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48714e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4872859c31dfSAlex Elder 	struct ceph_options *copts;
4873dc79b113SAlex Elder 	int ret;
4874e28fff26SAlex Elder 
4875e28fff26SAlex Elder 	/* The first four tokens are required */
4876e28fff26SAlex Elder 
48777ef3214aSAlex Elder 	len = next_token(&buf);
48784fb5d671SAlex Elder 	if (!len) {
48794fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
48804fb5d671SAlex Elder 		return -EINVAL;
48814fb5d671SAlex Elder 	}
48820ddebc0cSAlex Elder 	mon_addrs = buf;
4883f28e565aSAlex Elder 	mon_addrs_size = len + 1;
48847ef3214aSAlex Elder 	buf += len;
4885a725f65eSAlex Elder 
4886dc79b113SAlex Elder 	ret = -EINVAL;
4887f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4888f28e565aSAlex Elder 	if (!options)
4889dc79b113SAlex Elder 		return -ENOMEM;
48904fb5d671SAlex Elder 	if (!*options) {
48914fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
48924fb5d671SAlex Elder 		goto out_err;
48934fb5d671SAlex Elder 	}
4894a725f65eSAlex Elder 
4895859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4896859c31dfSAlex Elder 	if (!spec)
4897f28e565aSAlex Elder 		goto out_mem;
4898859c31dfSAlex Elder 
4899859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4900859c31dfSAlex Elder 	if (!spec->pool_name)
4901859c31dfSAlex Elder 		goto out_mem;
49024fb5d671SAlex Elder 	if (!*spec->pool_name) {
49034fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
49044fb5d671SAlex Elder 		goto out_err;
49054fb5d671SAlex Elder 	}
4906e28fff26SAlex Elder 
490769e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4908859c31dfSAlex Elder 	if (!spec->image_name)
4909f28e565aSAlex Elder 		goto out_mem;
49104fb5d671SAlex Elder 	if (!*spec->image_name) {
49114fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
49124fb5d671SAlex Elder 		goto out_err;
49134fb5d671SAlex Elder 	}
4914e28fff26SAlex Elder 
4915f28e565aSAlex Elder 	/*
4916f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4917f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4918f28e565aSAlex Elder 	 */
49193feeb894SAlex Elder 	len = next_token(&buf);
4920820a5f3eSAlex Elder 	if (!len) {
49213feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
49223feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4923f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4924dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4925f28e565aSAlex Elder 		goto out_err;
4926849b4260SAlex Elder 	}
4927ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4928ecb4dc22SAlex Elder 	if (!snap_name)
4929f28e565aSAlex Elder 		goto out_mem;
4930ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4931ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4932e5c35534SAlex Elder 
49330ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4934e28fff26SAlex Elder 
49354e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
49364e9afebaSAlex Elder 	if (!rbd_opts)
49374e9afebaSAlex Elder 		goto out_mem;
49384e9afebaSAlex Elder 
49394e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4940b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
4941d22f76e7SAlex Elder 
4942859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
49430ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
49444e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4945859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4946859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4947dc79b113SAlex Elder 		goto out_err;
4948dc79b113SAlex Elder 	}
4949859c31dfSAlex Elder 	kfree(options);
4950859c31dfSAlex Elder 
4951859c31dfSAlex Elder 	*ceph_opts = copts;
49524e9afebaSAlex Elder 	*opts = rbd_opts;
4953859c31dfSAlex Elder 	*rbd_spec = spec;
49540ddebc0cSAlex Elder 
4955dc79b113SAlex Elder 	return 0;
4956f28e565aSAlex Elder out_mem:
4957dc79b113SAlex Elder 	ret = -ENOMEM;
4958d22f76e7SAlex Elder out_err:
4959859c31dfSAlex Elder 	kfree(rbd_opts);
4960859c31dfSAlex Elder 	rbd_spec_put(spec);
4961f28e565aSAlex Elder 	kfree(options);
4962d22f76e7SAlex Elder 
4963dc79b113SAlex Elder 	return ret;
4964a725f65eSAlex Elder }
4965a725f65eSAlex Elder 
4966589d30e0SAlex Elder /*
496730ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
496830ba1f02SIlya Dryomov  */
496930ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
497030ba1f02SIlya Dryomov {
4971a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
497230ba1f02SIlya Dryomov 	u64 newest_epoch;
497330ba1f02SIlya Dryomov 	int tries = 0;
497430ba1f02SIlya Dryomov 	int ret;
497530ba1f02SIlya Dryomov 
497630ba1f02SIlya Dryomov again:
497730ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
497830ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
497930ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
498030ba1f02SIlya Dryomov 					       &newest_epoch);
498130ba1f02SIlya Dryomov 		if (ret < 0)
498230ba1f02SIlya Dryomov 			return ret;
498330ba1f02SIlya Dryomov 
498430ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
498530ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
498630ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
4987a319bf56SIlya Dryomov 						     newest_epoch,
4988a319bf56SIlya Dryomov 						     opts->mount_timeout);
498930ba1f02SIlya Dryomov 			goto again;
499030ba1f02SIlya Dryomov 		} else {
499130ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
499230ba1f02SIlya Dryomov 			return -ENOENT;
499330ba1f02SIlya Dryomov 		}
499430ba1f02SIlya Dryomov 	}
499530ba1f02SIlya Dryomov 
499630ba1f02SIlya Dryomov 	return ret;
499730ba1f02SIlya Dryomov }
499830ba1f02SIlya Dryomov 
499930ba1f02SIlya Dryomov /*
5000589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5001589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5002589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5003589d30e0SAlex Elder  *
5004589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5005589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5006589d30e0SAlex Elder  * with the supplied name.
5007589d30e0SAlex Elder  *
5008589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5009589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5010589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5011589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5012589d30e0SAlex Elder  */
5013589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5014589d30e0SAlex Elder {
5015589d30e0SAlex Elder 	int ret;
5016589d30e0SAlex Elder 	size_t size;
5017589d30e0SAlex Elder 	char *object_name;
5018589d30e0SAlex Elder 	void *response;
5019c0fba368SAlex Elder 	char *image_id;
50202f82ee54SAlex Elder 
5021589d30e0SAlex Elder 	/*
50222c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
50232c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5024c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5025c0fba368SAlex Elder 	 * do still need to set the image format though.
50262c0d0a10SAlex Elder 	 */
5027c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5028c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5029c0fba368SAlex Elder 
50302c0d0a10SAlex Elder 		return 0;
5031c0fba368SAlex Elder 	}
50322c0d0a10SAlex Elder 
50332c0d0a10SAlex Elder 	/*
5034589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5035589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5036589d30e0SAlex Elder 	 */
503769e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
5038589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
5039589d30e0SAlex Elder 	if (!object_name)
5040589d30e0SAlex Elder 		return -ENOMEM;
50410d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
5042589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
5043589d30e0SAlex Elder 
5044589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5045589d30e0SAlex Elder 
5046589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5047589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5048589d30e0SAlex Elder 	if (!response) {
5049589d30e0SAlex Elder 		ret = -ENOMEM;
5050589d30e0SAlex Elder 		goto out;
5051589d30e0SAlex Elder 	}
5052589d30e0SAlex Elder 
5053c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5054c0fba368SAlex Elder 
505536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
50564157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
5057e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
505836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5059c0fba368SAlex Elder 	if (ret == -ENOENT) {
5060c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5061c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5062c0fba368SAlex Elder 		if (!ret)
5063c0fba368SAlex Elder 			rbd_dev->image_format = 1;
50647dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5065c0fba368SAlex Elder 		void *p = response;
5066589d30e0SAlex Elder 
5067c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5068979ed480SAlex Elder 						NULL, GFP_NOIO);
5069461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5070c0fba368SAlex Elder 		if (!ret)
5071c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5072c0fba368SAlex Elder 	}
5073c0fba368SAlex Elder 
5074c0fba368SAlex Elder 	if (!ret) {
5075c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5076c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5077589d30e0SAlex Elder 	}
5078589d30e0SAlex Elder out:
5079589d30e0SAlex Elder 	kfree(response);
5080589d30e0SAlex Elder 	kfree(object_name);
5081589d30e0SAlex Elder 
5082589d30e0SAlex Elder 	return ret;
5083589d30e0SAlex Elder }
5084589d30e0SAlex Elder 
50853abef3b3SAlex Elder /*
50863abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
50873abef3b3SAlex Elder  * call.
50883abef3b3SAlex Elder  */
50896fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
50906fd48b3bSAlex Elder {
50916fd48b3bSAlex Elder 	struct rbd_image_header	*header;
50926fd48b3bSAlex Elder 
5093a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
50946fd48b3bSAlex Elder 
50956fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
50966fd48b3bSAlex Elder 
50976fd48b3bSAlex Elder 	header = &rbd_dev->header;
5098812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
50996fd48b3bSAlex Elder 	kfree(header->snap_sizes);
51006fd48b3bSAlex Elder 	kfree(header->snap_names);
51016fd48b3bSAlex Elder 	kfree(header->object_prefix);
51026fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
51036fd48b3bSAlex Elder }
51046fd48b3bSAlex Elder 
51052df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5106a30b71b9SAlex Elder {
5107a30b71b9SAlex Elder 	int ret;
5108a30b71b9SAlex Elder 
51091e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
511057385b51SAlex Elder 	if (ret)
51111e130199SAlex Elder 		goto out_err;
5112b1b5402aSAlex Elder 
51132df3fac7SAlex Elder 	/*
51142df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
51152df3fac7SAlex Elder 	 * features are assumed to never change.
51162df3fac7SAlex Elder 	 */
5117b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
511857385b51SAlex Elder 	if (ret)
5119b1b5402aSAlex Elder 		goto out_err;
512035d489f9SAlex Elder 
5121cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5122cc070d59SAlex Elder 
5123cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5124cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5125cc070d59SAlex Elder 		if (ret < 0)
5126cc070d59SAlex Elder 			goto out_err;
5127cc070d59SAlex Elder 	}
51282df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5129a30b71b9SAlex Elder 
513035152979SAlex Elder 	return 0;
51319d475de5SAlex Elder out_err:
5132642a2537SAlex Elder 	rbd_dev->header.features = 0;
51331e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
51341e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
51359d475de5SAlex Elder 
51369d475de5SAlex Elder 	return ret;
5137a30b71b9SAlex Elder }
5138a30b71b9SAlex Elder 
51396d69bb53SIlya Dryomov /*
51406d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
51416d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
51426d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
51436d69bb53SIlya Dryomov  */
51446d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
514583a06263SAlex Elder {
51462f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5147124afba2SAlex Elder 	int ret;
5148124afba2SAlex Elder 
5149124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5150124afba2SAlex Elder 		return 0;
5151124afba2SAlex Elder 
51526d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
51536d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
51546d69bb53SIlya Dryomov 		ret = -EINVAL;
51556d69bb53SIlya Dryomov 		goto out_err;
51566d69bb53SIlya Dryomov 	}
51576d69bb53SIlya Dryomov 
51581f2c6651SIlya Dryomov 	parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec,
51591f2c6651SIlya Dryomov 				NULL);
51601f2c6651SIlya Dryomov 	if (!parent) {
5161124afba2SAlex Elder 		ret = -ENOMEM;
5162124afba2SAlex Elder 		goto out_err;
51631f2c6651SIlya Dryomov 	}
51641f2c6651SIlya Dryomov 
51651f2c6651SIlya Dryomov 	/*
51661f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
51671f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
51681f2c6651SIlya Dryomov 	 */
51691f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
51701f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5171124afba2SAlex Elder 
51726d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5173124afba2SAlex Elder 	if (ret < 0)
5174124afba2SAlex Elder 		goto out_err;
51751f2c6651SIlya Dryomov 
5176124afba2SAlex Elder 	rbd_dev->parent = parent;
5177a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5178124afba2SAlex Elder 	return 0;
5179124afba2SAlex Elder 
51801f2c6651SIlya Dryomov out_err:
51811f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
51821f2c6651SIlya Dryomov 	if (parent)
51831f2c6651SIlya Dryomov 		rbd_dev_destroy(parent);
5184124afba2SAlex Elder 	return ret;
5185124afba2SAlex Elder }
5186124afba2SAlex Elder 
5187200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5188124afba2SAlex Elder {
518983a06263SAlex Elder 	int ret;
519083a06263SAlex Elder 
5191f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
519283a06263SAlex Elder 
5193f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
5194f8a22fc2SIlya Dryomov 	if (ret)
5195f8a22fc2SIlya Dryomov 		return ret;
5196f8a22fc2SIlya Dryomov 
519783a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
519883a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
519983a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
520083a06263SAlex Elder 
52019b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
520283a06263SAlex Elder 
52039b60e70bSIlya Dryomov 	if (!single_major) {
520483a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
520583a06263SAlex Elder 		if (ret < 0)
520683a06263SAlex Elder 			goto err_out_id;
52079b60e70bSIlya Dryomov 
520883a06263SAlex Elder 		rbd_dev->major = ret;
5209dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
52109b60e70bSIlya Dryomov 	} else {
52119b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
52129b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
52139b60e70bSIlya Dryomov 	}
521483a06263SAlex Elder 
521583a06263SAlex Elder 	/* Set up the blkdev mapping. */
521683a06263SAlex Elder 
521783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
521883a06263SAlex Elder 	if (ret)
521983a06263SAlex Elder 		goto err_out_blkdev;
522083a06263SAlex Elder 
5221f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
522283a06263SAlex Elder 	if (ret)
522383a06263SAlex Elder 		goto err_out_disk;
5224bc1ecc65SIlya Dryomov 
5225f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
522622001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5227f35a4deeSAlex Elder 
5228f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
5229f35a4deeSAlex Elder 	if (ret)
5230f5ee37bdSIlya Dryomov 		goto err_out_mapping;
523183a06263SAlex Elder 
523283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
523383a06263SAlex Elder 
5234129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
523583a06263SAlex Elder 	add_disk(rbd_dev->disk);
523683a06263SAlex Elder 
523783a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
523883a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
523983a06263SAlex Elder 
524083a06263SAlex Elder 	return ret;
52412f82ee54SAlex Elder 
5242f35a4deeSAlex Elder err_out_mapping:
5243f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
524483a06263SAlex Elder err_out_disk:
524583a06263SAlex Elder 	rbd_free_disk(rbd_dev);
524683a06263SAlex Elder err_out_blkdev:
52479b60e70bSIlya Dryomov 	if (!single_major)
524883a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
524983a06263SAlex Elder err_out_id:
525083a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
5251d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
525283a06263SAlex Elder 
525383a06263SAlex Elder 	return ret;
525483a06263SAlex Elder }
525583a06263SAlex Elder 
5256332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5257332bb12dSAlex Elder {
5258332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5259332bb12dSAlex Elder 	size_t size;
5260332bb12dSAlex Elder 
5261332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5262332bb12dSAlex Elder 
5263332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5264332bb12dSAlex Elder 
5265332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5266332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5267332bb12dSAlex Elder 	else
5268332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5269332bb12dSAlex Elder 
5270332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5271332bb12dSAlex Elder 	if (!rbd_dev->header_name)
5272332bb12dSAlex Elder 		return -ENOMEM;
5273332bb12dSAlex Elder 
5274332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5275332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5276332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
5277332bb12dSAlex Elder 	else
5278332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5279332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
5280332bb12dSAlex Elder 	return 0;
5281332bb12dSAlex Elder }
5282332bb12dSAlex Elder 
5283200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5284200a6a8bSAlex Elder {
52856fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5286200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
52876fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
52886fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
52896fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
52906fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
52916fd48b3bSAlex Elder 
5292200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5293200a6a8bSAlex Elder }
5294200a6a8bSAlex Elder 
5295a30b71b9SAlex Elder /*
5296a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
52971f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
52981f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
52991f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5300a30b71b9SAlex Elder  */
53016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5302a30b71b9SAlex Elder {
5303a30b71b9SAlex Elder 	int ret;
5304a30b71b9SAlex Elder 
5305a30b71b9SAlex Elder 	/*
53063abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
53073abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
53083abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
53093abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5310a30b71b9SAlex Elder 	 */
5311a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5312a30b71b9SAlex Elder 	if (ret)
5313c0fba368SAlex Elder 		return ret;
5314c0fba368SAlex Elder 
5315332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5316332bb12dSAlex Elder 	if (ret)
5317332bb12dSAlex Elder 		goto err_out_format;
5318332bb12dSAlex Elder 
53196d69bb53SIlya Dryomov 	if (!depth) {
5320fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
53211fe48023SIlya Dryomov 		if (ret) {
53221fe48023SIlya Dryomov 			if (ret == -ENOENT)
53231fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
53241fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
53251fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5326b644de2bSAlex Elder 			goto out_header_name;
53271f3ef788SAlex Elder 		}
53281fe48023SIlya Dryomov 	}
5329b644de2bSAlex Elder 
5330a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
53315655c4d9SAlex Elder 	if (ret)
5332b644de2bSAlex Elder 		goto err_out_watch;
5333a30b71b9SAlex Elder 
533404077599SIlya Dryomov 	/*
533504077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
533604077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
533704077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
533804077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
533904077599SIlya Dryomov 	 */
53406d69bb53SIlya Dryomov 	if (!depth)
534104077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
534204077599SIlya Dryomov 	else
534304077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
53441fe48023SIlya Dryomov 	if (ret) {
53451fe48023SIlya Dryomov 		if (ret == -ENOENT)
53461fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
53471fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
53481fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
53491fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
535033dca39fSAlex Elder 		goto err_out_probe;
53511fe48023SIlya Dryomov 	}
53529bb81c9bSAlex Elder 
5353e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5354e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5355e8f59b59SIlya Dryomov 		if (ret)
5356e8f59b59SIlya Dryomov 			goto err_out_probe;
5357e8f59b59SIlya Dryomov 
5358e8f59b59SIlya Dryomov 		/*
5359e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
5360e8f59b59SIlya Dryomov 		 * mapped and has a parent.
5361e8f59b59SIlya Dryomov 		 */
53626d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
5363e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
5364e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
5365e8f59b59SIlya Dryomov 	}
5366e8f59b59SIlya Dryomov 
53676d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
536830d60ba2SAlex Elder 	if (ret)
536930d60ba2SAlex Elder 		goto err_out_probe;
537083a06263SAlex Elder 
537130d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
537230d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
537330d60ba2SAlex Elder 	return 0;
5374e8f59b59SIlya Dryomov 
53756fd48b3bSAlex Elder err_out_probe:
53766fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5377b644de2bSAlex Elder err_out_watch:
53786d69bb53SIlya Dryomov 	if (!depth)
5379fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5380332bb12dSAlex Elder out_header_name:
5381332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5382332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5383332bb12dSAlex Elder err_out_format:
5384332bb12dSAlex Elder 	rbd_dev->image_format = 0;
53855655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
53865655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
53875655c4d9SAlex Elder 	return ret;
538883a06263SAlex Elder }
538983a06263SAlex Elder 
53909b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
539159c2be1eSYehuda Sadeh 			  const char *buf,
539259c2be1eSYehuda Sadeh 			  size_t count)
5393602adf40SYehuda Sadeh {
5394cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5395dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
53964e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5397859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
53989d3997fdSAlex Elder 	struct rbd_client *rbdc;
539951344a38SAlex Elder 	bool read_only;
540027cc2594SAlex Elder 	int rc = -ENOMEM;
5401602adf40SYehuda Sadeh 
5402602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5403602adf40SYehuda Sadeh 		return -ENODEV;
5404602adf40SYehuda Sadeh 
5405a725f65eSAlex Elder 	/* parse add command */
5406859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5407dc79b113SAlex Elder 	if (rc < 0)
5408bd4ba655SAlex Elder 		goto err_out_module;
5409a725f65eSAlex Elder 
54109d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
54119d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
54129d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
54130ddebc0cSAlex Elder 		goto err_out_args;
54149d3997fdSAlex Elder 	}
5415602adf40SYehuda Sadeh 
5416602adf40SYehuda Sadeh 	/* pick the pool */
541730ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
54181fe48023SIlya Dryomov 	if (rc < 0) {
54191fe48023SIlya Dryomov 		if (rc == -ENOENT)
54201fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
5421602adf40SYehuda Sadeh 		goto err_out_client;
54221fe48023SIlya Dryomov 	}
5423859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5424859c31dfSAlex Elder 
54250903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
54260903e875SAlex Elder 
5427c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
54289584d508SIlya Dryomov 		rbd_warn(NULL, "pool id too large (%llu > %u)",
5429c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
54300903e875SAlex Elder 		rc = -EIO;
54310903e875SAlex Elder 		goto err_out_client;
54320903e875SAlex Elder 	}
54330903e875SAlex Elder 
5434d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
5435bd4ba655SAlex Elder 	if (!rbd_dev)
5436bd4ba655SAlex Elder 		goto err_out_client;
5437c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5438c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5439d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
5440602adf40SYehuda Sadeh 
54416d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
5442a30b71b9SAlex Elder 	if (rc < 0)
5443c53d5893SAlex Elder 		goto err_out_rbd_dev;
544405fd6f6fSAlex Elder 
54457ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
54467ce4eef7SAlex Elder 
5447d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
54487ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
54497ce4eef7SAlex Elder 		read_only = true;
54507ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
54517ce4eef7SAlex Elder 
5452b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
54533abef3b3SAlex Elder 	if (rc) {
5454e37180c0SIlya Dryomov 		/*
5455e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5456e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5457e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5458e37180c0SIlya Dryomov 		 */
5459e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
54603abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
54613abef3b3SAlex Elder 		goto err_out_module;
54623abef3b3SAlex Elder 	}
54633abef3b3SAlex Elder 
5464602adf40SYehuda Sadeh 	return count;
5465b536f69aSAlex Elder 
5466c53d5893SAlex Elder err_out_rbd_dev:
5467c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5468bd4ba655SAlex Elder err_out_client:
54699d3997fdSAlex Elder 	rbd_put_client(rbdc);
54700ddebc0cSAlex Elder err_out_args:
5471859c31dfSAlex Elder 	rbd_spec_put(spec);
5472d147543dSIlya Dryomov 	kfree(rbd_opts);
5473bd4ba655SAlex Elder err_out_module:
5474bd4ba655SAlex Elder 	module_put(THIS_MODULE);
547527cc2594SAlex Elder 
5476602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
547727cc2594SAlex Elder 
547827cc2594SAlex Elder 	return (ssize_t)rc;
5479602adf40SYehuda Sadeh }
5480602adf40SYehuda Sadeh 
54819b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
54829b60e70bSIlya Dryomov 		       const char *buf,
54839b60e70bSIlya Dryomov 		       size_t count)
54849b60e70bSIlya Dryomov {
54859b60e70bSIlya Dryomov 	if (single_major)
54869b60e70bSIlya Dryomov 		return -EINVAL;
54879b60e70bSIlya Dryomov 
54889b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
54899b60e70bSIlya Dryomov }
54909b60e70bSIlya Dryomov 
54919b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
54929b60e70bSIlya Dryomov 				    const char *buf,
54939b60e70bSIlya Dryomov 				    size_t count)
54949b60e70bSIlya Dryomov {
54959b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
54969b60e70bSIlya Dryomov }
54979b60e70bSIlya Dryomov 
5498200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5499602adf40SYehuda Sadeh {
5500593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5501602adf40SYehuda Sadeh 
5502602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5503200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
55046d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
55059b60e70bSIlya Dryomov 	if (!single_major)
5506602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5507e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5508d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5509602adf40SYehuda Sadeh }
5510602adf40SYehuda Sadeh 
551105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
551205a46afdSAlex Elder {
5513ad945fc1SAlex Elder 	while (rbd_dev->parent) {
551405a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
551505a46afdSAlex Elder 		struct rbd_device *second = first->parent;
551605a46afdSAlex Elder 		struct rbd_device *third;
551705a46afdSAlex Elder 
551805a46afdSAlex Elder 		/*
551905a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
552005a46afdSAlex Elder 		 * remove it.
552105a46afdSAlex Elder 		 */
552205a46afdSAlex Elder 		while (second && (third = second->parent)) {
552305a46afdSAlex Elder 			first = second;
552405a46afdSAlex Elder 			second = third;
552505a46afdSAlex Elder 		}
5526ad945fc1SAlex Elder 		rbd_assert(second);
55278ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5528ad945fc1SAlex Elder 		first->parent = NULL;
5529ad945fc1SAlex Elder 		first->parent_overlap = 0;
5530ad945fc1SAlex Elder 
5531ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
553205a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
553305a46afdSAlex Elder 		first->parent_spec = NULL;
553405a46afdSAlex Elder 	}
553505a46afdSAlex Elder }
553605a46afdSAlex Elder 
55379b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5538602adf40SYehuda Sadeh 			     const char *buf,
5539602adf40SYehuda Sadeh 			     size_t count)
5540602adf40SYehuda Sadeh {
5541602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5542751cc0e3SAlex Elder 	struct list_head *tmp;
5543751cc0e3SAlex Elder 	int dev_id;
5544602adf40SYehuda Sadeh 	unsigned long ul;
554582a442d2SAlex Elder 	bool already = false;
55460d8189e1SAlex Elder 	int ret;
5547602adf40SYehuda Sadeh 
5548bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
55490d8189e1SAlex Elder 	if (ret)
55500d8189e1SAlex Elder 		return ret;
5551602adf40SYehuda Sadeh 
5552602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5553751cc0e3SAlex Elder 	dev_id = (int)ul;
5554751cc0e3SAlex Elder 	if (dev_id != ul)
5555602adf40SYehuda Sadeh 		return -EINVAL;
5556602adf40SYehuda Sadeh 
5557602adf40SYehuda Sadeh 	ret = -ENOENT;
5558751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5559751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5560751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5561751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5562751cc0e3SAlex Elder 			ret = 0;
5563751cc0e3SAlex Elder 			break;
5564602adf40SYehuda Sadeh 		}
5565751cc0e3SAlex Elder 	}
5566751cc0e3SAlex Elder 	if (!ret) {
5567a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5568b82d167bSAlex Elder 		if (rbd_dev->open_count)
556942382b70SAlex Elder 			ret = -EBUSY;
5570b82d167bSAlex Elder 		else
557182a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
557282a442d2SAlex Elder 							&rbd_dev->flags);
5573a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5574751cc0e3SAlex Elder 	}
5575751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
557682a442d2SAlex Elder 	if (ret < 0 || already)
55771ba0f1e7SAlex Elder 		return ret;
5578751cc0e3SAlex Elder 
5579fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
55809abc5990SJosh Durgin 	/*
55819abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
55829abc5990SJosh Durgin 	 * before the osd_client is shutdown
55839abc5990SJosh Durgin 	 */
55849abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
55859abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5586fca27065SIlya Dryomov 
55879875201eSJosh Durgin 	/*
55889875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
55899875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
55909875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
55919875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
55929875201eSJosh Durgin 	 */
55939875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
55948ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
559579ab7558SAlex Elder 	module_put(THIS_MODULE);
5596aafb230eSAlex Elder 
55971ba0f1e7SAlex Elder 	return count;
5598602adf40SYehuda Sadeh }
5599602adf40SYehuda Sadeh 
56009b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
56019b60e70bSIlya Dryomov 			  const char *buf,
56029b60e70bSIlya Dryomov 			  size_t count)
56039b60e70bSIlya Dryomov {
56049b60e70bSIlya Dryomov 	if (single_major)
56059b60e70bSIlya Dryomov 		return -EINVAL;
56069b60e70bSIlya Dryomov 
56079b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56089b60e70bSIlya Dryomov }
56099b60e70bSIlya Dryomov 
56109b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
56119b60e70bSIlya Dryomov 				       const char *buf,
56129b60e70bSIlya Dryomov 				       size_t count)
56139b60e70bSIlya Dryomov {
56149b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
56159b60e70bSIlya Dryomov }
56169b60e70bSIlya Dryomov 
5617602adf40SYehuda Sadeh /*
5618602adf40SYehuda Sadeh  * create control files in sysfs
5619dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5620602adf40SYehuda Sadeh  */
5621602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5622602adf40SYehuda Sadeh {
5623dfc5606dSYehuda Sadeh 	int ret;
5624602adf40SYehuda Sadeh 
5625fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5626dfc5606dSYehuda Sadeh 	if (ret < 0)
5627dfc5606dSYehuda Sadeh 		return ret;
5628602adf40SYehuda Sadeh 
5629fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5630fed4c143SAlex Elder 	if (ret < 0)
5631fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5632602adf40SYehuda Sadeh 
5633602adf40SYehuda Sadeh 	return ret;
5634602adf40SYehuda Sadeh }
5635602adf40SYehuda Sadeh 
5636602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5637602adf40SYehuda Sadeh {
5638dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5639fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5640602adf40SYehuda Sadeh }
5641602adf40SYehuda Sadeh 
56421c2a9dfeSAlex Elder static int rbd_slab_init(void)
56431c2a9dfeSAlex Elder {
56441c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
56451c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
56461c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
56471c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
56481c2a9dfeSAlex Elder 					0, NULL);
5649868311b1SAlex Elder 	if (!rbd_img_request_cache)
5650868311b1SAlex Elder 		return -ENOMEM;
5651868311b1SAlex Elder 
5652868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5653868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5654868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5655868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5656868311b1SAlex Elder 					0, NULL);
565778c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
565878c2a44aSAlex Elder 		goto out_err;
565978c2a44aSAlex Elder 
566078c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
566178c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
56622d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
566378c2a44aSAlex Elder 	if (rbd_segment_name_cache)
56641c2a9dfeSAlex Elder 		return 0;
566578c2a44aSAlex Elder out_err:
566678c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
566778c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
566878c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
566978c2a44aSAlex Elder 	}
56701c2a9dfeSAlex Elder 
5671868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5672868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5673868311b1SAlex Elder 
56741c2a9dfeSAlex Elder 	return -ENOMEM;
56751c2a9dfeSAlex Elder }
56761c2a9dfeSAlex Elder 
56771c2a9dfeSAlex Elder static void rbd_slab_exit(void)
56781c2a9dfeSAlex Elder {
567978c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
568078c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
568178c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
568278c2a44aSAlex Elder 
5683868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5684868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5685868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5686868311b1SAlex Elder 
56871c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
56881c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
56891c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
56901c2a9dfeSAlex Elder }
56911c2a9dfeSAlex Elder 
5692cc344fa1SAlex Elder static int __init rbd_init(void)
5693602adf40SYehuda Sadeh {
5694602adf40SYehuda Sadeh 	int rc;
5695602adf40SYehuda Sadeh 
56961e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
56971e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
56981e32d34cSAlex Elder 		return -EINVAL;
56991e32d34cSAlex Elder 	}
5700e1b4d96dSIlya Dryomov 
57011c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5702602adf40SYehuda Sadeh 	if (rc)
5703602adf40SYehuda Sadeh 		return rc;
5704e1b4d96dSIlya Dryomov 
5705f5ee37bdSIlya Dryomov 	/*
5706f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
5707f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
5708f5ee37bdSIlya Dryomov 	 */
5709f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
5710f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
5711f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
5712f5ee37bdSIlya Dryomov 		goto err_out_slab;
5713f5ee37bdSIlya Dryomov 	}
5714f5ee37bdSIlya Dryomov 
57159b60e70bSIlya Dryomov 	if (single_major) {
57169b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
57179b60e70bSIlya Dryomov 		if (rbd_major < 0) {
57189b60e70bSIlya Dryomov 			rc = rbd_major;
5719f5ee37bdSIlya Dryomov 			goto err_out_wq;
57209b60e70bSIlya Dryomov 		}
57219b60e70bSIlya Dryomov 	}
57229b60e70bSIlya Dryomov 
57231c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
57241c2a9dfeSAlex Elder 	if (rc)
57259b60e70bSIlya Dryomov 		goto err_out_blkdev;
57261c2a9dfeSAlex Elder 
57279b60e70bSIlya Dryomov 	if (single_major)
57289b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
57299b60e70bSIlya Dryomov 	else
5730e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
57319b60e70bSIlya Dryomov 
5732e1b4d96dSIlya Dryomov 	return 0;
5733e1b4d96dSIlya Dryomov 
57349b60e70bSIlya Dryomov err_out_blkdev:
57359b60e70bSIlya Dryomov 	if (single_major)
57369b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5737f5ee37bdSIlya Dryomov err_out_wq:
5738f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
5739e1b4d96dSIlya Dryomov err_out_slab:
5740e1b4d96dSIlya Dryomov 	rbd_slab_exit();
57411c2a9dfeSAlex Elder 	return rc;
5742602adf40SYehuda Sadeh }
5743602adf40SYehuda Sadeh 
/*
 * Module exit point: release what rbd_init() set up.
 *
 * The device id IDA is destroyed first, then the sysfs interface is
 * cleaned up, the block major is unregistered (only when single_major
 * is set, mirroring rbd_init()), and finally the workqueue and the
 * slab caches are destroyed.
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}
5753602adf40SYehuda Sadeh 
/* Register rbd_init() and rbd_exit() as the module's init and exit hooks. */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module authorship. */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

/* Module description and license. */
MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
5765